diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake index 6056b53bc2218fb24d2e97b281b9a0d68bc9a306..8e762be646acb814272f522cc229df307d88679e 100644 --- a/cmake/cblas.cmake +++ b/cmake/cblas.cmake @@ -69,15 +69,21 @@ if(NOT DEFINED CBLAS_PROVIDER) PATHS ${OPENBLAS_LIB_SEARCH_PATHS}) if(OPENBLAS_LAPACKE_INC_DIR AND OPENBLAS_INC_DIR AND OPENBLAS_LIB) - set(CBLAS_PROVIDER OPENBLAS) - set(CBLAS_INC_DIR ${OPENBLAS_INC_DIR} ${OPENBLAS_LAPACKE_INC_DIR}) - set(CBLAS_LIBRARIES ${OPENBLAS_LIB}) - - add_definitions(-DPADDLE_USE_OPENBLAS) - add_definitions(-DLAPACK_FOUND) - - message(STATUS "Found OpenBLAS (include: ${OPENBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") - message(STATUS "Found lapack in OpenBLAS (include: ${OPENBLAS_LAPACKE_INC_DIR})") + file(READ "${OPENBLAS_INC_DIR}/openblas_config.h" config_file) + string(REGEX MATCH "OpenBLAS ([0-9]+\.[0-9]+\.[0-9]+)" tmp ${config_file}) + string(REGEX MATCH "([0-9]+\.[0-9]+\.[0-9]+)" ver ${tmp}) + + if (${ver} VERSION_EQUAL "0.3.7") + set(CBLAS_PROVIDER OPENBLAS) + set(CBLAS_INC_DIR ${OPENBLAS_INC_DIR} ${OPENBLAS_LAPACKE_INC_DIR}) + set(CBLAS_LIBRARIES ${OPENBLAS_LIB}) + + add_definitions(-DPADDLE_USE_OPENBLAS) + add_definitions(-DLAPACK_FOUND) + + message(STATUS "Found OpenBLAS (include: ${OPENBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") + message(STATUS "Found lapack in OpenBLAS (include: ${OPENBLAS_LAPACKE_INC_DIR})") + endif() endif() endif() diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index ce5603b24b687daacea784c96fc00b828e513c97..d0d3901641c9348a4e86822da9ce6e7f84de206f 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -20,7 +20,7 @@ SET(MKLDNN_SOURCE_DIR ${THIRD_PARTY_PATH}/mkldnn/src/extern_mkldnn) SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn) SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." 
FORCE) SET(MKLDNN_REPOSITORY ${GIT_URL}/oneapi-src/oneDNN.git) -SET(MKLDNN_TAG f3999b71d8e4415c1985a0dfb812a3ed77ee21fa) +SET(MKLDNN_TAG 748528a2d3204b5f401c14a9aacdec16accd5ead) # Introduce variables: diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index a03ff7d22dcad211af354fd5d51e0d3a44389885..5d1f1776f885cd2bdcd051028e0f96fc1ed1276d 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -7,52 +7,70 @@ SET(XPU_PROJECT "extern_xpu") SET(XPU_API_LIB_NAME "libxpuapi.so") SET(XPU_RT_LIB_NAME "libxpurt.so") -if(NOT XPU_SDK_ROOT) - if (WITH_AARCH64) - SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/aarch64/xpu_2021_01_13.tar.gz" CACHE STRING "" FORCE) - elseif(WITH_SUNWAY) - SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/sunway/xpu_2021_01_13.tar.gz" CACHE STRING "" FORCE) - else() - SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu_2021_05_19.tar.gz" CACHE STRING "" FORCE) - endif() - - SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu") - SET(XPU_DOWNLOAD_DIR "${XPU_SOURCE_DIR}/src/${XPU_PROJECT}") - SET(XPU_INSTALL_DIR "${THIRD_PARTY_PATH}/install/xpu") - SET(XPU_API_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/include") - SET(XPU_LIB_DIR "${THIRD_PARTY_PATH}/install/xpu/lib") - - SET(XPU_API_LIB "${XPU_LIB_DIR}/${XPU_API_LIB_NAME}") - SET(XPU_RT_LIB "${XPU_LIB_DIR}/${XPU_RT_LIB_NAME}") - - SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${XPU_INSTALL_DIR}/lib") - - FILE(WRITE ${XPU_DOWNLOAD_DIR}/CMakeLists.txt - "PROJECT(XPU)\n" - "cmake_minimum_required(VERSION 3.0)\n" - "install(DIRECTORY xpu/include xpu/lib \n" - " DESTINATION ${XPU_INSTALL_DIR})\n") - - ExternalProject_Add( - ${XPU_PROJECT} - ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${XPU_SOURCE_DIR} - DOWNLOAD_DIR ${XPU_DOWNLOAD_DIR} - DOWNLOAD_COMMAND wget --no-check-certificate ${XPU_URL} -c -q -O xpu.tar.gz - && tar xvf xpu.tar.gz - DOWNLOAD_NO_PROGRESS 1 - UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${XPU_INSTALL_ROOT} - CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${XPU_INSTALL_ROOT} - ) -else() - SET(XPU_API_INC_DIR "${XPU_SDK_ROOT}/XTDK/include/") - SET(XPU_API_LIB "${XPU_SDK_ROOT}/XTDK/shlib/libxpuapi.so") - SET(XPU_RT_LIB "${XPU_SDK_ROOT}/XTDK/runtime/shlib/libxpurt.so") - SET(XPU_LIB_DIR "${XPU_SDK_ROOT}/XTDK/shlib/") -endif() +IF(WITH_AARCH64) + SET(XPU_XRE_DIR_NAME "xre-kylin_aarch64") + SET(XPU_XDNN_DIR_NAME "xdnn-kylin_aarch64") + SET(XPU_XCCL_DIR_NAME "xccl-kylin_aarch64") +ELSEIF(WITH_SUNWAY) + SET(XPU_XRE_DIR_NAME "xre-deepin_sw6_64") + SET(XPU_XDNN_DIR_NAME "xdnn-deepin_sw6_64") + SET(XPU_XCCL_DIR_NAME "xccl-deepin_sw6_64") +ELSEIF(WITH_BDCENTOS) + SET(XPU_XRE_DIR_NAME "xre-bdcentos_x86_64") + SET(XPU_XDNN_DIR_NAME "xdnn-bdcentos_x86_64") + SET(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64") +ELSEIF(WITH_UBUNTU) + SET(XPU_XRE_DIR_NAME "xre-ubuntu_x86_64") + SET(XPU_XDNN_DIR_NAME "xdnn-ubuntu_x86_64") + SET(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64") +ELSEIF(WITH_CENTOS) + SET(XPU_XRE_DIR_NAME "xre-centos7_x86_64") + SET(XPU_XDNN_DIR_NAME "xdnn-centos7_x86_64") + SET(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64") +ELSE () + SET(XPU_XRE_DIR_NAME "xre-ubuntu_x86_64") + SET(XPU_XDNN_DIR_NAME "xdnn-ubuntu_x86_64") + SET(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64") +ENDIF() + +SET(XPU_BASE_URL "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev/20210527") +SET(XPU_XRE_URL "${XPU_BASE_URL}/${XPU_XRE_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) +SET(XPU_XDNN_URL "${XPU_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) 
+SET(XPU_XCCL_URL "${XPU_BASE_URL}/${XPU_XCCL_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) +SET(XPU_PACK_DEPENCE_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/pack_paddle_depence.sh" CACHE STRING "" FORCE) + +SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu") +SET(XPU_DOWNLOAD_DIR "${XPU_SOURCE_DIR}/src/${XPU_PROJECT}") +SET(XPU_INSTALL_DIR "${THIRD_PARTY_PATH}/install/xpu") +SET(XPU_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/include") +SET(XPU_LIB_DIR "${THIRD_PARTY_PATH}/install/xpu/lib") + +SET(XPU_API_LIB "${XPU_LIB_DIR}/${XPU_API_LIB_NAME}") +SET(XPU_RT_LIB "${XPU_LIB_DIR}/${XPU_RT_LIB_NAME}") + +SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${XPU_INSTALL_DIR}/lib") + +FILE(WRITE ${XPU_DOWNLOAD_DIR}/CMakeLists.txt + "PROJECT(XPU)\n" + "cmake_minimum_required(VERSION 3.0)\n" + "install(DIRECTORY xpu/include xpu/lib \n" + " DESTINATION ${XPU_INSTALL_DIR})\n") + +ExternalProject_Add( + ${XPU_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${XPU_SOURCE_DIR} + DOWNLOAD_DIR ${XPU_DOWNLOAD_DIR} + DOWNLOAD_COMMAND wget ${XPU_PACK_DEPENCE_URL} + && bash pack_paddle_depence.sh ${XPU_XRE_URL} ${XPU_XRE_DIR_NAME} ${XPU_XDNN_URL} ${XPU_XDNN_DIR_NAME} ${XPU_XCCL_URL} ${XPU_XCCL_DIR_NAME} + + DOWNLOAD_NO_PROGRESS 1 + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${XPU_INSTALL_ROOT} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${XPU_INSTALL_ROOT} +) -INCLUDE_DIRECTORIES(${XPU_API_INC_DIR}) +INCLUDE_DIRECTORIES(${XPU_INC_DIR}) ADD_LIBRARY(shared_xpuapi SHARED IMPORTED GLOBAL) set_property(TARGET shared_xpuapi PROPERTY IMPORTED_LOCATION "${XPU_API_LIB}") @@ -62,7 +80,7 @@ generate_dummy_static_lib(LIB_NAME "xpulib" GENERATOR "xpu.cmake") TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB}) -if (WITH_XPU_BKCL) +IF(WITH_XPU_BKCL) MESSAGE(STATUS "Compile with XPU BKCL!") ADD_DEFINITIONS(-DPADDLE_WITH_XPU_BKCL) @@ -71,9 +89,9 @@ if (WITH_XPU_BKCL) SET(XPU_BKCL_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/include") INCLUDE_DIRECTORIES(${XPU_BKCL_INC_DIR}) TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB} ${XPU_BKCL_LIB}) -else(WITH_XPU_BKCL) - TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB} ) -endif(WITH_XPU_BKCL) +ELSE(WITH_XPU_BKCL) + TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB}) +ENDIF(WITH_XPU_BKCL) if(NOT XPU_SDK_ROOT) ADD_DEPENDENCIES(xpulib ${XPU_PROJECT}) diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index d33edef38ca7b36ce0b0474407ae7363884bfdaa..e3a78d3cf3bfe0c12d42ab7039fd3377fda6cdef 100644 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -215,6 +215,8 @@ list(APPEND third_party_deps extern_eigen3 extern_gflags extern_glog extern_boos list(APPEND third_party_deps extern_zlib extern_dlpack extern_warpctc extern_threadpool) include(cblas) # find first, then download, build, install openblas + +message(STATUS "CBLAS_PROVIDER: ${CBLAS_PROVIDER}") if(${CBLAS_PROVIDER} STREQUAL MKLML) list(APPEND third_party_deps extern_mklml) elseif(${CBLAS_PROVIDER} STREQUAL EXTERN_OPENBLAS) diff --git a/paddle/fluid/framework/details/nan_inf_utils.h b/paddle/fluid/framework/details/nan_inf_utils.h index 4d7d9afe7019290e44bb6d20ce42784b8631cadd..cf64ccd60f45a40b6c9ca83dcdd473686d03904f 100644 --- a/paddle/fluid/framework/details/nan_inf_utils.h +++ b/paddle/fluid/framework/details/nan_inf_utils.h @@ -19,6 +19,7 @@ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/imperative/type_defs.h" #include "paddle/fluid/platform/place.h" namespace paddle { @@ -30,9 +31,28 @@ void 
CheckVarHasNanOrInf(const std::string& op_type, const std::string& var_name, const platform::Place& place); +void CheckVarHasNanOrInf(const std::string& op_type, + const std::string& var_name, + const framework::Variable* var, + const platform::Place& place); + void CheckOpHasNanOrInf(const framework::OperatorBase& op, const framework::Scope& scope, const platform::Place& place); + +template +void CheckOpHasNanOrInfInDygraph(const std::string& op_type, + const imperative::NameVarMap& op_outs, + platform::Place place) { + for (const auto& pair : op_outs) { + for (const auto& ivar : pair.second) { + auto* var = ivar->MutableVar(); + if (var == nullptr) continue; + CheckVarHasNanOrInf(op_type, ivar->Name(), var, place); + } + } +} + } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc index f9aa14bf7e8d7e9a632cafefa8b88f0ae35c5a6c..30231a1799fd3714646a81bba2afb5de03045850 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc @@ -297,13 +297,12 @@ void tensor_check(const std::string& op_type, } void CheckVarHasNanOrInf(const std::string& op_type, - const framework::Scope& scope, const std::string& var_name, + const framework::Variable* var, const platform::Place& place) { - auto* var = scope.FindVar(var_name); PADDLE_ENFORCE_NOT_NULL( - var, platform::errors::NotFound("In op=%s, can't find var:%s", op_type, - var_name)); + var, platform::errors::NotFound("Cannot find var: `%s` in op `%s`.", + var_name, op_type)); const Tensor* tensor{nullptr}; if (var->IsType()) { @@ -393,6 +392,14 @@ void CheckVarHasNanOrInf(const std::string& op_type, tensor_check(op_type, var_name, *tensor, place); } +void CheckVarHasNanOrInf(const std::string& op_type, + const framework::Scope& scope, + const std::string& var_name, + const platform::Place& place) { + auto* var = scope.FindVar(var_name); + CheckVarHasNanOrInf(op_type, var_name, var, place); +} + bool IsSkipOp(const framework::OperatorBase& op) { if (op_type_nan_inf_white_list().count(op.Type()) != 0) return true; diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index 181e3b68853801460c87162badb553c90ab7ccb5..be05941efb5b4b60f77ab13909bfd27d877615a1 100644 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -176,6 +176,7 @@ message DistributedStrategy { optional bool find_unused_parameters = 28 [ default = false ]; optional bool tensor_parallel = 29 [ default = false ]; optional bool without_graph_optimization = 30 [ default = false ]; + optional int32 fuse_grad_size_in_num = 31 [ default = 1 ]; optional RecomputeConfig recompute_configs = 101; optional AMPConfig amp_configs = 102; diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc index 656d453d4030439f0229492a7c2ab2ee46481950..0bb2782b3737ee3130e2d7bee68fd932c3b87932 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc @@ -14,7 +14,6 @@ #include "paddle/fluid/framework/ir/fc_fuse_pass.h" #include -#include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" @@ -38,6 +37,7 @@ FCFusePass::FCFusePass() { .IsNumGE(1) .End() .AddAttr("y_num_col_dims") + .IsNumEQ(1) .End(); 
AddOpCompat(OpCompat("elementwise_add")) @@ -51,6 +51,7 @@ FCFusePass::FCFusePass() { .IsTensor() .End() .AddAttr("axis") + .IsNumGE(1) .End(); AddOpCompat(OpCompat("relu")) diff --git a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc index cf35c1ac772da079159cb4ced2edc234d7325b1e..5046911036818c902844a35220101836b6404478 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc @@ -58,12 +58,12 @@ TEST(FCFusePass, basic) { auto* weights_0 = layers.data("weights_0", {}, true); auto* mul_out_0 = layers.mul(relu_out_0, weights_0); auto* bias_1 = layers.data("bias_1", {}, true); - auto* add_out_0 = layers.elementwise_add(mul_out_0, bias_1); + auto* add_out_0 = layers.elementwise_add(mul_out_0, bias_1, nullptr, 1); auto* relu_out_1 = layers.relu(add_out_0); auto* weights_1 = layers.data("weights_1", {}, true); auto* mul_out_1 = layers.mul(relu_out_1, weights_1); auto* bias_2 = layers.data("bias_2", {}, true); - auto* add_out_1 = layers.elementwise_add(mul_out_1, bias_2); + auto* add_out_1 = layers.elementwise_add(mul_out_1, bias_2, nullptr, 1); VLOG(4) << add_out_1; std::unique_ptr graph(new ir::Graph(layers.main_program())); diff --git a/paddle/fluid/framework/ir/op_compat_sensible_pass.cc b/paddle/fluid/framework/ir/op_compat_sensible_pass.cc index 3d8e655c5b2730fd36651c67d2f7c37b7dd5ecd9..e422a9bae31181d064bd36359ff1ebe38da2cac6 100644 --- a/paddle/fluid/framework/ir/op_compat_sensible_pass.cc +++ b/paddle/fluid/framework/ir/op_compat_sensible_pass.cc @@ -250,6 +250,32 @@ OpCompat& OpCompatSensiblePass::AddOpCompat(OpCompat&& op_compat) { return *(op_compat_judgers_[name]); } +//! Tell the Op compability of a subgraph. +bool OpCompatSensiblePass::IsCompat( + const GraphPatternDetector::subgraph_t& subgraph, Graph*) const { + PADDLE_ENFORCE_EQ(op_compat_judgers_.empty(), false, + platform::errors::InvalidArgument( + "At least one OpCompat instance should be added")); + // Check the all the ops in the subgraph are contained in the + // op_compat. + for (auto& node_pair : subgraph) { + if (!node_pair.second->IsOp()) continue; + auto op_type = node_pair.second->Op()->Type(); + if (!op_compat_judgers_.count(op_type)) { + if (HasOpDef(op_type)) { + LOG(WARNING) << op_type << "compat not registered!"; + return false; + } + continue; + } + auto& judger = *op_compat_judgers_.at(op_type); + if (!judger.Judge(*(node_pair.second->Op()))) { + return false; + } + } + return true; +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/op_compat_sensible_pass.h b/paddle/fluid/framework/ir/op_compat_sensible_pass.h index 3aa985c6d46fa262bd4050f63e668c68e55237ac..7346ca3756f361f00fb67090d4127995fbe89b30 100644 --- a/paddle/fluid/framework/ir/op_compat_sensible_pass.h +++ b/paddle/fluid/framework/ir/op_compat_sensible_pass.h @@ -195,26 +195,7 @@ class OpCompatSensiblePass : public Pass { //! Tell the Op compability of a subgraph. bool IsCompat(const GraphPatternDetector::subgraph_t& subgraph, - Graph* g) const { - CHECK(!op_compat_judgers_.empty()) - << "At least one OpCompat instance should be added in the " - "OpCompatSensiblePass."; - // Check the all the ops in the subgraph are contained in the - // op_compat. 
- for (auto& node_pair : subgraph) { - if (!node_pair.second->IsOp()) continue; - auto op_type = node_pair.second->Op()->Type(); - if (!op_compat_judgers_.count(op_type)) { - LOG(WARNING) << op_type << "compat not registered!"; - return false; - } - auto& judger = *op_compat_judgers_.at(op_type); - if (!judger.Judge(*(node_pair.second->Op()))) { - return false; - } - } - return true; - } + Graph* g) const; //! Tell the op compatibility of a single Op. bool IsCompat(const OpDesc& op_desc) const { diff --git a/paddle/fluid/framework/ir/op_compat_sensible_pass_tester.cc b/paddle/fluid/framework/ir/op_compat_sensible_pass_tester.cc index 87e28ae3a3aadda63ef67c82596d20cfb0c644f4..9074a9876f9f7d200d4c464fdab57b641c1d3b5a 100644 --- a/paddle/fluid/framework/ir/op_compat_sensible_pass_tester.cc +++ b/paddle/fluid/framework/ir/op_compat_sensible_pass_tester.cc @@ -151,6 +151,10 @@ class OpCompatSensiblePassTest : public OpCompatSensiblePass { public: OpCompatSensiblePassTest(); bool TestIsCompat(const OpDesc& op_desc) { return IsCompat(op_desc); } + bool TestIsCompat(const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + return IsCompat(subgraph, g); + } }; OpCompatSensiblePassTest::OpCompatSensiblePassTest() { @@ -192,6 +196,23 @@ TEST(OpCompatSensiblePass, IsCompat) { EXPECT_TRUE(test.TestIsCompat(fc_op)); } +TEST(OpCompatSensiblePass, IsCompatFail) { + OpCompatSensiblePassTest test; + GraphPatternDetector::subgraph_t subgraph; + PDPattern pattern; + PDNode* pd_node = pattern.NewNode(); + ProgramDesc prog; + Graph g(prog); + OpDesc fc_op; + fc_op.SetType("op1"); + subgraph[pd_node] = g.CreateOpNode(&fc_op); + EXPECT_TRUE(test.TestIsCompat(subgraph, &g)); + + fc_op.SetType("mul"); + subgraph[pd_node] = g.CreateOpNode(&fc_op); + EXPECT_FALSE(test.TestIsCompat(subgraph, &g)); +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/pass_tester_helper.h b/paddle/fluid/framework/ir/pass_tester_helper.h index 6b187e538d1c082dec47144ed144a746794767b9..850d3dca6d0e10dd2f93a2149bef268042de339b 100644 --- a/paddle/fluid/framework/ir/pass_tester_helper.h +++ b/paddle/fluid/framework/ir/pass_tester_helper.h @@ -194,14 +194,18 @@ struct Layers { } VarDesc* mul(VarDesc* x, VarDesc* y, VarDesc* out = nullptr, - int x_num_col_dims = 1) { + int x_num_col_dims = 1, int y_num_col_dims = 1) { AttributeMap attrs; - attrs["x_num_col_dims"] = 1; + attrs["x_num_col_dims"] = x_num_col_dims; + attrs["y_num_col_dims"] = y_num_col_dims; return binary_op("mul", x, y, out, &attrs); } - VarDesc* elementwise_add(VarDesc* x, VarDesc* y, VarDesc* out = nullptr) { - return binary_op("elementwise_add", x, y, out); + VarDesc* elementwise_add(VarDesc* x, VarDesc* y, VarDesc* out = nullptr, + int axis = -1) { + AttributeMap attrs; + attrs["axis"] = axis; + return binary_op("elementwise_add", x, y, out, &attrs); } VarDesc* elementwise_mul(VarDesc* x, VarDesc* y, VarDesc* out = nullptr, diff --git a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc index bf59c140005167e3be342b4039d2b13e5bddf1c6..4c87b63625c1f69c09588c5bb8483ab03616f153 100644 --- a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc @@ -66,9 +66,13 @@ static bool IsFCWithPaddingWeights(Node* n) { } static bool IsParamOfFC(Node* n, const std::string& param_name) { - if (IsInputOfFC(n) && n->inputs.empty() && - (n->Name() == n->outputs[0]->Op()->Input(param_name)[0])) { - return 
true; + if (IsInputOfFC(n) && n->inputs.empty()) { + for (auto* out : n->outputs) { + if (out->Op()->Type() == "fc" && + n->Name() == out->Op()->Input(param_name)[0]) { + return true; + } + } } return false; } diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc index 0a6b5e44452fe191fce5fea058194a92e3a406de..69a2a6eefaf8ca51d62842e62a6a731c6cbd3231 100644 --- a/paddle/fluid/framework/lod_tensor.cc +++ b/paddle/fluid/framework/lod_tensor.cc @@ -276,7 +276,7 @@ void SerializeToStream(std::ostream &os, const LoDTensor &tensor) { SerializeToStream(os, tensor, *dev_ctx); } -void DeserializeFromStream(std::ifstream &os, LoDTensor *tensor) { +void DeserializeFromStream(std::istream &os, LoDTensor *tensor) { platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); const platform::DeviceContext *dev_ctx; dev_ctx = pool.Get(platform::CPUPlace()); diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h index 6b357aba1c5f9a4c0db53b20a9d47e64b71d0a11..7dee0f44e384d4eda9ccb9507f62527a7795b221 100644 --- a/paddle/fluid/framework/lod_tensor.h +++ b/paddle/fluid/framework/lod_tensor.h @@ -257,7 +257,7 @@ LoD ConvertToOffsetBasedLoD(const LoD& length_lod); void SerializeToStream(std::ostream& os, const LoDTensor& tensor); -void DeserializeFromStream(std::ifstream& os, LoDTensor* tensor); +void DeserializeFromStream(std::istream& os, LoDTensor* tensor); } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/op_def_api.cc b/paddle/fluid/framework/op_def_api.cc index 5e758fe4105097e0c6f3032d1d4e150b661ff5f5..b950f000bb8e50973d6d6ecbc32c416958b92ed3 100644 --- a/paddle/fluid/framework/op_def_api.cc +++ b/paddle/fluid/framework/op_def_api.cc @@ -68,5 +68,9 @@ const proto::OpDef& GetOpDef(const std::string& op_name) { } return ops_definition.at(op_name); } + +bool HasOpDef(const std::string& op_name) { + return op_def_map.find(op_name) != op_def_map.end(); +} } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/op_def_api.h b/paddle/fluid/framework/op_def_api.h index 4ec2089f9b1f88de18305cb5a6615f96f2718d39..1ef2254d0da361915f29b713e2d9a53d5c35cb8a 100644 --- a/paddle/fluid/framework/op_def_api.h +++ b/paddle/fluid/framework/op_def_api.h @@ -19,5 +19,7 @@ namespace paddle { namespace framework { const proto::OpDef& GetOpDef(const std::string& op_name); + +bool HasOpDef(const std::string& op_name); } } diff --git a/paddle/fluid/framework/selected_rows.cc b/paddle/fluid/framework/selected_rows.cc index 7e48d0dc5f96203c4bc89f954b82dfa582eddbc9..c67653953f8a76f8b848bc13efda6fcb23f965da 100644 --- a/paddle/fluid/framework/selected_rows.cc +++ b/paddle/fluid/framework/selected_rows.cc @@ -121,7 +121,7 @@ void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows) { SerializeToStream(os, selected_rows, *dev_ctx); } -void DeserializeFromStream(std::ifstream& os, SelectedRows* selected_rows) { +void DeserializeFromStream(std::istream& os, SelectedRows* selected_rows) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); const platform::DeviceContext* dev_ctx; dev_ctx = pool.Get(platform::CPUPlace()); diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h index e53e3d973c524657a7b579d96d0f51a39ba40f12..3e4beb9498cf777f91899cd09c8dbb27835a20c2 100644 --- a/paddle/fluid/framework/selected_rows.h +++ b/paddle/fluid/framework/selected_rows.h @@ -175,7 +175,7 @@ void DeserializeFromStream(std::istream& is, 
SelectedRows* selected_rows, void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows); -void DeserializeFromStream(std::ifstream& os, SelectedRows* selected_rows); +void DeserializeFromStream(std::istream& os, SelectedRows* selected_rows); } // namespace framework } // namespace paddle diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index 740045ce2f78becf5692d6b3e7dfba1444592788..46f3c7b0c2ce8ce82d99e36a50d0601f8f931fe8 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -1,6 +1,6 @@ cc_library(imperative_flag SRCS flags.cc DEPS gflags) -cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform) +cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform nan_inf_utils) cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_flag variable_helper op_registry) add_subdirectory(jit) cc_library(amp SRCS amp_auto_cast.cc DEPS layer ) diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 2a3b6424d4a14e1cd6345cf24594582bd19f51d4..4a42751b1c4d5b71b9fed066f53f04b06015fea9 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -15,8 +15,11 @@ #include "paddle/fluid/imperative/prepared_operator.h" #include "paddle/fluid/framework/data_type_transform.h" +#include "paddle/fluid/framework/details/nan_inf_utils.h" #include "paddle/fluid/imperative/infer_shape_context.h" +DECLARE_bool(check_nan_inf); + namespace paddle { namespace imperative { @@ -175,6 +178,11 @@ static void PreparedOpRunImpl( func(DygraphExecutionContext(op, scope, *dev_ctx, ctx, ins, outs, attrs)); + if (FLAGS_check_nan_inf) { + framework::details::CheckOpHasNanOrInfInDygraph( + op.Type(), outs, dev_ctx->GetPlace()); + } + /** * [ Why need handle complex gradient to real gradient? 
] * diff --git a/paddle/fluid/imperative/py_layer_fwd.h b/paddle/fluid/imperative/py_layer_fwd.h index de5f9d75e9173a7d39c113b881e078dc43c83f39..1baf73ab3b95da869922e5d4745c91356025799e 100644 --- a/paddle/fluid/imperative/py_layer_fwd.h +++ b/paddle/fluid/imperative/py_layer_fwd.h @@ -17,6 +17,7 @@ #include #include #include "paddle/fluid/imperative/layer.h" +#include "paddle/fluid/imperative/prepared_operator.h" #include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/framework/op_registry.h" @@ -32,7 +33,17 @@ bool RequiredGrad(const NameVarBaseMap& ins, const NameVarBaseMap& outs) { for (const auto& name_pair : ins) { for (const auto& var_base : name_pair.second) { if (!var_base->OverridedStopGradient()) { - PassStopGradient(outs, var_base->OverridedStopGradient()); + for (const auto& pair : outs) { + for (const auto& var : pair.second) { + if (var) { + var->SetOverridedStopGradient(false); + SetForwardDataTypeOfGradVar(var); + VLOG(3) << "Set output: " << var->Name() + << "'s OverridedStopGradient as " + << var->OverridedStopGradient(); + } + } + } return true; } } @@ -78,28 +89,36 @@ py::object PyLayerApply(const platform::Place& place, const py::handle& cls, // process args,`input_vars` only collect `imperative::VarBase` if (!args.empty()) { for (auto ptr = args.begin(); ptr != args.end(); ptr++) { - try { - if (Py_None != ptr->ptr()) { + // Only collect Tensor type in 'args' and pass them to backward. Ignore + // other types of input temporarily. + if (py::isinstance(*ptr)) { + try { auto a = ptr->cast>(); input_vars.push_back(a); + } catch (py::cast_error& err) { + PADDLE_THROW(platform::errors::InvalidArgument( + "The `PyLayer.forward` function contains invalid argument, the " + "`%s` type argument can not be cast into `Tensor`.", + ptr->ptr()->ob_type->tp_name)); } - } catch (py::cast_error& err) { - // Only collect Tensor type in 'args' and pass them to backward. Ignore - // other types of input temporarily. } } } // process kwargs, only collect `imperative::VarBase` if (!kwargs.empty()) { for (auto ptr = kwargs.begin(); ptr != kwargs.end(); ptr++) { - try { - if (Py_None != ptr->second.ptr()) { + // Only collect Tensor type in 'kwargs' and pass them to backward. + // Ignore other types of input temporarily. + if (py::isinstance(*ptr->second)) { + try { auto a = ptr->second.cast>(); input_vars.push_back(a); + } catch (py::cast_error&) { + PADDLE_THROW(platform::errors::InvalidArgument( + "The `PyLayer.forward` function contains invalid argument, the " + "`%s` type argument can not be cast into `Tensor`.", + ptr->second.ptr()->ob_type->tp_name)); } - } catch (py::cast_error&) { - // Only collect Tensor type in 'kwargs' and pass them to backward. - // Ignore other types of input temporarily. } } } @@ -110,33 +129,35 @@ py::object PyLayerApply(const platform::Place& place, const py::handle& cls, PyList_Check(result_forward.ptr())) { auto tuple_result = result_forward.cast(); for (size_t i = 0; i < tuple_result.size(); i++) { - if (Py_None != tuple_result[i].ptr()) { + // Only collect Tensor type of output and pass them to backward. + // Ignore other types of input temporarily. + if (py::isinstance(tuple_result[i])) { try { auto temp_out = tuple_result[i].cast>(); output_vars.push_back(temp_out); } catch (py::cast_error&) { - // Only collect Tensor type in 'kwargs' and pass them to backward. - // Ignore other types of input temporarily. 
+ PADDLE_THROW(platform::errors::InvalidArgument( + "The `PyLayer.forward` function returns invalid argument, the " + "`%s` type argument can not be cast into `Tensor`.", + tuple_result[i].ptr()->ob_type->tp_name)); } - } else { - // Only collect Tensor type in 'kwargs' and pass them to backward. - // Ignore other types of input temporarily. } } } else { - if (Py_None != result_forward.ptr()) { + // Only collect Tensor type of output and pass them to backward. + // Ignore other types of input temporarily. + if (py::isinstance(result_forward)) { try { auto temp_out = result_forward.cast>(); output_vars.push_back(temp_out); } catch (py::cast_error&) { - // Only collect Tensor type in 'kwargs' and pass them to backward. - // Ignore other types of input temporarily. + PADDLE_THROW(platform::errors::InvalidArgument( + "The `PyLayer.forward` function returns invalid argument, the `%s` " + "type argument can not be cast into `Tensor`.", + result_forward.ptr()->ob_type->tp_name)); } - } else { - // Only collect Tensor type in 'kwargs' and pass them to backward. - // Ignore other types of input temporarily. } } if (output_vars.size() == 0) { diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index e628216a5ed87b42a3f37a1adb86f441a735151e..e0dc0f72f17dc535fc7a1524fbe44cd93ae5906a 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -303,7 +303,9 @@ static void DisablePrepareDataOpt( disable_opt || pre_disable_opt); } // disable prepare data if unfriendly op is found - disable_opt = IsPrepareDataOptTargetOp(op); + if (!disable_opt) { + disable_opt = IsPrepareDataOptTargetOp(op); + } } } diff --git a/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc index 0b97b5d87a3d506e9e14ea5780a9e7b4ac471c83..de5d3110e189030568b3dfeb5a04e5dbe249ae58 100644 --- a/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc @@ -46,13 +46,6 @@ class LayerNormOpConverter : public OpConverter { auto* Bias_t = Bias_v->GetMutable(); auto* Scale_t = Scale_v->GetMutable(); - int input_num = 1; - for (int i = 0; i < X->getDimensions().nbDims; i++) { - input_num *= X->getDimensions().d[i]; - } - std::vector mean_shape{input_num}; - std::vector variance_shape{input_num}; - std::unique_ptr bias_tensor( new framework::LoDTensor()); std::unique_ptr scale_tensor( @@ -68,10 +61,33 @@ class LayerNormOpConverter : public OpConverter { auto* bias_data = bias_tensor->mutable_data(platform::CPUPlace()); auto* scale_data = scale_tensor->mutable_data(platform::CPUPlace()); - plugin::LayerNormPlugin* plugin = new plugin::LayerNormPlugin( - bias_data, bias_tensor->numel(), scale_data, scale_tensor->numel(), - begin_norm_axis, eps, mean_shape, variance_shape); - nvinfer1::IPluginLayer* layernorm_layer = engine_->AddPlugin(&X, 1, plugin); + nvinfer1::ILayer* layernorm_layer = nullptr; + if (engine_->with_dynamic_shape()) { + int input_num = 1; + for (int i = begin_norm_axis; i < X->getDimensions().nbDims; i++) { + input_num *= X->getDimensions().d[i]; + } + std::vector mean_shape{input_num}; + std::vector variance_shape{input_num}; + plugin::LayerNormPluginDynamic* plugin = + new plugin::LayerNormPluginDynamic(bias_data, bias_tensor->numel(), + scale_data, scale_tensor->numel(), + begin_norm_axis, eps, mean_shape, + variance_shape); + layernorm_layer = engine_->AddDynamicPlugin(&X, 1, 
plugin); + } else { + int input_num = 1; + for (int i = begin_norm_axis - 1; i < X->getDimensions().nbDims; i++) { + input_num *= X->getDimensions().d[i]; + } + std::vector mean_shape{input_num}; + std::vector variance_shape{input_num}; + plugin::LayerNormPlugin* plugin = new plugin::LayerNormPlugin( + bias_data, bias_tensor->numel(), scale_data, scale_tensor->numel(), + begin_norm_axis, eps, mean_shape, variance_shape); + layernorm_layer = engine_->AddPlugin( + &X, 1, reinterpret_cast(plugin)); + } auto output_name = op_desc.Output("Y").front(); engine_->SetWeights(op_desc.Input("Bias").front(), std::move(bias_tensor)); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 6c6a59e98d9e21728a9fbca08de1ffb455b55ccd..0dc08a482733a388f8324444fb5a8d33e4bfb372 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -703,7 +703,7 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, return false; // Paddle-TRT does not support the input tensors: Shape and ShapeTensor } else if (desc.Input("Shape").size() >= 1 || - desc.Input("ShapeTensor").size() >= 1) { + desc.Input("ShapeTensor").size() >= 1 || with_dynamic_shape) { return false; } else { std::vector shape = diff --git a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu index 8af036a0e86709336b0ef8b3310442cb7374bfbc..d67820a6f0af4f77d5d0fbb1fe02dac680dfd980 100644 --- a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu @@ -57,8 +57,18 @@ int LayerNormPlugin::enqueue(int batch_size, const void *const *inputs, input_shape.push_back(input_dims.d[i]); } const auto input_ddim = framework::make_ddim(input_shape); - auto matrix_dim = framework::flatten_to_2d(input_ddim, begin_norm_axis - 1); + auto matrix_dim = framework::flatten_to_2d(input_ddim, begin_norm_axis); int feature_size = static_cast(matrix_dim[1]); + PADDLE_ENFORCE_EQ(feature_size, scale_.size(), + platform::errors::InvalidArgument( + "scale's size should be equal to the feature_size," + "but got feature_size:%d, scale's size:%d.", + feature_size, scale_.size())); + PADDLE_ENFORCE_EQ(feature_size, bias_.size(), + platform::errors::InvalidArgument( + "bias's size should be equal to the feature_size," + "but got feature_size:%d, bias's size:%d.", + feature_size, bias_.size())); scale_t.Resize(framework::make_ddim({feature_size})); bias_t.Resize(framework::make_ddim({feature_size})); @@ -82,6 +92,163 @@ int LayerNormPlugin::enqueue(int batch_size, const void *const *inputs, return cudaGetLastError() != cudaSuccess; } +nvinfer1::DimsExprs LayerNormPluginDynamic::getOutputDimensions( + int output_index, const nvinfer1::DimsExprs *inputDims, int nb_inputs, + nvinfer1::IExprBuilder &expr_builder) { + return inputDims[0]; +} + +bool LayerNormPluginDynamic::supportsFormatCombination( + int pos, const nvinfer1::PluginTensorDesc *in_out, int nb_inputs, + int nb_outputs) { + PADDLE_ENFORCE_NOT_NULL( + in_out, platform::errors::InvalidArgument( + "The input of layernorm plugin shoule not be nullptr.")); + PADDLE_ENFORCE_LT( + pos, nb_inputs + nb_outputs, + platform::errors::InvalidArgument("The pos(%d) should be less than the " + "num(%d) of the input and the output.", + pos, nb_inputs + nb_outputs)); + const nvinfer1::PluginTensorDesc &in = in_out[pos]; + if (pos == 0) { + // TODO(Shangzhizhou) FP16 
support + return (in.type == nvinfer1::DataType::kFLOAT) && + (in.format == nvinfer1::TensorFormat::kLINEAR); + } + const nvinfer1::PluginTensorDesc &prev = in_out[pos - 1]; + // output + return in.type == prev.type && in.format == prev.format; +} + +nvinfer1::DataType LayerNormPluginDynamic::getOutputDataType( + int index, const nvinfer1::DataType *input_types, int nb_inputs) const { + PADDLE_ENFORCE_EQ(index, 0, + platform::errors::InvalidArgument( + "The LayerNormPlugin only has one input, so the " + "index value should be 0, but get %d.", + index)); + return input_types[0]; +} + +int LayerNormPluginDynamic::enqueue( + const nvinfer1::PluginTensorDesc *input_desc, + const nvinfer1::PluginTensorDesc *output_desc, const void *const *inputs, + void *const *outputs, void *workspace, cudaStream_t stream) { + const auto &input_dims = input_desc[0].dims; + int begin_norm_axis = begin_norm_axis_; + float eps = eps_; + + std::vector input_shape; + for (int i = 0; i < input_dims.nbDims; i++) { + input_shape.push_back(input_dims.d[i]); + } + const auto input_ddim = framework::make_ddim(input_shape); + auto matrix_dim = framework::flatten_to_2d(input_ddim, begin_norm_axis); + int feature_size = static_cast(matrix_dim[1]); + PADDLE_ENFORCE_EQ(feature_size, scale_.size(), + platform::errors::InvalidArgument( + "scale's size should be equal to the feature_size," + "but got feature_size:%d, scale's size:%d.", + feature_size, scale_.size())); + PADDLE_ENFORCE_EQ(feature_size, bias_.size(), + platform::errors::InvalidArgument( + "bias's size should be equal to the feature_size," + "but got feature_size:%d, bias's size:%d.", + feature_size, bias_.size())); + int device_id; + cudaGetDevice(&device_id); + auto input_type = input_desc[0].type; + if (input_type == nvinfer1::DataType::kFLOAT) { + VLOG(1) << "TRT Plugin DataType selected. LayerNorm-->fp32"; + const float *input = reinterpret_cast(inputs[0]); + float *output = static_cast(outputs[0]); + scale_t.Resize(framework::make_ddim({feature_size})); + bias_t.Resize(framework::make_ddim({feature_size})); + mean_t.Resize(framework::make_ddim(mean_shape_)); + variance_t.Resize(framework::make_ddim(variance_shape_)); + + float *scale_d = + scale_t.mutable_data(platform::CUDAPlace(device_id)); + float *bias_d = bias_t.mutable_data(platform::CUDAPlace(device_id)); + float *mean_d = mean_t.mutable_data(platform::CUDAPlace(device_id)); + float *variance_d = + variance_t.mutable_data(platform::CUDAPlace(device_id)); + + cudaMemcpyAsync(scale_d, scale_.data(), sizeof(float) * feature_size, + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(bias_d, bias_.data(), sizeof(float) * feature_size, + cudaMemcpyHostToDevice, stream); + + paddle::operators::LayerNormDirectCUDAFunctor layer_norm; + layer_norm(stream, input, input_shape, bias_d, scale_d, output, mean_d, + variance_d, begin_norm_axis, eps); + } else if (input_type == nvinfer1::DataType::kHALF) { +#ifdef TRT_PLUGIN_FP16_AVALIABLE + VLOG(1) << "TRT Plugin DataType selected. 
LayerNorm-->fp16"; + const half *input = reinterpret_cast(inputs[0]); + half *output = static_cast(outputs[0]); + size_t mean_shape_product = 1; + for (auto s : mean_shape_) { + mean_shape_product *= s; + } + size_t variance_shape_product = 1; + for (auto s : variance_shape_) { + variance_shape_product *= s; + } + if (!scale_gpu_half_d_) { + cudaMalloc(&scale_gpu_half_d_, feature_size * sizeof(half)); + } + if (!bias_gpu_half_d_) { + cudaMalloc(&bias_gpu_half_d_, feature_size * sizeof(half)); + } + if (!mean_gpu_half_d_) { + cudaMalloc(&mean_gpu_half_d_, mean_shape_product * sizeof(half)); + } + if (!variance_gpu_half_d_) { + cudaMalloc(&variance_gpu_half_d_, variance_shape_product * sizeof(half)); + } + + half *scale_cpu_half = + static_cast(malloc(feature_size * sizeof(half))); + half *bias_cpu_half = + static_cast(malloc(feature_size * sizeof(half))); + PADDLE_ENFORCE_EQ( + scale_cpu_half && bias_cpu_half, true, + platform::errors::Unavailable("Out of memory, malloc size %d.", + feature_size * sizeof(half))); + + for (int i = 0; i < feature_size; i++) { + scale_cpu_half[i] = static_cast(scale_[i]); + bias_cpu_half[i] = static_cast(bias_[i]); + } + cudaMemcpyAsync(scale_gpu_half_d_, scale_cpu_half, + sizeof(half) * feature_size, cudaMemcpyHostToDevice, + stream); + cudaMemcpyAsync(bias_gpu_half_d_, bias_cpu_half, + sizeof(half) * feature_size, cudaMemcpyHostToDevice, + stream); + free(scale_cpu_half); + free(bias_cpu_half); + + paddle::operators::LayerNormDirectCUDAFunctor layer_norm; + layer_norm(stream, input, input_shape, bias_gpu_half_d_, scale_gpu_half_d_, + output, mean_gpu_half_d_, variance_gpu_half_d_, begin_norm_axis, + eps); +#else + PADDLE_THROW(platform::errors::Fatal( + "The layer_norm tensorRT plugin should be " + "complied with CUDA version >= 10.0 when running with fp16. " + "Please recomplie it or try to use fp32 by set " + "config.SetTRTDynamicShapeInfo(min_input_shape, " + "max_input_shape, opt_input_shape, true")); +#endif + } else { + PADDLE_THROW(platform::errors::Fatal( + "The LayerNorm TRT Plugin's input type should be float or half.")); + } + return cudaGetLastError() != cudaSuccess; +} + } // namespace plugin } // namespace tensorrt } // namespace inference diff --git a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h index 050ef3b77d3157f89edee949a3a86923846cc3f7..1a6125b0e16ffd2ea9478b0fd4261a823d832417 100644 --- a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h @@ -50,7 +50,7 @@ class LayerNormPlugin : public PluginTensorRT { // TRT will call this func when we need to serialize the configuration of // tensorrt. // It should not be called by users. - void serialize(void *buffer) override { + void serialize(void* buffer) override { SerializeValue(&buffer, getPluginType()); serializeBase(buffer); SerializeValue(&buffer, bias_); @@ -62,7 +62,7 @@ class LayerNormPlugin : public PluginTensorRT { } public: - LayerNormPlugin(const float *bias, const int bias_num, const float *scale, + LayerNormPlugin(const float* bias, const int bias_num, const float* scale, const int scale_num, int begin_norm_axis, float eps, std::vector mean_shape, std::vector variance_shape) @@ -78,7 +78,7 @@ class LayerNormPlugin : public PluginTensorRT { // It was used for tensorrt deserialization. // It should not be called by users. 
- LayerNormPlugin(void const *serialData, size_t serialLength) { + LayerNormPlugin(void const* serialData, size_t serialLength) { deserializeBase(serialData, serialLength); DeserializeValue(&serialData, &serialLength, &bias_); DeserializeValue(&serialData, &serialLength, &scale_); @@ -90,20 +90,180 @@ class LayerNormPlugin : public PluginTensorRT { ~LayerNormPlugin() {} int initialize() override; - LayerNormPlugin *clone() const override { + LayerNormPlugin* clone() const override { return new LayerNormPlugin(bias_.data(), bias_.size(), scale_.data(), scale_.size(), begin_norm_axis_, eps_, mean_shape_, variance_shape_); } - const char *getPluginType() const override { return "layer_norm_plugin"; } + const char* getPluginType() const override { return "layer_norm_plugin"; } int getNbOutputs() const override { return 1; } - nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims *inputs, + nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) override; - int enqueue(int batchSize, const void *const *inputs, void **outputs, - void *workspace, cudaStream_t stream) override; + int enqueue(int batchSize, const void* const* inputs, void** outputs, + void* workspace, cudaStream_t stream) override; }; +class LayerNormPluginDynamic : public DynamicPluginTensorRT { + public: + LayerNormPluginDynamic(const float* bias, const int bias_num, + const float* scale, const int scale_num, + int begin_norm_axis, float eps, + std::vector mean_shape, + std::vector variance_shape) + : begin_norm_axis_(begin_norm_axis), + eps_(eps), + mean_shape_(mean_shape), + variance_shape_(variance_shape), + scale_gpu_half_d_(nullptr), + bias_gpu_half_d_(nullptr), + mean_gpu_half_d_(nullptr), + variance_gpu_half_d_(nullptr) { + bias_.resize(bias_num); + scale_.resize(scale_num); + std::copy(bias, bias + bias_num, bias_.data()); + std::copy(scale, scale + scale_num, scale_.data()); + } + + LayerNormPluginDynamic(void const* serialData, size_t serialLength) + : scale_gpu_half_d_(nullptr), + bias_gpu_half_d_(nullptr), + mean_gpu_half_d_(nullptr), + variance_gpu_half_d_(nullptr) { + DeserializeValue(&serialData, &serialLength, &bias_); + DeserializeValue(&serialData, &serialLength, &scale_); + DeserializeValue(&serialData, &serialLength, &begin_norm_axis_); + DeserializeValue(&serialData, &serialLength, &eps_); + DeserializeValue(&serialData, &serialLength, &mean_shape_); + DeserializeValue(&serialData, &serialLength, &variance_shape_); + } + nvinfer1::IPluginV2DynamicExt* clone() const override { + return new LayerNormPluginDynamic(bias_.data(), bias_.size(), scale_.data(), + scale_.size(), begin_norm_axis_, eps_, + mean_shape_, variance_shape_); + } + + const char* getPluginType() const override { return "layernorm_plugin"; } + int getNbOutputs() const override { return 1; } + int initialize() override { return 0; } + + size_t getSerializationSize() const override { + return SerializedSize(bias_) + SerializedSize(scale_) + + SerializedSize(begin_norm_axis_) + SerializedSize(eps_) + + SerializedSize(mean_shape_) + SerializedSize(variance_shape_); + } + + void serialize(void* buffer) const override { + SerializeValue(&buffer, bias_); + SerializeValue(&buffer, scale_); + SerializeValue(&buffer, begin_norm_axis_); + SerializeValue(&buffer, eps_); + SerializeValue(&buffer, mean_shape_); + SerializeValue(&buffer, variance_shape_); + } + + nvinfer1::DimsExprs getOutputDimensions( + int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs, + nvinfer1::IExprBuilder& 
expr_builder) override; + + bool supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* inOut, + int nbInputs, int nbOutputs) override; + + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nbOutputs) override {} + + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const override { + return 0; + } + + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, void* const* outputs, void* workspace, + cudaStream_t stream) override; + nvinfer1::DataType getOutputDataType(int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const override; + + ~LayerNormPluginDynamic() { + if (scale_gpu_half_d_) { + cudaFree(scale_gpu_half_d_); + } + if (bias_gpu_half_d_) { + cudaFree(bias_gpu_half_d_); + } + if (mean_gpu_half_d_) { + cudaFree(mean_gpu_half_d_); + } + if (variance_gpu_half_d_) { + cudaFree(variance_gpu_half_d_); + } + } + + void destroy() override { delete this; } + + private: + std::vector bias_; + std::vector scale_; + framework::Tensor scale_t; + framework::Tensor bias_t; + framework::Tensor mean_t; + framework::Tensor variance_t; + int begin_norm_axis_; + float eps_; + std::vector mean_shape_; + std::vector variance_shape_; + half* scale_gpu_half_d_; + half* bias_gpu_half_d_; + half* mean_gpu_half_d_; + half* variance_gpu_half_d_; +}; + +class LayerNormPluginDynamicCreator : public nvinfer1::IPluginCreator { + public: + LayerNormPluginDynamicCreator() {} + const char* getPluginName() const override { return "layernorm_plugin"; } + + const char* getPluginVersion() const override { return "1"; } + + const nvinfer1::PluginFieldCollection* getFieldNames() override { + return &field_collection_; + } + + nvinfer1::IPluginV2* createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) override { + return nullptr; + } + + nvinfer1::IPluginV2* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) override { + auto plugin = new LayerNormPluginDynamic(serial_data, serial_length); + return plugin; + } + + void setPluginNamespace(const char* lib_namespace) override { + plugin_namespace_ = lib_namespace; + } + + const char* getPluginNamespace() const override { + return plugin_namespace_.c_str(); + } + + private: + std::string plugin_namespace_; + std::string plugin_name_; + nvinfer1::PluginFieldCollection field_collection_{0, nullptr}; + std::vector plugin_attributes_; +}; + +REGISTER_TRT_PLUGIN_V2(LayerNormPluginDynamicCreator); + } // namespace plugin } // namespace tensorrt } // namespace inference diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 07208d016a79083079707e38dd0207b4d1c282a2..f0eb0d1fa675b7e88aae44acd79e425a2bc70e47 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -325,11 +325,10 @@ inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_te # densebox set(DENSEBOX_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/densebox") download_data_without_verify(${DENSEBOX_INSTALL_DIR} "densebox.tar.gz") -#inference_analysis_test(test_analyzer_detect SRCS analyzer_detect_tester.cc -# EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} -# ARGS --infer_model=${DENSEBOX_INSTALL_DIR}/model 
--infer_data=${DENSEBOX_INSTALL_DIR}/detect_input_50.txt -# --infer_shape=${DENSEBOX_INSTALL_DIR}/shape_50.txt) -#set_property(TEST test_analyzer_detect PROPERTY ENVIRONMENT GLOG_vmodule=analysis_predictor=2) +inference_analysis_test(test_analyzer_detect_functional_mkldnn SRCS analyzer_detect_functional_mkldnn_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${DENSEBOX_INSTALL_DIR}/model --infer_data=${DENSEBOX_INSTALL_DIR}/detect_input_50.txt + --infer_shape=${DENSEBOX_INSTALL_DIR}/shape_50.txt) # mobilenet with transpose op set(MOBILENET_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet") diff --git a/paddle/fluid/inference/tests/api/analyzer_detect_functional_mkldnn_tester.cc b/paddle/fluid/inference/tests/api/analyzer_detect_functional_mkldnn_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..f157f6b0b82ea9a4759d68d522acd614a98a5f6c --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_detect_functional_mkldnn_tester.cc @@ -0,0 +1,153 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include "paddle/fluid/inference/tests/api/tester_helper.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/place.h" + +DEFINE_string(infer_shape, "", "data shape file"); +DEFINE_int32(sample, 20, "number of sample"); + +namespace paddle { +namespace inference { +namespace analysis { + +struct Record { + std::vector data; + std::vector shape; +}; + +Record ProcessALine(const std::string &line, const std::string &shape_line) { + VLOG(3) << "process a line"; + + Record record; + std::vector data_strs; + split(line, ' ', &data_strs); + for (auto &d : data_strs) { + record.data.push_back(std::stof(d)); + } + + std::vector shape_strs; + split(shape_line, ' ', &shape_strs); + for (auto &s : shape_strs) { + record.shape.push_back(std::stoi(s)); + } + return record; +} + +void SetConfig(AnalysisConfig *cfg) { + cfg->SetModel(FLAGS_infer_model + "/model", FLAGS_infer_model + "/params"); + cfg->DisableGpu(); + // cfg->SwitchIrDebug(); // Enable to have graphs dumped + cfg->SwitchSpecifyInputNames(false); + cfg->SetCpuMathLibraryNumThreads(FLAGS_cpu_num_threads); +} + +void SetInput(std::vector> *inputs, + const std::string &line, const std::string &shape_line) { + auto record = ProcessALine(line, shape_line); + + PaddleTensor input; + input.shape = record.shape; + input.dtype = PaddleDType::FLOAT32; + size_t input_size = record.data.size() * sizeof(float); + input.data.Resize(input_size); + memcpy(input.data.data(), record.data.data(), input_size); + std::vector input_slots; + input_slots.assign({input}); + (*inputs).emplace_back(input_slots); +} + +#ifdef PADDLE_WITH_MKLDNN +int GetNumCachedObjects(void) { + auto &pool = platform::DeviceContextPool::Instance(); + platform::CPUPlace place; + auto onednn_dev_ctx = + dynamic_cast(pool.Get(place)); + return onednn_dev_ctx->GetCachedObjectsNumber(); +} + +void validate_cache_onednn(int cache_capacity 
= 1) { + AnalysisConfig cfg; + SetConfig(&cfg); + cfg.EnableMKLDNN(); + cfg.SetMkldnnCacheCapacity(cache_capacity); + + auto predictor = CreatePaddlePredictor(cfg); + std::vector> ref_outputs; + std::vector> input_slots_all; + + std::ifstream file(FLAGS_infer_data); + std::ifstream infer_file(FLAGS_infer_shape); + std::vector lines; + std::vector shape_lines; + + // Let's work with 4 samples + auto num_samples = 4; + ref_outputs.resize(num_samples); + lines.resize(num_samples); + shape_lines.resize(num_samples); + + // Let's remember number of cached objects before + // execution and after every single execution + std::vector cache_filling; + cache_filling.push_back(GetNumCachedObjects()); + + // compute sequentially prediction + for (int i = 0; i < num_samples; ++i) { + std::getline(file, lines[i]); + std::getline(infer_file, shape_lines[i]); + SetInput(&input_slots_all, lines[i], shape_lines[i]); + predictor->Run(input_slots_all[i], &ref_outputs[i], FLAGS_batch_size); + // record number of cached objects + cache_filling.push_back(GetNumCachedObjects()); + } + + file.close(); + infer_file.close(); + + predictor.reset(nullptr); + cache_filling.push_back(GetNumCachedObjects()); + + // Compare results + // First and last value should be equal e.g. before using cache (empty) and + // after releasing executor + PADDLE_ENFORCE_EQ( + cache_filling[0], cache_filling[cache_filling.size() - 1], + platform::errors::Fatal("Cache size before execution and after " + "releasing Executor do not match")); + + // Iterate to check if cache is not increasing + // over exceeding cache capacity + if (cache_capacity != 0) { + for (int i = cache_capacity + 1; i < num_samples + 1; ++i) { + PADDLE_ENFORCE_EQ( + cache_filling[cache_capacity], cache_filling[i], + platform::errors::Fatal("Cache capacity should not increase " + "after full capacity is used")); + } + } +} + +TEST(Analyzer_detect, validate_cache_onednn) { + validate_cache_onednn(2 /*cache_capacity */); +} +#endif + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py b/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py index e911c94208711e3cd6929a68024c8957a5aae334..adb6aa4d75344d767ce44019f3c1162956087210 100644 --- a/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py +++ b/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py @@ -167,7 +167,7 @@ def run_convert(): os.path.getsize(output_file) == FULL_SIZE_BYTES): if os.path.exists(output_file): sys.stderr.write( - "\n\nThe existing binary file is broken. Start to generate new one...\n\n". + "\n\nThe existing binary file[{}] is broken. Start to generate new one...\n\n". format(output_file)) os.remove(output_file) if retry < try_limit: diff --git a/paddle/fluid/operators/coalesce_tensor_op.cc b/paddle/fluid/operators/coalesce_tensor_op.cc index 153fa529f96a5980c6b95baedce6a6dcc0b26f6e..c1c4f14582e95e5b971437796e58e5a420427453 100644 --- a/paddle/fluid/operators/coalesce_tensor_op.cc +++ b/paddle/fluid/operators/coalesce_tensor_op.cc @@ -120,6 +120,7 @@ class CoalesceTensorOpKernel : public framework::OpKernel { : len; } } else if (context.Attr("set_constant")) { + // TODO(Liu yuang) ADD NPU SET_CONSTANT FUNCTION. 
math::SetConstant set_constant; set_constant(dev_ctx, fused_tensor, static_cast(context.Attr("constant"))); @@ -145,6 +146,14 @@ class CoalesceTensorOpKernel : public framework::OpKernel { offset = 0; std::stringstream ss; ss << "alloc_space_for_vars: "; +#if defined(PADDLE_WITH_ASCEND_CL) + auto stream = + context.template device_context() + .stream(); + platform::NPUMemsetAsync( + static_cast(fused_tensor->mutable_data(dev_ctx.GetPlace())), + 0.0, fused_tensor->numel() * sizeof(T), stream); +#endif for (size_t i = 0; i < out_tensors.size(); ++i) { size_t len = static_cast(out_tensors[i]->numel()); auto dim = out_tensors[i]->dims(); @@ -160,6 +169,12 @@ class CoalesceTensorOpKernel : public framework::OpKernel { ss << "output(" << out_var_names[i] << ") dim:(" << dim << ")" << " address: " << out_tensors[i]->data() << ", "; } + PADDLE_ENFORCE_EQ( + (int64_t)offset, fused_tensor->numel(), + platform::errors::InvalidArgument( + "The alloc_space_for_vars's offset: %s is unequal with " + "fused_tensor's numel: %s.", + offset, fused_tensor->numel())); VLOG(10) << ss.str(); } @@ -191,13 +206,13 @@ class CoalesceTensorOpKernel : public framework::OpKernel { ss << "input(" << var_names[i] << ") dim:(" << lod_tensors[i]->dims() << ") " << " addres:" << lod_tensors[i]->data() << ", "; + *numel += use_align ? platform::Alignment( static_cast(size) * size_of_dtype, place) / size_of_dtype : static_cast(size); } - VLOG(10) << ss.str(); } }; @@ -309,6 +324,16 @@ REGISTER_OP_XPU_KERNEL( ops::CoalesceTensorOpKernel); #endif +#if defined(PADDLE_WITH_ASCEND_CL) +REGISTER_OP_NPU_KERNEL( + coalesce_tensor, + ops::CoalesceTensorOpKernel, + ops::CoalesceTensorOpKernel, + ops::CoalesceTensorOpKernel, + ops::CoalesceTensorOpKernel); +#endif + REGISTER_OP_VERSION(coalesce_tensor) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/collective/barrier_op.cu.cc b/paddle/fluid/operators/collective/barrier_op.cu.cc index f6281aa8ca2710bd7281088f5d477278c93fe328..b8631b44f14caac162dd332f715b825e42bf31af 100644 --- a/paddle/fluid/operators/collective/barrier_op.cu.cc +++ b/paddle/fluid/operators/collective/barrier_op.cu.cc @@ -43,12 +43,10 @@ class BarrierOpCUDAKernel : public framework::OpKernel { ncclRedOp_t nccl_red_type = ncclSum; PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( sendbuff, recvbuff, numel, dtype, nccl_red_type, comm->comm(), stream)); - auto comm_stream = - platform::NCCLCommContext::Instance().Get(rid, place)->stream(); #ifdef PADDLE_WITH_RCCL - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(comm_stream)); + PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(comm_stream)); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); #endif #else PADDLE_THROW(platform::errors::Unavailable( diff --git a/paddle/fluid/operators/collective/c_embedding_op.cc b/paddle/fluid/operators/collective/c_embedding_op.cc index 094ef9c8d4ef6e58e4ad639ffbf32b5ea2e68561..3055e2ceb23dd239cf98188aa81a0d783b4f9e96 100644 --- a/paddle/fluid/operators/collective/c_embedding_op.cc +++ b/paddle/fluid/operators/collective/c_embedding_op.cc @@ -31,13 +31,12 @@ class CEmbeddingOp : public framework::OperatorWithKernel { int ids_rank = ids_dims.size(); VLOG(5) << "ids rank is " << ids_rank << std::endl; - PADDLE_ENFORCE_EQ( - table_dims.size(), 2, - platform::errors::InvalidArgument( - "ShapeError: The dimensions of the 'c_embedding' must be 2. 
" - "But received c_embedding's dimensions = %d, " - "c_embedding's shape = [%s].", - table_dims.size(), table_dims)); + PADDLE_ENFORCE_EQ(table_dims.size(), 2, + platform::errors::InvalidArgument( + "The dimensions of the 'c_embedding' must be 2. " + "But received c_embedding's dimensions = %d, " + "c_embedding's shape = [%s].", + table_dims.size(), table_dims)); auto output_dims = framework::vectorize(ids_dims); output_dims.push_back(table_dims[1]); diff --git a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cc b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f75e1b3c7aedccbd0405ae26a952aa0b19b40a6d --- /dev/null +++ b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cc @@ -0,0 +1,194 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h" + +namespace paddle { +namespace operators { + +class CSoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("Logits"), "Input", "Logits", + "CSoftmaxWithCrossEntropyOp"); + OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", + "CSoftmaxWithCrossEntropyOp"); + + OP_INOUT_CHECK(ctx->HasOutput("Softmax"), "Output", "Softmax", + "CSoftmaxWithCrossEntropyOp"); + OP_INOUT_CHECK(ctx->HasOutput("Loss"), "Output", "Loss", + "CSoftmaxWithCrossEntropyOp"); + + auto logits_dims = ctx->GetInputDim("Logits"); + auto labels_dims = ctx->GetInputDim("Label"); + + auto logits_rank = logits_dims.size(); + auto axis = logits_rank - 1; + for (int i = 0; i < logits_rank; i++) { + if (i != axis) { + if (ctx->IsRuntime() || (logits_dims[i] > 0 && labels_dims[i] > 0)) { + PADDLE_ENFORCE_EQ(logits_dims[i], labels_dims[i], + platform::errors::InvalidArgument( + "Input(Logits) and Input(Label) should in " + "same shape in dimensions except axis.")); + } + } + } + + PADDLE_ENFORCE_EQ( + labels_dims[logits_rank - 1], 1UL, + platform::errors::InvalidArgument( + "the last dimension of Input(Label) should be 1." 
+ "But received: the last dimension of Input(Label) is [%d]," + "the last dimension is [%d]", + labels_dims[logits_rank - 1], logits_rank - 1)); + + ctx->SetOutputDim("Softmax", logits_dims); + + logits_dims[axis] = 1; + ctx->SetOutputDim("Loss", logits_dims); + + ctx->ShareLoD("Logits", /*->*/ "Softmax"); + ctx->ShareLoD("Logits", /*->*/ "Loss"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "Logits"), + ctx.device_context()); + } +}; + +class CSoftmaxWithCrossEntropyOpMaker + : public framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("Logits", + "(Tensor, default: Tensor), The input tensor of unscaled " + "log probabilities, whose dimension :attr:`axis` should be scaled " + "by softmax."); + AddInput( + "Label", + "(Tensor) The input tensor of groud truth label. If :attr:`soft_label` " + "is set to false, Label is a Tensor in same shape with " + "Input(Logits) except the shape in dimension :attr:`axis` as 1. If " + "soft_label is set to true, Label is a Tensor in same " + "shape with Input(Logits)."); + AddOutput( + "Softmax", + "(Tensor, default: Tensor), A tensor in same shape with " + "Input(Logits). " + "The outputs value of softmax activation by given the input batch, " + "which will be used in backward calculation."); + AddOutput("Loss", + "(Tensor, default: Tensor), A tensor in same shape with " + "Input(Logits) " + "except the shape in dimension :attr:`axis` as 1. The cross " + "entropy loss."); + AddAttr("ring_id", "(int default 0) nccl communication ring id.") + .SetDefault(0); + AddAttr("rank", + "(int default 0) rank id for CSoftmaxWithCrossEntropy.") + .SetDefault(0); + AddAttr("nranks", + "(int default 1) nranks id for CSoftmaxWithCrossEntropy.") + .SetDefault(0); + AddComment(R"DOC( +CSoftmaxWithCrossEntropy Operator + +)DOC"); + } +}; + +class CSoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Loss")), true, + platform::errors::InvalidArgument( + "Input(Loss@Grad) should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasInput("Softmax"), true, + platform::errors::InvalidArgument( + "Input(Softmax) should be not null.")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("Label"), true, + platform::errors::InvalidArgument("Input(Label) should be not null.")); + + PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("Logits")), true, + platform::errors::InvalidArgument( + "Output(Logits@Grad) should be not null.")); + + ctx->SetOutputDim(framework::GradVarName("Logits"), + ctx->GetInputDim("Softmax")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Loss")), + ctx.device_context()); + } +}; + +template +class CSoftmaxWithCrossEntropyOpGradMaker + : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("c_softmax_with_cross_entropy_grad"); + + op->SetInput("Softmax", this->Output("Softmax")); + op->SetInput("Label", this->Input("Label")); + op->SetInput(framework::GradVarName("Loss"), 
this->OutputGrad("Loss")); + op->SetAttrMap(this->Attrs()); + op->SetOutput(framework::GradVarName("Logits"), this->InputGrad("Logits")); + } +}; + +DECLARE_INPLACE_OP_INFERER(CSoftmaxWithCrossEntropyInplaceInferer, + {"Logits", "Softmax"}); + +DECLARE_INPLACE_OP_INFERER(CSoftmaxWithCrossEntropyGradInplaceInferer, + {"Softmax", framework::GradVarName("Logits")}); + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OPERATOR( + c_softmax_with_cross_entropy, ops::CSoftmaxWithCrossEntropyOp, + ops::CSoftmaxWithCrossEntropyOpMaker, + ops::CSoftmaxWithCrossEntropyOpGradMaker, + ops::CSoftmaxWithCrossEntropyOpGradMaker, + ops::CSoftmaxWithCrossEntropyInplaceInferer); + +REGISTER_OPERATOR(c_softmax_with_cross_entropy_grad, + ops::CSoftmaxWithCrossEntropyOpGrad, + ops::CSoftmaxWithCrossEntropyGradInplaceInferer); + +REGISTER_OP_CPU_KERNEL(c_softmax_with_cross_entropy, + ops::CSoftmaxWithCrossEntropyOpCPUKernel, + ops::CSoftmaxWithCrossEntropyOpCPUKernel, + ops::CSoftmaxWithCrossEntropyOpCPUKernel); diff --git a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..77db86e7111112ac78bea270413ee9a2c2cba72b --- /dev/null +++ b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu @@ -0,0 +1,262 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h" +#include "paddle/fluid/operators/math/cross_entropy.h" +#include "paddle/fluid/operators/math/softmax_impl.h" +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/string/string_helper.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaxinumNumBlocks = 4096; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaxinumNumBlocks); +} + +template +__global__ void MaskLabelByIndex(T* predicted_logits, const T* logit, + const IndexT* label, const int start_index, + const int end_index, const int64_t N, + const int64_t D, const int nranks) { + CUDA_KERNEL_LOOP(i, N) { + auto real_label = label[i]; + PADDLE_ENFORCE((real_label < D * nranks) && (real_label >= 0), + "The index is out of bounds, " + "please check whether the value of label and " + "input meet the class number. 
It should " + "be less than [%d], but received [%d]", + D * nranks, real_label); + + if (real_label >= start_index && real_label < end_index) { + predicted_logits[i] = logit[i * D + real_label - start_index]; + } + } +} + +template +__global__ void MaskLabelByIndexGrad(T* logits_grad, const T* loss_grad, + const IndexT* labels, + const int start_index, const int end_index, + const int64_t N, const int64_t D) { + CUDA_KERNEL_LOOP(i, N * D) { + auto row = i / D; + auto col = i % D; + if ((col + start_index) == labels[row]) { + logits_grad[i] = (logits_grad[i] - static_cast(1.0)) * loss_grad[row]; + } else { + logits_grad[i] *= loss_grad[row]; + } + } +} + +template +class CSoftmaxWithCrossEntropyOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const Tensor* logits = ctx.Input("Logits"); + const Tensor* labels = ctx.Input("Label"); + Tensor* softmax = ctx.Output("Softmax"); + Tensor* loss = ctx.Output("Loss"); + + const int rid = ctx.Attr("ring_id"); + const int nranks = ctx.Attr("nranks"); + const int rank = ctx.Attr("rank"); + + const auto& place = ctx.GetPlace(); + const auto& comm = platform::NCCLCommContext::Instance().Get(rid, place); + auto& dev_ctx = ctx.template device_context(); + + // use global calculate stream + const auto stream = static_cast( + platform::DeviceContextPool::Instance().Get(place)) + ->stream(); + + // allocate memory on device. + softmax->mutable_data(place); + loss->mutable_data(place); + + const auto& logits_dims = logits->dims(); + const auto& labels_dims = labels->dims(); + + const int axis = logits_dims.size() - 1; + const int N = SizeToAxis(axis, logits_dims); + const int D = SizeFromAxis(axis, logits_dims); + + Tensor logits_2d, softmax_2d, loss_2d; + logits_2d.ShareDataWith(*logits).Resize({N, D}); + softmax_2d.ShareDataWith(*softmax).Resize({N, D}); + loss_2d.ShareDataWith(*loss).Resize({N, 1}); + + auto eigen_logits = math::EigenMatrix::From(logits_2d); + auto eigen_softmax = math::EigenMatrix::From(softmax_2d); + + // step 1, obtain logit_max + Tensor logits_max; + logits_max = + ctx.AllocateTmpTensor({N, 1}, dev_ctx); + void* logits_max_buff = logits_max.mutable_data(place); + + auto eigen_logits_max = math::EigenMatrix::From(logits_max); + Eigen::DSizes along_axis(1); + eigen_logits_max.device(*dev_ctx.eigen_device()) = + eigen_logits.maximum(along_axis); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + logits_max_buff, logits_max_buff, logits_max.numel(), + platform::ToNCCLDataType(logits_max.type()), ncclMax, comm->comm(), + stream)); + + // step 2, obtain logit - logit_max + Eigen::DSizes batch_by_one(N, 1); + Eigen::DSizes one_by_class(1, D); + + eigen_softmax.device(*dev_ctx.eigen_device()) = + (eigen_logits - + eigen_logits_max.reshape(batch_by_one).broadcast(one_by_class)) + .unaryExpr(math::ValueClip()); + + // step 3, obtain predict target + Tensor predicted_logits; + predicted_logits = + ctx.AllocateTmpTensor({N, 1}, dev_ctx); + predicted_logits.mutable_data(place); + + auto t = framework::EigenVector::Flatten(predicted_logits); + t.device(*dev_ctx.eigen_device()) = t.constant(static_cast(0)); + + const int start_index = rank * D; + const int end_index = start_index + D; + + int blocks = NumBlocks(N); + int threads = kNumCUDAThreads; + const auto& label_type = labels->type(); + + if (label_type == framework::proto::VarType::INT32) { + MaskLabelByIndex<<>>( + predicted_logits.data(), softmax_2d.data(), + labels->data(), start_index, end_index, N, 
D, nranks); + } else if (label_type == framework::proto::VarType::INT64) { + MaskLabelByIndex<<>>( + predicted_logits.data(), softmax_2d.data(), + labels->data(), start_index, end_index, N, D, nranks); + } + + void* predict_logits_buff = predicted_logits.mutable_data(place); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + predict_logits_buff, predict_logits_buff, predicted_logits.numel(), + platform::ToNCCLDataType(predicted_logits.type()), ncclSum, + comm->comm(), stream)); + + // step 4, obtain exp(logit) + eigen_softmax.device(*dev_ctx.eigen_device()) = eigen_softmax.exp(); + + // step 5, obtain sum_exp_logits + Tensor sum_exp_logits; + sum_exp_logits = + ctx.AllocateTmpTensor({N, 1}, dev_ctx); + void* sum_exp_logits_buff = sum_exp_logits.mutable_data(place); + + auto eigen_sum_exp_logits = math::EigenMatrix::From(sum_exp_logits); + eigen_sum_exp_logits.device(*dev_ctx.eigen_device()) = + eigen_softmax.sum(along_axis); + + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + sum_exp_logits_buff, sum_exp_logits_buff, sum_exp_logits.numel(), + platform::ToNCCLDataType(sum_exp_logits.type()), ncclSum, comm->comm(), + stream)); + + auto eigen_loss = math::EigenMatrix::From(loss_2d); + auto eigen_predicted_logits = math::EigenMatrix::From(predicted_logits); + + eigen_loss.device(*dev_ctx.eigen_device()) = + (eigen_sum_exp_logits.log().unaryExpr(math::TolerableValue()) - + eigen_predicted_logits) + .unaryExpr(math::TolerableValue()); + + eigen_softmax.device(*dev_ctx.eigen_device()) = + (eigen_softmax * + eigen_sum_exp_logits.inverse().broadcast(one_by_class)); + } +}; + +template +class CSoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* labels = context.Input("Label"); + const Tensor* loss_grad = + context.Input(framework::GradVarName("Loss")); + Tensor* logit_grad = + context.Output(framework::GradVarName("Logits")); + const Tensor* softmax = context.Input("Softmax"); + const int rank = context.Attr("rank"); + auto& dev_ctx = + context.template device_context(); + + if (logit_grad != softmax) { + framework::TensorCopy(*softmax, context.GetPlace(), + context.device_context(), logit_grad); + } + const auto sofrmax_dims = softmax->dims(); + const int axis = sofrmax_dims.size() - 1; + const int N = SizeToAxis(axis, sofrmax_dims); + const int D = SizeFromAxis(axis, sofrmax_dims); + + Tensor logit_grad_2d; + logit_grad_2d.ShareDataWith(*logit_grad).Resize({N, D}); + + int blocks = NumBlocks(N * D); + int threads = kNumCUDAThreads; + const auto& label_type = labels->type(); + const int start_index = rank * D; + const int end_index = start_index + D; + + if (label_type == framework::proto::VarType::INT32) { + MaskLabelByIndexGrad<<>>( + logit_grad_2d.data(), loss_grad->data(), + labels->data(), start_index, end_index, N, D); + } else if (label_type == framework::proto::VarType::INT64) { + MaskLabelByIndexGrad<<>>( + logit_grad_2d.data(), loss_grad->data(), + labels->data(), start_index, end_index, N, D); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL( + c_softmax_with_cross_entropy, + ops::CSoftmaxWithCrossEntropyOpCUDAKernel, + ops::CSoftmaxWithCrossEntropyOpCUDAKernel, + ops::CSoftmaxWithCrossEntropyOpCUDAKernel); + +REGISTER_OP_CUDA_KERNEL( + c_softmax_with_cross_entropy_grad, + ops::CSoftmaxWithCrossEntropyGradCUDAKernel, + 
ops::CSoftmaxWithCrossEntropyGradCUDAKernel, + ops::CSoftmaxWithCrossEntropyGradCUDAKernel); diff --git a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h new file mode 100644 index 0000000000000000000000000000000000000000..c7cfd41fa2556873166701c96616323d2b1e40c3 --- /dev/null +++ b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h @@ -0,0 +1,41 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/cross_entropy.h" +#include "paddle/fluid/operators/math/softmax.h" +#include "paddle/fluid/operators/softmax_op.h" + +namespace paddle { +namespace operators { + +template +class CSoftmaxWithCrossEntropyOpCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW(platform::errors::Unavailable( + "Do not support c_embedding for cpu kernel now.")); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/compat/affine_channel.pbtxt b/paddle/fluid/operators/compat/affine_channel.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..444fde59a9631c75ec4a5626a22f2d3a785ffd3d --- /dev/null +++ b/paddle/fluid/operators/compat/affine_channel.pbtxt @@ -0,0 +1,41 @@ +type: "affine_channel" +def { + inputs { + name: "X" + } + inputs { + name: "Scale" + } + inputs { + name: "Bias" + } + attrs { + name: "data_layout" + type: STRING + } + outputs { + name: "Out" + } +} +extra { + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/flatten2.pbtxt b/paddle/fluid/operators/compat/flatten2.pbtxt new file mode 100755 index 0000000000000000000000000000000000000000..6b8a6661a6fd7d66d9a16ee64cefce8bccb374f4 --- /dev/null +++ b/paddle/fluid/operators/compat/flatten2.pbtxt @@ -0,0 +1,38 @@ +type: "flatten2" +def { + inputs { + name: "X" + } + outputs { + name: "Out" + } + outputs { + name: "XShape" + } + attrs { + name: "axis" + type: INT + } +} +extra { + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/gru.pbtxt b/paddle/fluid/operators/compat/gru.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..38aa8a92f75bd92801333cadbc0d1c1b1068c790 --- /dev/null +++ b/paddle/fluid/operators/compat/gru.pbtxt @@ -0,0 
+1,65 @@ +type: "gru" +def { + inputs { + name: "Input" + } + inputs { + name: "H0" + } + inputs { + name: "Weight" + } + inputs { + name: "Bias" + } + outputs { + name: "BatchGate" + } + outputs { + name: "BatchResetHiddenPrev" + } + outputs { + name: "BatchHidden" + } + outputs { + name: "Hidden" + } + attrs { + name: "activation" + type: STRING + } + attrs { + name: "gate_activation" + type: STRING + } + attrs { + name: "is_reverse" + type: BOOLEAN + } + attrs { + name: "origin_mode" + type: BOOLEAN + } +} +extra { + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/layer_norm.pbtxt b/paddle/fluid/operators/compat/layer_norm.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..dbb78e0a8baa1efae2efdf66a8520fcc9a505b65 --- /dev/null +++ b/paddle/fluid/operators/compat/layer_norm.pbtxt @@ -0,0 +1,63 @@ +type: "layer_norm" +def { + inputs { + name: "X" + } + inputs { + name: "Scale" + } + inputs { + name: "Bias" + } + outputs { + name: "Y" + } + outputs { + name: "Mean" + } + outputs { + name: "Variance" + } + attrs { + name: "epsilon" + type: FLOAT + } + attrs { + name: "begin_norm_axis" + type: INT + } +} +extra { + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "mkldnn_data_type" + type: STRING + } + attrs { + name: "is_test" + type: BOOLEAN + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/lstm.pbtxt b/paddle/fluid/operators/compat/lstm.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..889911a8408cb0f9c3b48b856900383194d1c884 --- /dev/null +++ b/paddle/fluid/operators/compat/lstm.pbtxt @@ -0,0 +1,72 @@ +type: "lstm" +def { + inputs { + name: "Input" + } + inputs { + name: "H0" + } + inputs { + name: "C0" + } + inputs { + name: "Weight" + } + inputs { + name: "Bias" + } + outputs { + name: "Hidden" + } + outputs { + name: "Cell" + } + outputs { + name: "BatchGate" + } + outputs { + name: "BatchCellPreAct" + } + attrs { + name: "use_peepholes" + type: BOOLEAN + } + attrs { + name: "is_reverse" + type: BOOLEAN + } + attrs { + name: "gate_activation" + type: STRING + } + attrs { + name: "cell_activation" + type: STRING + } + attrs { + name: "candidate_activation" + type: STRING + } +} +extra { + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/matmul.pbtxt b/paddle/fluid/operators/compat/matmul.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..e68a7f31b663403fba6ddfee9e7584b5e2372bad --- /dev/null +++ b/paddle/fluid/operators/compat/matmul.pbtxt @@ -0,0 +1,98 @@ +type: "matmul" +def { + inputs { + name: "X" + } + inputs { + name: "Y" + } + outputs { + name: "Out" + } + attrs { + name: "alpha" + type: FLOAT + } + attrs { + name: "transpose_X" + type: BOOLEAN + } + attrs { + name: "transpose_Y" + type: BOOLEAN + } +} +extra { + attrs { + name: "Scale_out" + type: FLOAT + } + 
attrs { + name: "Scale_x" + type: FLOAT + } + attrs { + name: "Scale_y" + type: FLOAT + } + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "mkldnn_data_type" + type: STRING + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } + attrs { + name: "use_quantizer" + type: BOOLEAN + } + attrs { + name: "force_fp32_output" + type: BOOLEAN + } + attrs { + name: "fused_reshape_Out" + type: INTS + } + attrs { + name: "fused_reshape_X" + type: INTS + } + attrs { + name: "fused_reshape_Y" + type: INTS + } + attrs { + name: "fused_transpose_Out" + type: INTS + } + attrs { + name: "fused_transpose_X" + type: INTS + } + attrs { + name: "fused_transpose_Y" + type: INTS + } +} diff --git a/paddle/fluid/operators/compat/matmul_v2.pbtxt b/paddle/fluid/operators/compat/matmul_v2.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..5f43e1f8bf0e0c502566a2cc783b8927e5df56cc --- /dev/null +++ b/paddle/fluid/operators/compat/matmul_v2.pbtxt @@ -0,0 +1,42 @@ +type: "matmul_v2" +def { + inputs { + name: "X" + } + inputs { + name: "Y" + } + outputs { + name: "Out" + } + attrs { + name: "trans_x" + type: BOOLEAN + } + attrs { + name: "trans_y" + type: BOOLEAN + } +} +extra { + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/pool2d.pbtxt b/paddle/fluid/operators/compat/pool2d.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..1620d1ef1c649ab8a90307241ae8956b62ceee52 --- /dev/null +++ b/paddle/fluid/operators/compat/pool2d.pbtxt @@ -0,0 +1,92 @@ +type: "pool2d" +def { + inputs { + name: "X" + } + outputs { + name: "Out" + } + attrs { + name: "pooling_type" + type: STRING + } + attrs { + name: "ksize" + type: INTS + } + attrs { + name: "global_pooling" + type: BOOLEAN + } + attrs { + name: "strides" + type: INTS + } + attrs { + name: "paddings" + type: INTS + } + attrs { + name: "exclusive" + type: BOOLEAN + } + attrs { + name: "adaptive" + type: BOOLEAN + } + attrs { + name: "ceil_mode" + type: BOOLEAN + } + attrs { + name: "data_format" + type: STRING + } + attrs { + name: "padding_algorithm" + type: STRING + } +} +extra { + attrs { + name: "is_test" + type: BOOLEAN + } + attrs { + name: "use_cudnn" + type: BOOLEAN + } + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "use_quantizer" + type: BOOLEAN + } + attrs { + name: "mkldnn_data_type" + type: STRING + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} + diff --git a/paddle/fluid/operators/compat/reshape2.pbtxt b/paddle/fluid/operators/compat/reshape2.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..2ccc83305baca9a2979fcd37420abfd945a35123 --- /dev/null +++ b/paddle/fluid/operators/compat/reshape2.pbtxt @@ -0,0 +1,53 @@ +type: "reshape2" +def { + inputs { + name: "X" + } + outputs { + name: "Out" + } + attrs { + name: "shape" + type: INTS + } +} +extra { + inputs { + name: "Shape" + } + inputs { + name: "ShapeTensor" 
+ } + outputs { + name: "XShape" + } + attrs { + name: "use_quantizer" + type: BOOLEAN + } + attrs { + name: "mkldnn_data_type" + type: STRING + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} + diff --git a/paddle/fluid/operators/compat/scale.pbtxt b/paddle/fluid/operators/compat/scale.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..1331cd5cd77a62da26eb1f2b97e93669e4742c37 --- /dev/null +++ b/paddle/fluid/operators/compat/scale.pbtxt @@ -0,0 +1,43 @@ +type: "scale" +def { + inputs { + name: "X" + } + outputs { + name: "Out" + } + attrs { + name: "bias" + type: FLOAT + } + attrs { + name: "scale" + type: FLOAT + } + attrs { + name: "bias_after_scale" + type: BOOLEAN + } +} +extra { + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/softmax.pbtxt b/paddle/fluid/operators/compat/softmax.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..5cd155ed1c63a847fb354a3474fdfe05a4eb0333 --- /dev/null +++ b/paddle/fluid/operators/compat/softmax.pbtxt @@ -0,0 +1,55 @@ +type: "softmax" +def { + inputs { + name: "X" + } + outputs { + name: "Out" + } + attrs { + name: "axis" + type: INT + } + attrs { + name: "data_format" + type: STRING + } +} +extra { + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } + attrs { + name: "is_test" + type: BOOLEAN + } + attrs { + name: "mkldnn_data_type" + type: STRING + } + attrs { + name: "use_cudnn" + type: BOOLEAN + } + attrs { + name: "use_mkldnn" + type: BOOLEAN + } +} diff --git a/paddle/fluid/operators/compat/squeeze2.pbtxt b/paddle/fluid/operators/compat/squeeze2.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..160e6a7278649408f7c5245eb53029610407ebc0 --- /dev/null +++ b/paddle/fluid/operators/compat/squeeze2.pbtxt @@ -0,0 +1,38 @@ +type: "squeeze2" +def { + inputs { + name: "X" + } + outputs { + name: "Out" + } + outputs { + name: "XShape" + } + attrs { + name: "axes" + type: INTS + } +} +extra { + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/transpose.pdtxt b/paddle/fluid/operators/compat/transpose.pdtxt new file mode 100644 index 0000000000000000000000000000000000000000..97081e0afc29a823d2dc95f2be31311020da8203 --- /dev/null +++ b/paddle/fluid/operators/compat/transpose.pdtxt @@ -0,0 +1,52 @@ +type: "transpose" +def { + inputs { + name: "X" + } + outputs { + name: "Out" + } + attrs { + name: "axis" + type: INTS + } + attrs { + name: "data_format" + type: STRING + } +} +extra { + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "use_quantizer" + type: BOOLEAN + } + attrs { + name: "mkldnn_data_type" + type: STRING + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: 
"op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} + diff --git a/paddle/fluid/operators/compat/transpose2.pdtxt b/paddle/fluid/operators/compat/transpose2.pdtxt new file mode 100644 index 0000000000000000000000000000000000000000..34fad62a101e0de1bed8e671cb454396f865b421 --- /dev/null +++ b/paddle/fluid/operators/compat/transpose2.pdtxt @@ -0,0 +1,55 @@ +type: "transpose" +def { + inputs { + name: "X" + } + outputs { + name: "Out" + } + attrs { + name: "axis" + type: INTS + } + attrs { + name: "data_format" + type: STRING + } +} +extra { + outputs { + name: "XShape" + } + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "use_quantizer" + type: BOOLEAN + } + attrs { + name: "mkldnn_data_type" + type: STRING + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} + diff --git a/paddle/fluid/operators/compat/unsqueeze2.pbtxt b/paddle/fluid/operators/compat/unsqueeze2.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..ed3c32754a59f0a30ad4351bdf188d8ae7d68692 --- /dev/null +++ b/paddle/fluid/operators/compat/unsqueeze2.pbtxt @@ -0,0 +1,44 @@ +type: "unsqueeze2" +def { + inputs { + name: "X" + } + inputs { + name: "AxesTensor" + } + inputs { + name: "AxesTensorList" + } + outputs { + name: "Out" + } + outputs { + name: "XShape" + } + attrs { + name: "axes" + type: INTS + } +} +extra { + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/detection/yolo_box_op.cc b/paddle/fluid/operators/detection/yolo_box_op.cc index 6f2a3ca87623847f261f0111bdfd8c168bb24b0a..e6f6c2a39358fdc94b36bd1aa2afd2e5d0a495c6 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.cc +++ b/paddle/fluid/operators/detection/yolo_box_op.cc @@ -11,6 +11,7 @@ #include "paddle/fluid/operators/detection/yolo_box_op.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace operators { @@ -31,19 +32,44 @@ class YoloBoxOp : public framework::OperatorWithKernel { auto anchors = ctx->Attrs().Get>("anchors"); int anchor_num = anchors.size() / 2; auto class_num = ctx->Attrs().Get("class_num"); + auto iou_aware = ctx->Attrs().Get("iou_aware"); + auto iou_aware_factor = ctx->Attrs().Get("iou_aware_factor"); PADDLE_ENFORCE_EQ(dim_x.size(), 4, platform::errors::InvalidArgument( "Input(X) should be a 4-D tensor." "But received X dimension(%s)", dim_x.size())); - PADDLE_ENFORCE_EQ( - dim_x[1], anchor_num * (5 + class_num), - platform::errors::InvalidArgument( - "Input(X) dim[1] should be equal to (anchor_mask_number * (5 " - "+ class_num))." - "But received dim[1](%s) != (anchor_mask_number * " - "(5+class_num)(%s).", - dim_x[1], anchor_num * (5 + class_num))); + if (iou_aware) { + PADDLE_ENFORCE_EQ( + dim_x[1], anchor_num * (6 + class_num), + platform::errors::InvalidArgument( + "Input(X) dim[1] should be equal to (anchor_mask_number * (6 " + "+ class_num)) while iou_aware is true." 
+ "But received dim[1](%s) != (anchor_mask_number * " + "(6+class_num)(%s).", + dim_x[1], anchor_num * (6 + class_num))); + PADDLE_ENFORCE_GE( + iou_aware_factor, 0, + platform::errors::InvalidArgument( + "Attr(iou_aware_factor) should greater than or equal to 0." + "But received iou_aware_factor (%s)", + iou_aware_factor)); + PADDLE_ENFORCE_LE( + iou_aware_factor, 1, + platform::errors::InvalidArgument( + "Attr(iou_aware_factor) should less than or equal to 1." + "But received iou_aware_factor (%s)", + iou_aware_factor)); + } else { + PADDLE_ENFORCE_EQ( + dim_x[1], anchor_num * (5 + class_num), + platform::errors::InvalidArgument( + "Input(X) dim[1] should be equal to (anchor_mask_number * (5 " + "+ class_num))." + "But received dim[1](%s) != (anchor_mask_number * " + "(5+class_num)(%s).", + dim_x[1], anchor_num * (5 + class_num))); + } PADDLE_ENFORCE_EQ(dim_imgsize.size(), 2, platform::errors::InvalidArgument( "Input(ImgSize) should be a 2-D tensor." @@ -140,6 +166,10 @@ class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker { "Scale the center point of decoded bounding " "box. Default 1.0") .SetDefault(1.); + AddAttr("iou_aware", "Whether use iou aware. Default false.") + .SetDefault(false); + AddAttr("iou_aware_factor", "iou aware factor. Default 0.5.") + .SetDefault(0.5); AddComment(R"DOC( This operator generates YOLO detection boxes from output of YOLOv3 network. @@ -147,7 +177,8 @@ class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker { should be the same, H and W specify the grid size, each grid point predict given number boxes, this given number, which following will be represented as S, is specified by the number of anchors. In the second dimension(the channel - dimension), C should be equal to S * (5 + class_num), class_num is the object + dimension), C should be equal to S * (5 + class_num) if :attr:`iou_aware` is false, + otherwise C should be equal to S * (6 + class_num). class_num is the object category number of source dataset(such as 80 in coco dataset), so the second(channel) dimension, apart from 4 box location coordinates x, y, w, h, also includes confidence score of the box and class one-hot key of each anchor @@ -183,6 +214,15 @@ class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker { score_{pred} = score_{conf} * score_{class} $$ + where the confidence scores follow the formula bellow + + .. math:: + + score_{conf} = \begin{case} + obj, \text{if } iou_aware == flase \\ + obj^{1 - iou_aware_factor} * iou^{iou_aware_factor}, \text{otherwise} + \end{case} + )DOC"); } }; @@ -197,3 +237,12 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker); REGISTER_OP_CPU_KERNEL(yolo_box, ops::YoloBoxKernel, ops::YoloBoxKernel); + +REGISTER_OP_VERSION(yolo_box) + .AddCheckpoint( + R"ROC( + Upgrade yolo box to add new attribute [iou_aware, iou_aware_factor]. 
+ )ROC", + paddle::framework::compatible::OpVersionDesc() + .NewAttr("iou_aware", "Whether use iou aware", false) + .NewAttr("iou_aware_factor", "iou aware factor", 0.5f)); diff --git a/paddle/fluid/operators/detection/yolo_box_op.cu b/paddle/fluid/operators/detection/yolo_box_op.cu index 65dc73ef38323521590c9f5914ac13b321ef4469..83a0eb87d02dd549521b68a112c5d9eea6055159 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.cu +++ b/paddle/fluid/operators/detection/yolo_box_op.cu @@ -28,7 +28,8 @@ __global__ void KeYoloBoxFw(const T* input, const int* imgsize, T* boxes, const int w, const int an_num, const int class_num, const int box_num, int input_size_h, int input_size_w, bool clip_bbox, const float scale, - const float bias) { + const float bias, bool iou_aware, + const float iou_aware_factor) { int tid = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; T box[4]; @@ -43,23 +44,29 @@ __global__ void KeYoloBoxFw(const T* input, const int* imgsize, T* boxes, int img_height = imgsize[2 * i]; int img_width = imgsize[2 * i + 1]; - int obj_idx = - GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 4); + int obj_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 4, + iou_aware); T conf = sigmoid(input[obj_idx]); + if (iou_aware) { + int iou_idx = GetIoUIndex(i, j, k * w + l, an_num, an_stride, grid_num); + T iou = sigmoid(input[iou_idx]); + conf = pow(conf, static_cast(1. - iou_aware_factor)) * + pow(iou, static_cast(iou_aware_factor)); + } if (conf < conf_thresh) { continue; } - int box_idx = - GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 0); + int box_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 0, + iou_aware); GetYoloBox(box, input, anchors, l, k, j, h, w, input_size_h, input_size_w, box_idx, grid_num, img_height, img_width, scale, bias); box_idx = (i * box_num + j * grid_num + k * w + l) * 4; CalcDetectionBox(boxes, box, box_idx, img_height, img_width, clip_bbox); - int label_idx = - GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 5); + int label_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, + 5, iou_aware); int score_idx = (i * box_num + j * grid_num + k * w + l) * class_num; CalcLabelScore(scores, input, label_idx, score_idx, class_num, conf, grid_num); @@ -80,6 +87,8 @@ class YoloBoxOpCUDAKernel : public framework::OpKernel { float conf_thresh = ctx.Attr("conf_thresh"); int downsample_ratio = ctx.Attr("downsample_ratio"); bool clip_bbox = ctx.Attr("clip_bbox"); + bool iou_aware = ctx.Attr("iou_aware"); + float iou_aware_factor = ctx.Attr("iou_aware_factor"); float scale = ctx.Attr("scale_x_y"); float bias = -0.5 * (scale - 1.); @@ -111,11 +120,18 @@ class YoloBoxOpCUDAKernel : public framework::OpKernel { platform::GpuLaunchConfig config = platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), n * box_num); - KeYoloBoxFw<<<<>>( input_data, imgsize_data, boxes_data, scores_data, conf_thresh, anchors_data, n, h, w, an_num, class_num, box_num, input_size_h, - input_size_w, clip_bbox, scale, bias); + input_size_w, clip_bbox, scale, bias, iou_aware, iou_aware_factor); } }; diff --git a/paddle/fluid/operators/detection/yolo_box_op.h b/paddle/fluid/operators/detection/yolo_box_op.h index 1cfef142bca7327cb039412719b7c002beb53cab..e06c81052a0f42c9db4d96e49d2708e64e4f3137 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.h +++ b/paddle/fluid/operators/detection/yolo_box_op.h @@ -13,6 +13,7 @@ #include #include #include "paddle/fluid/framework/op_registry.h" 
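The iou-aware decoding used by both the CPU and CUDA yolo_box kernels blends the sigmoid-activated objectness and IoU predictions before the confidence threshold is applied. A NumPy sketch of that confidence computation (illustrative only; these function names are not Paddle API):

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def yolo_confidence(obj_logit, iou_logit, iou_aware, iou_aware_factor):
    conf = sigmoid(obj_logit)
    if iou_aware:
        iou = sigmoid(iou_logit)
        # obj^(1 - factor) * iou^factor, matching the kernel's pow() blend;
        # the extra IoU channel is why the channel dim grows from
        # anchor_num * (5 + class_num) to anchor_num * (6 + class_num).
        conf = conf ** (1.0 - iou_aware_factor) * iou ** iou_aware_factor
    return conf

print(yolo_confidence(0.8, 1.5, iou_aware=True, iou_aware_factor=0.5))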
+#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/hostdevice.h" namespace paddle { @@ -43,8 +44,19 @@ HOSTDEVICE inline void GetYoloBox(T* box, const T* x, const int* anchors, int i, HOSTDEVICE inline int GetEntryIndex(int batch, int an_idx, int hw_idx, int an_num, int an_stride, int stride, - int entry) { - return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx; + int entry, bool iou_aware) { + if (iou_aware) { + return (batch * an_num + an_idx) * an_stride + + (batch * an_num + an_num + entry) * stride + hw_idx; + } else { + return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx; + } +} + +HOSTDEVICE inline int GetIoUIndex(int batch, int an_idx, int hw_idx, int an_num, + int an_stride, int stride) { + return batch * an_num * an_stride + (batch * an_num + an_idx) * stride + + hw_idx; } template @@ -92,6 +104,8 @@ class YoloBoxKernel : public framework::OpKernel { float conf_thresh = ctx.Attr("conf_thresh"); int downsample_ratio = ctx.Attr("downsample_ratio"); bool clip_bbox = ctx.Attr("clip_bbox"); + bool iou_aware = ctx.Attr("iou_aware"); + float iou_aware_factor = ctx.Attr("iou_aware_factor"); float scale = ctx.Attr("scale_x_y"); float bias = -0.5 * (scale - 1.); @@ -127,15 +141,22 @@ class YoloBoxKernel : public framework::OpKernel { for (int j = 0; j < an_num; j++) { for (int k = 0; k < h; k++) { for (int l = 0; l < w; l++) { - int obj_idx = - GetEntryIndex(i, j, k * w + l, an_num, an_stride, stride, 4); + int obj_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, + stride, 4, iou_aware); T conf = sigmoid(input_data[obj_idx]); + if (iou_aware) { + int iou_idx = + GetIoUIndex(i, j, k * w + l, an_num, an_stride, stride); + T iou = sigmoid(input_data[iou_idx]); + conf = pow(conf, static_cast(1. 
- iou_aware_factor)) * + pow(iou, static_cast(iou_aware_factor)); + } if (conf < conf_thresh) { continue; } - int box_idx = - GetEntryIndex(i, j, k * w + l, an_num, an_stride, stride, 0); + int box_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, + stride, 0, iou_aware); GetYoloBox(box, input_data, anchors_data, l, k, j, h, w, input_size_h, input_size_w, box_idx, stride, img_height, img_width, scale, bias); @@ -143,8 +164,8 @@ class YoloBoxKernel : public framework::OpKernel { CalcDetectionBox(boxes_data, box, box_idx, img_height, img_width, clip_bbox); - int label_idx = - GetEntryIndex(i, j, k * w + l, an_num, an_stride, stride, 5); + int label_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, + stride, 5, iou_aware); int score_idx = (i * box_num + j * stride + k * w + l) * class_num; CalcLabelScore(scores_data, input_data, label_idx, score_idx, class_num, conf, stride); diff --git a/paddle/fluid/operators/layer_norm_op.cu b/paddle/fluid/operators/layer_norm_op.cu index 3656de3525d32cac814e4199089de56b40ea09d8..ea1bca8b4d58dfefa7b03b3c821faed2a175931e 100644 --- a/paddle/fluid/operators/layer_norm_op.cu +++ b/paddle/fluid/operators/layer_norm_op.cu @@ -209,6 +209,73 @@ __global__ void LayerNormForward(const T *x, const U *scale, const U *bias, } } +template +__global__ void LayerNormForwardFP16(const T *x, const U *scale, const U *bias, + T *y, U *mean, U *var, float epsilon, + int feature_size) { +#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) + using BlockReduce = cub::BlockReduce, BlockDim>; + __shared__ typename BlockReduce::TempStorage temp_storage; + __shared__ U mean_share; + __shared__ U var_share; + + int beg_idx = blockIdx.x * feature_size + threadIdx.x; + int end_idx = (blockIdx.x + 1) * feature_size; + + // Step 1: Reduce to calculate mean and var + U mean_val = 0; + U var_val = 0; + for (int i = beg_idx; i < end_idx; i += BlockDim) { + U tmp = static_cast(x[i]); + mean_val += tmp; + var_val += (tmp * tmp); + } + auto pair = BlockReduce(temp_storage) + .Reduce(PairForLayerNorm(mean_val, var_val), + PairForLayerNormAddFunctor()); + if (threadIdx.x == 0) { + auto tmp = pair.first_ / static_cast(feature_size); + mean[blockIdx.x] = mean_share = static_cast(tmp); + var[blockIdx.x] = var_share = + static_cast(pair.second_ / static_cast(feature_size) - tmp * tmp); + } + __syncthreads(); + + mean_val = mean_share; + U invvar = rsqrt_(var_share + static_cast(epsilon)); + + // Step 2: Calculate y + if (scale != nullptr) { + if (bias != nullptr) { + for (int i = beg_idx, j = threadIdx.x; i < end_idx; + i += BlockDim, j += BlockDim) { + y[i] = static_cast( + scale[j] * (static_cast(x[i]) - mean_val) * invvar + bias[j]); + } + } else { + for (int i = beg_idx, j = threadIdx.x; i < end_idx; + i += BlockDim, j += BlockDim) { + y[i] = static_cast(scale[j] * (static_cast(x[i]) - mean_val) * + invvar); + } + } + } else { // scale == nullptr + if (bias != nullptr) { + for (int i = beg_idx, j = threadIdx.x; i < end_idx; + i += BlockDim, j += BlockDim) { + y[i] = static_cast((static_cast(x[i]) - mean_val) * invvar + + bias[j]); + } + } else { + for (int i = beg_idx, j = threadIdx.x; i < end_idx; + i += BlockDim, j += BlockDim) { + y[i] = static_cast((static_cast(x[i]) - mean_val) * invvar); + } + } + } +#endif +} + template __inline__ __device__ void cuLoadAddStridedInputs( const int i1_block, const int thr_load_row_off, const int thr_load_col_off, @@ -872,6 +939,28 @@ void LayerNormDirectCUDAFunctor::operator()(gpuStream_t stream, } } +template <> +void 
LayerNormDirectCUDAFunctor::operator()( + gpuStream_t stream, const half *input, std::vector input_shape, + const half *bias, const half *scale, half *output, half *mean, + half *variance, int begin_norm_axis, float eps) { + const auto x_dims = framework::make_ddim(input_shape); + auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis); + int batch_size = static_cast(matrix_dim[0]); + int feature_size = static_cast(matrix_dim[1]); + switch (GetDesiredBlockDim(feature_size)) { + FIXED_BLOCK_DIM_CASE( + LayerNormForwardFP16<<>>( + input, scale, bias, output, mean, variance, eps, feature_size)); + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Product from begin_norm_axis to end in layer_norm must be larger " + "than 1")); + break; + } +} + template class LayerNormKernel : public framework::OpKernel { @@ -961,6 +1050,9 @@ class LayerNormGradKernel }; template class LayerNormDirectCUDAFunctor; +#ifdef TRT_PLUGIN_FP16_AVALIABLE +template class LayerNormDirectCUDAFunctor; +#endif #undef FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE_BASE #undef FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE diff --git a/paddle/fluid/operators/py_layer_op.cc b/paddle/fluid/operators/py_layer_op.cc index c2f68675beb6214134cd0f73a2ef40f674e4d935..ce6db633c9566e77a6b581fea45b781b75d60e17 100644 --- a/paddle/fluid/operators/py_layer_op.cc +++ b/paddle/fluid/operators/py_layer_op.cc @@ -62,13 +62,22 @@ void RunPyObject(py::object *py_object, for (size_t i = 0; i < result_tuple.size(); i++) { if ((*outs)[i] != nullptr) { if (Py_None != result_tuple[i].ptr()) { - try { - auto result_var = - result_tuple[i].cast>(); - *(*outs)[i] = result_var->Var(); - } catch (py::cast_error &) { + if (py::isinstance(result_tuple[i])) { + try { + auto result_var = + result_tuple[i].cast>(); + *(*outs)[i] = result_var->Var(); + } catch (py::cast_error &) { + PADDLE_THROW(platform::errors::InvalidArgument( + "The `PyLayer.backward` function returns invalid argument, " + "the `%s` type argument can not be cast into `Tensor`.", + result_tuple[i].ptr()->ob_type->tp_name)); + } + } else { PADDLE_THROW(platform::errors::InvalidArgument( - "The output of `PyLayer.backward` should be `Tensor`.")); + "The output of `PyLayer.backward` should be `Tensor`, but " + "received `%s`.", + result_tuple[i].ptr()->ob_type->tp_name)); } } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -94,13 +103,22 @@ void RunPyObject(py::object *py_object, } if ((*outs)[0] != nullptr) { if (Py_None != py_result.ptr()) { - try { - auto result_var = - py_result.cast>(); - *((*outs)[0]) = result_var->Var(); - } catch (py::cast_error &) { + if (py::isinstance(py_result)) { + try { + auto result_var = + py_result.cast>(); + *((*outs)[0]) = result_var->Var(); + } catch (py::cast_error &) { + PADDLE_THROW(platform::errors::InvalidArgument( + "The `PyLayer.backward` function returns invalid argument, the " + "`%s` type argument can not be cast into `Tensor`.", + py_result.ptr()->ob_type->tp_name)); + } + } else { PADDLE_THROW(platform::errors::InvalidArgument( - "The output of `PyLayer.backward` should be `Tensor`.")); + "The output of `PyLayer.backward` should be `Tensor`, but " + "received `%s`", + py_result.ptr()->ob_type->tp_name)); } } else { PADDLE_THROW(platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index f5d55791d86c68bf800b869ee2be981bd6ab63b5..17c84530b23e667d8da4bf18cf44a89d44b1b51e 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ 
b/paddle/fluid/operators/reader/buffered_reader.cc @@ -68,7 +68,6 @@ BufferedReader::BufferedReader( stream_ = platform::NpuStreamResourcePool::Instance().New(dev_idx); } #endif - is_same_place_ = false; cpu_buffer_.resize(buffer_size); cuda_buffer_.resize(buffer_size); npu_buffer_.resize(buffer_size); @@ -116,7 +115,7 @@ void BufferedReader::ReadAsync(size_t i) { std::vector cuda_pinned_ptrs; cuda_pinned_ptrs.reserve(cpu.size()); platform::RecordEvent record_event("BufferedReader:MemoryCopy"); - // NODE(chenwehiang): When we use CUDAPinned Memory, we need call + // NODE(chenweihang): When we use CUDAPinned Memory, we need call // cudaHostAlloc, that is a CUDA API, calling CUDA API need load // cuda lib into device, it will cost hundreds of MB of GPU memory. // If we don't set Device here, which will use CUDAPlace(0) default. @@ -126,18 +125,21 @@ void BufferedReader::ReadAsync(size_t i) { if (platform::is_cpu_place(cpu[i].place())) { cuda[i].Resize(cpu[i].dims()); cuda[i].set_layout(cpu[i].layout()); - cuda_pinned_ptrs.emplace_back( - cuda[i].mutable_data(cuda_pinned_place, cpu[i].type())); + cuda_pinned_ptrs[i] = + cuda[i].mutable_data(cuda_pinned_place, cpu[i].type()); auto size = cpu[i].numel() * paddle::framework::SizeOfType(cpu[i].type()); memory::Copy(cuda_pinned_place, cuda_pinned_ptrs[i], BOOST_GET_CONST(platform::CPUPlace, cpu[i].place()), cpu[i].data(), size); + cuda[i].set_lod(cpu[i].lod()); } else { - // we set same place flag & use cpu[i] directly - is_same_place_ = true; + // Here the cpu[i]'s place may be CUDAPlace, CUDAPinnedPlace, or + // others, we don't copy the memory of it to CUDAPinnedPlace, but + // we should share tensor data to cuda[i] + cuda[i].ShareDataWith(cpu[i]); } } } else { @@ -296,9 +298,9 @@ void BufferedReader::ReadNextImpl(std::vector *out) { return; } - if (platform::is_gpu_place(place_) && !is_same_place_) { + if (platform::is_gpu_place(place_)) { *out = std::move(cuda_buffer_[i]); - } else if (platform::is_npu_place(place_) && !is_same_place_) { + } else if (platform::is_npu_place(place_)) { *out = std::move(npu_buffer_[i]); } else { *out = std::move(cpu_buffer_[i]); diff --git a/paddle/fluid/operators/reader/buffered_reader.h b/paddle/fluid/operators/reader/buffered_reader.h index 9f7b0e753281eb2e6476bc931b454b3b15340c3c..5b4bbc7d62cd8f1cdb64b0454279dada2f1a0e69 100644 --- a/paddle/fluid/operators/reader/buffered_reader.h +++ b/paddle/fluid/operators/reader/buffered_reader.h @@ -67,7 +67,6 @@ class BufferedReader : public framework::DecoratedReader { // buffer, just read async and create futures as buffer size. However, to // malloc tensors every time is extremely slow. Here we store all data in // buffers and prevent alloc every time. 
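The reader change replaces the per-batch is_same_place_ flag with a per-tensor decision: CPU tensors are copied into CUDA-pinned memory, while tensors already placed elsewhere simply share their storage with the staging slot. A small Python sketch of that branch, using plain dicts as stand-ins for LoDTensor (hypothetical, for illustration only):

def stage_for_gpu(cpu_tensor, pinned_slot):
    """Per-tensor branch of BufferedReader::ReadAsync, sketched with dicts."""
    if cpu_tensor["place"] == "cpu":
        # Copy into CUDA-pinned memory so the later host-to-device copy can be async.
        pinned_slot["data"] = list(cpu_tensor["data"])
        pinned_slot["place"] = "cuda_pinned"
        pinned_slot["lod"] = cpu_tensor.get("lod")
    else:
        # CUDAPlace / CUDAPinnedPlace / other: share the storage instead of copying,
        # which is what ShareDataWith does in the C++ kernel.
        pinned_slot.update(cpu_tensor)
    return pinned_slot

slot = stage_for_gpu({"place": "cpu", "data": [1, 2, 3], "lod": []}, {})
print(slot["place"])  # cuda_pinned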
- bool is_same_place_; std::vector cpu_buffer_; std::vector cuda_buffer_; std::vector npu_buffer_; diff --git a/paddle/fluid/operators/roi_align_op.cu b/paddle/fluid/operators/roi_align_op.cu index d6ba399439d0238f12797cc2a0ab90389225b7af..934802f6a9e0e9eec1e6492595c336a5ce3bd927 100644 --- a/paddle/fluid/operators/roi_align_op.cu +++ b/paddle/fluid/operators/roi_align_op.cu @@ -124,8 +124,10 @@ __global__ void GPUROIAlignForward( T roi_width = roi_xmax - roi_xmin; T roi_height = roi_ymax - roi_ymin; - roi_width = max(roi_width, static_cast(1.)); - roi_height = max(roi_height, static_cast(1.)); + if (!continuous_coordinate) { + roi_width = max(roi_width, static_cast(1.)); + roi_height = max(roi_height, static_cast(1.)); + } T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); @@ -138,7 +140,7 @@ __global__ void GPUROIAlignForward( : ceil(roi_height / pooled_height); int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); - const T count = roi_bin_grid_h * roi_bin_grid_w; + const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); T output_val = 0; for (int iy = 0; iy < roi_bin_grid_h; iy++) { const T y = roi_ymin + ph * bin_size_h + @@ -180,9 +182,10 @@ __global__ void GPUROIAlignBackward( T roi_width = roi_xmax - roi_xmin; T roi_height = roi_ymax - roi_ymin; - roi_width = max(roi_width, static_cast(1.)); - roi_height = max(roi_height, static_cast(1.)); - + if (!continuous_coordinate) { + roi_width = max(roi_width, static_cast(1.)); + roi_height = max(roi_height, static_cast(1.)); + } T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); diff --git a/paddle/fluid/operators/roi_align_op.h b/paddle/fluid/operators/roi_align_op.h index 46564ed4f629d80a2ab1706b512598cf8dbe4a27..29c9268d5241cce8bfaad6a96950933f1b7a3280 100644 --- a/paddle/fluid/operators/roi_align_op.h +++ b/paddle/fluid/operators/roi_align_op.h @@ -226,8 +226,10 @@ class CPUROIAlignOpKernel : public framework::OpKernel { T roi_width = roi_xmax - roi_xmin; T roi_height = roi_ymax - roi_ymin; - roi_width = std::max(roi_width, static_cast(1.)); - roi_height = std::max(roi_height, static_cast(1.)); + if (!aligned) { + roi_width = std::max(roi_width, static_cast(1.)); + roi_height = std::max(roi_height, static_cast(1.)); + } T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); @@ -239,7 +241,7 @@ class CPUROIAlignOpKernel : public framework::OpKernel { int roi_bin_grid_w = (sampling_ratio > 0) ? 
sampling_ratio : ceil(roi_width / pooled_width); - const T count = roi_bin_grid_h * roi_bin_grid_w; + const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); Tensor pre_pos; Tensor pre_w; int pre_size = count * out_stride[1]; @@ -362,6 +364,10 @@ class CPUROIAlignGradOpKernel : public framework::OpKernel { T roi_height = roi_ymax - roi_ymin; roi_width = std::max(roi_width, static_cast(1.)); roi_height = std::max(roi_height, static_cast(1.)); + if (!aligned) { + roi_width = std::max(roi_width, static_cast(1.)); + roi_height = std::max(roi_height, static_cast(1.)); + } T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); diff --git a/paddle/fluid/operators/set_value_op.h b/paddle/fluid/operators/set_value_op.h index eca51147f8159e1bcb7c0c88ca7760e4f62e5543..c7b61333cdab3d2cadf8bf6af1b3e4b2df5ed6f0 100644 --- a/paddle/fluid/operators/set_value_op.h +++ b/paddle/fluid/operators/set_value_op.h @@ -23,6 +23,7 @@ #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/assign_value_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" +#include "paddle/fluid/operators/slice_utils.h" #include "paddle/fluid/operators/utils.h" #include "paddle/fluid/platform/enforce.h" @@ -59,106 +60,6 @@ inline std::string GetValueName(framework::proto::VarType::Type data_type) { return value_name; } -inline void CheckAndUpdateSlice(const framework::DDim in_dims, - const std::vector axes, - std::vector* starts, - std::vector* ends, - std::vector* steps) { - for (size_t i = 0; i < axes.size(); ++i) { - int64_t axis = axes[i]; - int64_t dim_value = in_dims[axis]; - - int64_t start = - (*starts)[i] < 0 ? ((*starts)[i] + dim_value) : (*starts)[i]; - int64_t end = (*ends)[i] < 0 ? ((*ends)[i] + dim_value) : (*ends)[i]; - start = std::max(start, static_cast(0)); - end = std::min(end, dim_value); - - int64_t step = (*steps)[i]; - PADDLE_ENFORCE_NE( - step, 0, platform::errors::InvalidArgument( - "Step should not be 0, but received step = %d.", step)); - if (step > 0) { - start = std::min(start, dim_value); - end = std::max(end, static_cast(0)); - PADDLE_ENFORCE_GT( - end, start, - platform::errors::InvalidArgument( - "When step > 0, end should be greater than start, but " - "received end = %d, start = %d.", - end, start)); - } else { - // NOTE(liym27): When step < 0, start should less and equal to dim_value-1 - // "end is -1" means contain the 0-th element of this axis. - start = std::min(start, dim_value - 1); - end = std::max(end, static_cast(-1)); - PADDLE_ENFORCE_GT( - start, end, - platform::errors::InvalidArgument( - "When step < 0, start should be greater than end, but " - "received start = %d, end = %d.", - start, end)); - } - - (*starts)[i] = start; - (*ends)[i] = end; - } -} - -inline framework::DDim GetSliceDims(const framework::DDim in_dims, - const std::vector& axes, - const std::vector& starts, - const std::vector& ends, - const std::vector& steps) { - framework::DDim slice_dims(in_dims); - - for (size_t i = 0; i < axes.size(); ++i) { - int64_t axis = axes[i]; - int64_t start = starts[i]; - int64_t end = ends[i]; - int64_t step = steps[i]; - - if (step > 0) { - slice_dims[axis] = (end - start + step - 1) / step; - } else { - slice_dims[axis] = (end - start + step + 1) / step; - } - } - return slice_dims; -} - -inline framework::DDim GetDecreasedDims( - const framework::DDim slice_dims, - const std::vector& decrease_axes) { - // Get dims after decreasing axes. 
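The ROI Align fix above is easiest to see as scalar arithmetic: the width/height clamp to 1 now only applies in the legacy (non-aligned) mode, and the sampling-grid count is clamped to at least 1 so an aligned, zero-area ROI cannot divide by zero when the sampled values are averaged. A sketch of just that arithmetic (not Paddle code):

import math

def roi_align_params(roi_w, roi_h, pooled_h, pooled_w, sampling_ratio, aligned):
    if not aligned:
        # Legacy behaviour: every ROI spans at least one pixel.
        roi_w = max(roi_w, 1.0)
        roi_h = max(roi_h, 1.0)
    bin_h = roi_h / pooled_h
    bin_w = roi_w / pooled_w
    grid_h = sampling_ratio if sampling_ratio > 0 else math.ceil(roi_h / pooled_h)
    grid_w = sampling_ratio if sampling_ratio > 0 else math.ceil(roi_w / pooled_w)
    # max(..., 1) keeps the averaging denominator positive for degenerate ROIs.
    count = max(grid_h * grid_w, 1)
    return bin_h, bin_w, count

print(roi_align_params(0.0, 0.0, 7, 7, 0, aligned=True))  # count == 1, no div-by-zero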
- framework::DDim decreased_dims(slice_dims); - if (decrease_axes.size() > 0) { - for (size_t i = 0; i < decrease_axes.size(); ++i) { - int64_t axis = decrease_axes[i]; - PADDLE_ENFORCE_EQ( - decreased_dims[axis], 1, - platform::errors::InvalidArgument("decrease dim should be 1")); - decreased_dims[axis] = 0; - } - - std::vector new_shape; - for (int i = 0; i < decreased_dims.size(); ++i) { - if (decreased_dims[i] != 0) { - new_shape.push_back(decreased_dims[i]); - } - } - - // NOTE(liym27): Paddle does not support that the rank of Tensor is 0, and - // uses [1] instead. - if (new_shape.size() == 0) { - new_shape.push_back(1); - } - - decreased_dims = framework::make_ddim(new_shape); - } - return decreased_dims; -} - template class SetValueKernel : public framework::OpKernel { public: @@ -225,8 +126,8 @@ class SetValueKernel : public framework::OpKernel { } auto in_dims = in->dims(); - CheckAndUpdateSlice(in_dims, axes, &starts, &ends, &steps); - auto slice_dims = GetSliceDims(in_dims, axes, starts, ends, steps); + CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends, &steps); + auto slice_dims = GetSliceDims(in_dims, axes, starts, ends, &steps); auto decrease_slice_dims = GetDecreasedDims(slice_dims, decrease_axes); auto place = ctx.GetPlace(); diff --git a/paddle/fluid/operators/slice_op.cc b/paddle/fluid/operators/slice_op.cc index b5298979721642065ae75bcf98bb8b44435038a3..01daba7c072845e47cf5aa176a4b7e060ee2d942 100644 --- a/paddle/fluid/operators/slice_op.cc +++ b/paddle/fluid/operators/slice_op.cc @@ -28,13 +28,10 @@ class SliceOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("Input"), true, - platform::errors::InvalidArgument( - "Input (Input) of slice op should not be null.")); + OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "slice"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "slice"); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output (Out) of slice op should not be null.")); + // Case 1: Special treatment when input is a tensor array. auto x_var_type = ctx->GetInputsVarType("Input")[0]; auto axes = ctx->Attrs().Get>("axes"); if (x_var_type == framework::proto::VarType::LOD_TENSOR_ARRAY) { @@ -57,6 +54,8 @@ class SliceOp : public framework::OperatorWithKernel { return; } } + + // Case 2: input is a tensor. auto in_dims = ctx->GetInputDim("Input"); PADDLE_ENFORCE_LT(in_dims.size(), 7, platform::errors::InvalidArgument( @@ -65,101 +64,54 @@ class SliceOp : public framework::OperatorWithKernel { auto starts = ctx->Attrs().Get>("starts"); auto ends = ctx->Attrs().Get>("ends"); - auto infer_flags = ctx->Attrs().Get>("infer_flags"); auto decrease_axis = ctx->Attrs().Get>("decrease_axis"); - - auto starts_size = starts.size(); - auto ends_size = ends.size(); + auto infer_flags = ctx->Attrs().Get>("infer_flags"); if (infer_flags.empty()) { // Initialize infer_flags with 1. // To be compatible with other op tests in which infer_flags is not set. infer_flags = std::vector(axes.size(), 1); } + // 2.1 Check attrs. 
+ auto starts_size = starts.size(); + auto ends_size = ends.size(); + if (ctx->HasInputs("StartsTensorList")) { - auto StartsTensorList = ctx->Inputs("StartsTensorList"); - PADDLE_ENFORCE_GT(StartsTensorList.size(), 0, + starts_size = ctx->Inputs("StartsTensorList").size(); + PADDLE_ENFORCE_GT(starts_size, 0, platform::errors::InvalidArgument( "StartsTensorList size can't be zero")); - starts_size = StartsTensorList.size(); } if (ctx->HasInputs("EndsTensorList")) { - auto EndsTensorList = ctx->Inputs("EndsTensorList"); - PADDLE_ENFORCE_GT(EndsTensorList.size(), 0, - platform::errors::InvalidArgument( - "EndsTensorList size can't be zero")); - ends_size = EndsTensorList.size(); + ends_size = ctx->Inputs("EndsTensorList").size(); + PADDLE_ENFORCE_GT(ends_size, 0, platform::errors::InvalidArgument( + "EndsTensorList size can't be zero")); } - if (ctx->HasInput("StartsTensor") == false) { + if (!ctx->HasInput("StartsTensor")) { PADDLE_ENFORCE_EQ( starts_size, axes.size(), platform::errors::InvalidArgument( "The size of starts must be equal to the size of axes.")); } - if (ctx->HasInput("EndsTensor") == false) { + if (!ctx->HasInput("EndsTensor")) { PADDLE_ENFORCE_EQ( ends_size, axes.size(), platform::errors::InvalidArgument( "The size of ends must be equal to the size of axes.")); } - int dim_value, start, end; - for (size_t i = 0; i < axes.size(); ++i) { - PADDLE_ENFORCE_LT(static_cast(axes[i]), in_dims.size(), - platform::errors::InvalidArgument( - "The index of dimension in axes must be less " - "than the size of input shape.")); - if (infer_flags[i] == -1) { - out_dims[axes[i]] = -1; - } else { - // infer out_dim shape - dim_value = out_dims[axes[i]]; - if (dim_value > 0) { - start = starts[i] < 0 ? (starts[i] + dim_value) : starts[i]; - end = ends[i] < 0 ? 
(ends[i] + dim_value) : ends[i]; - start = std::max(start, 0); - end = std::max(end, 0); - end = std::min(end, dim_value); - - PADDLE_ENFORCE_LE(start, dim_value, - platform::errors::InvalidArgument( - "start should be less than or equal to the " - "dimension value, but received " - "start = %d, shape[%d] = %d.", - starts[i], axes[i], out_dims[axes[i]])); - PADDLE_ENFORCE_GT(end, start, - platform::errors::InvalidArgument( - "end should greater than start, but received " - "end = %d, start = %d.", - ends[i], starts[i])); - out_dims[axes[i]] = end - start; - } - } - } - // generate new shape - if (decrease_axis.size() > 0) { - std::vector new_out_shape; - for (size_t i = 0; i < decrease_axis.size(); ++i) { - if (ctx->IsRuntime() && infer_flags[i] != -1) { - PADDLE_ENFORCE_EQ( - out_dims[decrease_axis[i]], 1, - platform::errors::InvalidArgument("decrease dim should be 1")); - } - out_dims[decrease_axis[i]] = 0; - } + CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends, nullptr, + &infer_flags); - for (int i = 0; i < out_dims.size(); ++i) { - if (out_dims[i] != 0) { - new_out_shape.push_back(out_dims[i]); - } - } - if (new_out_shape.size() == 0) { - new_out_shape.push_back(1); - } - - out_dims = framework::make_ddim(new_out_shape); + auto slice_dims = + GetSliceDims(in_dims, axes, starts, ends, nullptr, &infer_flags); + if (ctx->IsRuntime()) { + out_dims = GetDecreasedDims(slice_dims, decrease_axis, &infer_flags); + } else { + out_dims = GetDecreasedDims(slice_dims, decrease_axis, nullptr); } + ctx->SetOutputDim("Out", out_dims); if (axes[0] != 0) { ctx->ShareLoD("Input", /*->*/ "Out"); @@ -185,6 +137,7 @@ class SliceOp : public framework::OperatorWithKernel { return framework::OpKernelType( OperatorWithKernel::IndicateVarDataType(ctx, "Input"), ctx.GetPlace()); } + framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, const Tensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { diff --git a/paddle/fluid/operators/slice_op.h b/paddle/fluid/operators/slice_op.h index 3d294ae238986c8cd7f18109871a559679553db0..96b8ea11d6845eb1b07cc05f1363ff34681d2071 100644 --- a/paddle/fluid/operators/slice_op.h +++ b/paddle/fluid/operators/slice_op.h @@ -19,21 +19,67 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/slice_utils.h" #include "paddle/fluid/operators/utils.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; +using Variable = framework::Variable; +using LoDTensorArray = framework::LoDTensorArray; +using DDim = framework::DDim; + +inline void DealTensorArray(const framework::ExecutionContext& ctx, + const std::vector& starts, + const std::vector& ends, + bool out_is_array) { + auto in_array = ctx.Input("Input"); + // If the input is LoDTensorArray, the rank of input is 1. + int64_t in_size = in_array->size(); + int64_t start = starts[0] < 0 ? (starts[0] + in_size) : starts[0]; + int64_t end = ends[0] < 0 ? (ends[0] + in_size) : ends[0]; + + start = std::max(start, static_cast(0)); + end = std::max(end, static_cast(0)); + end = std::min(end, in_size); + + PADDLE_ENFORCE_GT(end, start, + platform::errors::InvalidArgument( + "Attr(ends) should be greater than attr(starts) in " + "slice op. 
But received end = %d, start = %d.", + ends[0], starts[0])); + int64_t out_size = end - start; + + if (out_is_array) { + auto out_array = ctx.Output("Out"); + out_array->resize(out_size); + + for (int i = 0; i < out_size; ++i) { + auto* out_tensor = &out_array->at(i); + auto in_tensor = in_array->at(i + start); + out_tensor->set_lod(in_tensor.lod()); + if (in_tensor.memory_size() > 0) { + TensorCopy(in_tensor, ctx.GetPlace(), out_tensor); + } else { + VLOG(10) << "WARNING: The input tensor 'x_tensor' holds no memory, so " + "nothing has been written to output array[" + << i << "]."; + } + } + } else { + auto out = ctx.Output("Out"); + auto in_tensor = in_array->at(start); + TensorCopy(in_tensor, ctx.GetPlace(), out); + } +} template class SliceKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - const framework::Variable* input_var = ctx.InputVar("Input"); - bool is_tensor_array = input_var->IsType(); - int rank = is_tensor_array - ? 1 - : ctx.Input("Input")->dims().size(); + const Variable* input_var = ctx.InputVar("Input"); + bool is_tensor_array = input_var->IsType(); + int rank = is_tensor_array ? 1 : ctx.Input("Input")->dims().size(); switch (rank) { case 1: @@ -54,53 +100,45 @@ class SliceKernel : public framework::OpKernel { case 6: SliceCompute<6>(ctx); break; + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "The rank of input should be less than 7, but received %d.", rank)); } } private: template - void SliceCompute(const framework::ExecutionContext& context) const { - auto& place = - *context.template device_context().eigen_device(); - const framework::Variable* input_var = context.InputVar("Input"); - framework::Variable* out_var = context.OutputVar("Out"); - bool input_is_tensor_array = input_var->IsType(); - bool out_is_tensor_array = out_var->IsType(); - - auto axes = context.Attr>("axes"); - - auto starts_int = context.Attr>("starts"); + void SliceCompute(const framework::ExecutionContext& ctx) const { + const Variable* input_var = ctx.InputVar("Input"); + Variable* out_var = ctx.OutputVar("Out"); + bool input_is_array = input_var->IsType(); + bool out_is_array = out_var->IsType(); + + auto axes_int = ctx.Attr>("axes"); + auto starts_int = ctx.Attr>("starts"); + auto ends_int = ctx.Attr>("ends"); + std::vector axes(axes_int.begin(), axes_int.end()); std::vector starts(starts_int.begin(), starts_int.end()); - auto ends_int = context.Attr>("ends"); std::vector ends(ends_int.begin(), ends_int.end()); - auto decrease_axis = context.Attr>("decrease_axis"); - auto infer_flags = context.Attr>("infer_flags"); - auto list_new_ends_tensor = - context.MultiInput("EndsTensorList"); - auto list_new_starts_tensor = - context.MultiInput("StartsTensorList"); - - bool need_infer = false; - if (context.HasInput("StartsTensor") || context.HasInput("EndsTensor")) { - need_infer = true; - } - if (list_new_starts_tensor.size() > 0 || list_new_ends_tensor.size() > 0) { - need_infer = true; + + auto decrease_axis = ctx.Attr>("decrease_axis"); + auto infer_flags = ctx.Attr>("infer_flags"); + + // Step 1: Get the accurate attribute value of starts and ends + auto starts_tensor_list = ctx.MultiInput("StartsTensorList"); + if (ctx.HasInput("StartsTensor")) { + starts = GetDataFromTensor(ctx.Input("StartsTensor")); + } else if (starts_tensor_list.size() > 0) { + starts = GetDataFromTensorList(starts_tensor_list); } - if (need_infer) { - if (context.HasInput("StartsTensor")) { - auto* starts_tensor = 
context.Input("StartsTensor"); - starts = GetDataFromTensor(starts_tensor); - } else if (list_new_starts_tensor.size() > 0) { - starts = GetDataFromTensorList(list_new_starts_tensor); - } - if (context.HasInput("EndsTensor")) { - auto* ends_tensor = context.Input("EndsTensor"); - ends = GetDataFromTensor(ends_tensor); - } else if (list_new_ends_tensor.size() > 0) { - ends = GetDataFromTensorList(list_new_ends_tensor); - } + + auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); + if (ctx.HasInput("EndsTensor")) { + ends = GetDataFromTensor(ctx.Input("EndsTensor")); + } else if (ends_tensor_list.size() > 0) { + ends = GetDataFromTensorList(ends_tensor_list); } + PADDLE_ENFORCE_EQ( starts.size(), axes.size(), platform::errors::InvalidArgument( @@ -109,175 +147,74 @@ class SliceKernel : public framework::OpKernel { ends.size(), axes.size(), platform::errors::InvalidArgument( "The size of ends must be equal to the size of axes.")); - if (input_is_tensor_array) { - auto in_array = context.Input("Input"); - // If the input is LoDTensorArray, the rank of input is 1. - int64_t in_size = in_array->size(); - int64_t start = starts[0] < 0 ? (starts[0] + in_size) : starts[0]; - int64_t end = ends[0] < 0 ? (ends[0] + in_size) : ends[0]; - - start = std::max(start, static_cast(0)); - end = std::max(end, static_cast(0)); - end = std::min(end, in_size); - - PADDLE_ENFORCE_GT(end, start, - platform::errors::InvalidArgument( - "Attr(ends) should be greater than attr(starts) in " - "slice op. But received end = %d, start = %d.", - ends[0], starts[0])); - int64_t out_size = end - start; - - if (out_is_tensor_array) { - auto out_array = context.Output("Out"); - out_array->resize(out_size); - - for (int i = 0; i < out_size; ++i) { - auto* out_tensor = &out_array->at(i); - auto in_tensor = in_array->at(i + start); - out_tensor->set_lod(in_tensor.lod()); - if (in_tensor.memory_size() > 0) { - TensorCopy(in_tensor, context.GetPlace(), out_tensor); - } else { - VLOG(10) - << "WARNING: The input tensor 'x_tensor' holds no memory, so " - "nothing has been written to output array[" - << i << "]."; - } - } - } else { - auto out = context.Output("Out"); - auto in_tensor = in_array->at(start); - TensorCopy(in_tensor, context.GetPlace(), out); - } + // Step 2: Compute output + if (input_is_array) { + DealTensorArray(ctx, starts, ends, out_is_array); return; - } + } else { + auto in = ctx.Input("Input"); + auto out = ctx.Output("Out"); - auto in = context.Input("Input"); - auto out = context.Output("Out"); + auto in_dims = in->dims(); + auto out_dims = out->dims(); + auto slice_dims = out_dims; - auto out_dims = out->dims(); - auto in_dims = in->dims(); - if (need_infer) { - out_dims = in_dims; - int64_t dim_value, start, end; + // 2.1 Infer output dims for (size_t i = 0; i < axes.size(); ++i) { - dim_value = out_dims[axes[i]]; - if (dim_value > 0) { - // when end = start+1 and start == -1 - if (starts[i] == -1 && ends[i] == 0 && infer_flags[i] == -1) { - auto ret = - std::find(decrease_axis.begin(), decrease_axis.end(), axes[i]); - if (ret != decrease_axis.end()) { - ends[i] = 10000000; - } - } - - start = starts[i] < 0 ? (starts[i] + dim_value) : starts[i]; - end = ends[i] < 0 ? (ends[i] + dim_value) : ends[i]; - start = std::max(start, static_cast(0)); - end = std::max(end, static_cast(0)); - end = std::min(end, dim_value); - PADDLE_ENFORCE_GT( - end, start, - platform::errors::InvalidArgument( - "Attr(ends) should be greater than attr(starts) in " - "slice op. 
But received end = %d, start = %d.", - ends[i], starts[i])); - out_dims[axes[i]] = end - start; - } - } - out->Resize(out_dims); - // generate new shape - if (decrease_axis.size() > 0) { - std::vector new_out_shape; - for (size_t i = 0; i < decrease_axis.size(); ++i) { - PADDLE_ENFORCE_EQ( - out_dims[decrease_axis[i]], 1, - platform::errors::InvalidArgument("decrease dim should be 1")); - out_dims[decrease_axis[i]] = 0; - } - - for (int i = 0; i < out_dims.size(); ++i) { - if (out_dims[i] != 0) { - new_out_shape.push_back(out_dims[i]); + // when start == -1 && end == start+1 + if (starts[i] == -1 && ends[i] == 0 && infer_flags[i] == -1) { + auto ret = + std::find(decrease_axis.begin(), decrease_axis.end(), axes[i]); + if (ret != decrease_axis.end()) { + ends[i] = in_dims[axes[i]]; } } - if (new_out_shape.size() == 0) { - new_out_shape.push_back(1); - } - - out_dims = framework::make_ddim(new_out_shape); } - } - // resize out_dims - if (decrease_axis.size() > 0) { - if (decrease_axis.size() == (size_t)in_dims.size()) { - std::vector vec_origin_out_shape(decrease_axis.size(), 1); - out->Resize(framework::make_ddim(vec_origin_out_shape)); - } else { - std::vector vec_origin_out_shape( - out_dims.size() + decrease_axis.size(), -1); + CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends); + slice_dims = + GetSliceDims(in_dims, axes, starts, ends, nullptr, nullptr); + out_dims = GetDecreasedDims(slice_dims, decrease_axis); - for (size_t i = 0; i < decrease_axis.size(); ++i) { - vec_origin_out_shape[decrease_axis[i]] = 1; - } + // 2.2 Get output + auto offsets = Eigen::DSizes(); + auto extents = Eigen::DSizes(); - int index = 0; - for (size_t i = 0; i < vec_origin_out_shape.size(); ++i) { - if (vec_origin_out_shape[i] == -1) { - vec_origin_out_shape[i] = out_dims[index]; - ++index; - } - } - - out->Resize(framework::make_ddim(vec_origin_out_shape)); + for (size_t i = 0; i < D; ++i) { + offsets[i] = 0; + extents[i] = slice_dims[i]; } - } - - out->mutable_data(context.GetPlace()); - - auto new_out_dims = out->dims(); - auto offsets = Eigen::DSizes(); - auto extents = Eigen::DSizes(); - for (size_t i = 0; i < D; ++i) { - offsets[i] = 0; - extents[i] = new_out_dims[i]; - } - int64_t start; - for (size_t i = 0; i < axes.size(); ++i) { - start = starts[i]; - if (start < 0) { - start = (start + in_dims[axes[i]]); + for (size_t i = 0; i < axes.size(); ++i) { + offsets[axes[i]] = starts[i]; } - start = std::max(start, static_cast(0)); - offsets[axes[i]] = start; - } - auto in_t = - framework::EigenTensor::From( - *in); - auto out_t = - framework::EigenTensor::From( - *out, new_out_dims); - if (in->numel() <= Eigen::NumTraits::highest()) { - // similar to tf.slice: - // if element number less than INT_MAX, change the type of index to int - Eigen::DSizes offsets_32bit, extents_32bit; - for (size_t i = 0; i < D; i++) { - offsets_32bit[i] = offsets[i]; - extents_32bit[i] = extents[i]; + out->Resize(slice_dims); + out->mutable_data(ctx.GetPlace()); + + auto in_t = framework::EigenTensor::From(*in, in_dims); + auto out_t = framework::EigenTensor::From(*out, slice_dims); + auto& eigen_place = + *ctx.template device_context().eigen_device(); + + if (in->numel() <= Eigen::NumTraits::highest()) { + // similar to tf.slice: + // if element number less than INT_MAX, change the type of index to int + Eigen::DSizes offsets_32bit, extents_32bit; + for (size_t i = 0; i < D; i++) { + offsets_32bit[i] = offsets[i]; + extents_32bit[i] = extents[i]; + } + EigenSlice, T, D>::Eval( + eigen_place, 
framework::To32BitIndex(out_t), + framework::To32BitIndex(in_t), offsets_32bit, extents_32bit); + } else { + EigenSlice, T, D>::Eval( + eigen_place, out_t, in_t, offsets, extents); } - EigenSlice, T, D>::Eval( - place, framework::To32BitIndex(out_t), framework::To32BitIndex(in_t), - offsets_32bit, extents_32bit); - } else { - EigenSlice, T, D>::Eval(place, out_t, in_t, - offsets, extents); - } - out->Resize(out_dims); + out->Resize(out_dims); + } } }; @@ -285,11 +222,9 @@ template class SliceGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - const framework::Variable* input_var = ctx.InputVar("Input"); - bool is_tensor_array = input_var->IsType(); - size_t rank = is_tensor_array - ? 1 - : ctx.Input("Input")->dims().size(); + const Variable* input_var = ctx.InputVar("Input"); + bool is_array = input_var->IsType(); + size_t rank = is_array ? 1 : ctx.Input("Input")->dims().size(); switch (rank) { case 1: @@ -310,53 +245,48 @@ class SliceGradKernel : public framework::OpKernel { case 6: SliceCompute<6>(ctx); break; + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "The rank of input should be less than 7, but received %d.", rank)); } } private: template - void SliceCompute(const framework::ExecutionContext& context) const { - auto axes = context.Attr>("axes"); - - auto starts_int = context.Attr>("starts"); + void SliceCompute(const framework::ExecutionContext& ctx) const { + auto axes = ctx.Attr>("axes"); + auto starts_int = ctx.Attr>("starts"); + auto ends_int = ctx.Attr>("ends"); std::vector starts(starts_int.begin(), starts_int.end()); - - auto ends_int = context.Attr>("ends"); std::vector ends(ends_int.begin(), ends_int.end()); - auto list_new_ends_tensor = - context.MultiInput("EndsTensorList"); - auto list_new_starts_tensor = - context.MultiInput("StartsTensorList"); - - if (list_new_starts_tensor.size() > 0) { - starts = GetDataFromTensorList(list_new_starts_tensor); - } else if (context.HasInput("StartsTensor")) { - auto* starts_tensor = context.Input("StartsTensor"); - starts = GetDataFromTensor(starts_tensor); + // Get the accurate attribute value of starts and ends + auto starts_tensor_list = ctx.MultiInput("StartsTensorList"); + if (ctx.HasInput("StartsTensor")) { + starts = GetDataFromTensor(ctx.Input("StartsTensor")); + } else if (starts_tensor_list.size() > 0) { + starts = GetDataFromTensorList(starts_tensor_list); } - if (list_new_ends_tensor.size() > 0) { - ends = GetDataFromTensorList(list_new_ends_tensor); - } else if (context.HasInput("EndsTensor")) { - auto* ends_tensor = context.Input("EndsTensor"); - ends = GetDataFromTensor(ends_tensor); + auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); + if (ctx.HasInput("EndsTensor")) { + ends = GetDataFromTensor(ctx.Input("EndsTensor")); + } else if (ends_tensor_list.size() > 0) { + ends = GetDataFromTensorList(ends_tensor_list); } - framework::Variable* d_input_var = - context.OutputVar(framework::GradVarName("Input")); - const framework::Variable* d_out_var = - context.InputVar(framework::GradVarName("Out")); - bool d_input_is_tensor_array = - d_input_var->IsType(); - bool d_out_is_tensor_array = d_out_var->IsType(); - - if (d_input_is_tensor_array) { - auto* input_array = context.Input("Input"); - auto* d_input_array = context.Output( - framework::GradVarName("Input")); + + Variable* d_input_var = ctx.OutputVar(framework::GradVarName("Input")); + const Variable* d_out_var = ctx.InputVar(framework::GradVarName("Out")); + bool 
d_input_is_array = d_input_var->IsType(); + bool d_out_is_array = d_out_var->IsType(); + + if (d_input_is_array) { + auto* input_array = ctx.Input("Input"); + auto* d_in_arr = + ctx.Output(framework::GradVarName("Input")); int64_t d_in_size = input_array->size(); - d_input_array->resize(d_in_size); + d_in_arr->resize(d_in_size); // If the input is LoDTensorArray, the rank of input is 1. // So only use the 0th element of starts. int64_t start = starts[0] < 0 ? (starts[0] + d_in_size) : starts[0]; @@ -364,68 +294,60 @@ class SliceGradKernel : public framework::OpKernel { // set zero platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& dev_ctx = *pool.Get(context.GetPlace()); - T value = T(0); + auto& dev_ctx = *pool.Get(ctx.GetPlace()); math::SetConstant functor; for (int i = 0; i < d_in_size; ++i) { auto dim = input_array->at(i).dims(); - d_input_array->at(i).Resize(dim); - d_input_array->at(i).mutable_data(context.GetPlace()); + d_in_arr->at(i).Resize(dim); + d_in_arr->at(i).mutable_data(ctx.GetPlace()); functor(reinterpret_cast(dev_ctx), - &d_input_array->at(i), static_cast(value)); + &d_in_arr->at(i), static_cast(0)); } - if (d_out_is_tensor_array) { - auto* d_out_array = context.Input( - framework::GradVarName("Out")); - int d_out_size = d_out_array->size(); + if (d_out_is_array) { + auto* d_out_arr = + ctx.Input(framework::GradVarName("Out")); + int d_out_size = d_out_arr->size(); for (int i = 0; i < d_out_size; ++i) { - TensorCopy(d_out_array->at(i), context.GetPlace(), - &(d_input_array->at(start + i))); + TensorCopy(d_out_arr->at(i), ctx.GetPlace(), + &(d_in_arr->at(start + i))); } - } else { - auto* d_out = - context.Input(framework::GradVarName("Out")); - TensorCopy(*d_out, context.GetPlace(), &(d_input_array->at(start))); + auto* d_out = ctx.Input(framework::GradVarName("Out")); + TensorCopy(*d_out, ctx.GetPlace(), &(d_in_arr->at(start))); } return; } - auto* d_out = - context.Input(framework::GradVarName("Out")); - - auto* d_input = - context.Output(framework::GradVarName("Input")); - - d_input->mutable_data(context.GetPlace()); + auto* d_out = ctx.Input(framework::GradVarName("Out")); + auto* d_input = ctx.Output(framework::GradVarName("Input")); + d_input->mutable_data(ctx.GetPlace()); auto out_dims = d_out->dims(); auto in_dims = d_input->dims(); - auto decrease_axis = context.Attr>("decrease_axis"); - if (decrease_axis.size() > 0) { - if (decrease_axis.size() == (size_t)in_dims.size()) { + auto decrease_axis = ctx.Attr>("decrease_axis"); + auto decrease_size = decrease_axis.size(); + if (decrease_size > 0) { + if (decrease_size == (size_t)in_dims.size()) { // all dims decrease - std::vector vec_origin_out_shape(decrease_axis.size(), 1); - out_dims = framework::make_ddim(vec_origin_out_shape); + std::vector origin_out_shape(decrease_size, 1); + out_dims = framework::make_ddim(std::vector(decrease_size, 1)); } else { - std::vector vec_origin_out_shape( - out_dims.size() + decrease_axis.size(), -1); - - for (size_t i = 0; i < decrease_axis.size(); ++i) { - vec_origin_out_shape[decrease_axis[i]] = 1; + std::vector origin_out_shape(out_dims.size() + decrease_size, -1); + for (size_t i = 0; i < decrease_size; ++i) { + origin_out_shape[decrease_axis[i]] = 1; } int index = 0; - for (size_t i = 0; i < vec_origin_out_shape.size(); ++i) { - if (vec_origin_out_shape[i] == -1) { - vec_origin_out_shape[i] = out_dims[index]; + for (size_t i = 0; i < origin_out_shape.size(); ++i) { + if (origin_out_shape[i] == -1) { + origin_out_shape[i] = out_dims[index]; 
++index; } } - out_dims = framework::make_ddim(vec_origin_out_shape); + out_dims = framework::make_ddim(origin_out_shape); } } @@ -435,28 +357,26 @@ class SliceGradKernel : public framework::OpKernel { offsets[i] = 0; extents[i] = out_dims[i]; } - int64_t start; + for (size_t i = 0; i < axes.size(); ++i) { - start = starts[i]; - if (start < 0) { - start = (start + in_dims[axes[i]]); - } + int axis = axes[i]; + int64_t start = starts[i] < 0 ? (starts[i] + in_dims[axis]) : starts[i]; start = std::max(start, static_cast(0)); - offsets[axes[i]] = start; + offsets[axis] = start; } + Eigen::array, D> paddings; for (size_t i = 0; i < paddings.size(); ++i) { paddings[i].first = offsets[i]; paddings[i].second = (in_dims[i] - out_dims[i]) - offsets[i]; } - EigenPaddingCompute(context, d_input, in_dims, d_out, out_dims, paddings); + EigenPaddingCompute(ctx, d_input, in_dims, d_out, out_dims, paddings); } template void EigenPaddingCompute( - const framework::ExecutionContext& context, framework::Tensor* d_input, - const framework::DDim& in_dims, const framework::Tensor* d_out, - const framework::DDim& out_dims, + const framework::ExecutionContext& context, Tensor* d_input, + const DDim& in_dims, const Tensor* d_out, const DDim& out_dims, const Eigen::array, D>& paddings) const { if (D <= 3) { // if dimension less than 3, cannot reduce dimension @@ -512,10 +432,8 @@ class SliceGradKernel : public framework::OpKernel { out_tore_shape[1] = out_dims[pad_dim]; // convert array from std::vector to DDim - framework::DDim reshaped_in_dims = - framework::make_ddim(in_tore_shape); - framework::DDim reshaped_out_dims = - framework::make_ddim(out_tore_shape); + DDim reshaped_in_dims = framework::make_ddim(in_tore_shape); + DDim reshaped_out_dims = framework::make_ddim(out_tore_shape); // after reshape: the first dimension do not need padding, // set padding[0] zero @@ -543,10 +461,8 @@ class SliceGradKernel : public framework::OpKernel { } // convert array from std::vector to DDim - framework::DDim reshaped_in_dims = - framework::make_ddim(in_tore_shape); - framework::DDim reshaped_out_dims = - framework::make_ddim(out_tore_shape); + DDim reshaped_in_dims = framework::make_ddim(in_tore_shape); + DDim reshaped_out_dims = framework::make_ddim(out_tore_shape); // after reshape: // the first dimension is the previous padding dimension @@ -579,10 +495,8 @@ class SliceGradKernel : public framework::OpKernel { } // convert array from std::vector to DDim - framework::DDim reshaped_in_dims = - framework::make_ddim(in_tore_shape); - framework::DDim reshaped_out_dims = - framework::make_ddim(out_tore_shape); + DDim reshaped_in_dims = framework::make_ddim(in_tore_shape); + DDim reshaped_out_dims = framework::make_ddim(out_tore_shape); // after reshape: // the first dimension do not need padding, set padding[0] zero @@ -606,9 +520,8 @@ class SliceGradKernel : public framework::OpKernel { template void LaunchEigenPadding( - const framework::ExecutionContext& context, framework::Tensor* d_input, - const framework::DDim& in_dims, const framework::Tensor* d_out, - const framework::DDim& out_dims, + const framework::ExecutionContext& context, Tensor* d_input, + const DDim& in_dims, const Tensor* d_out, const DDim& out_dims, const Eigen::array, D>& paddings) const { auto& place = *context.template device_context().eigen_device(); diff --git a/paddle/fluid/operators/slice_utils.h b/paddle/fluid/operators/slice_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..60782a9a9248f8b07b2953f7cf54a1329b137687 --- 
/dev/null
+++ b/paddle/fluid/operators/slice_utils.h
@@ -0,0 +1,143 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include
+#include
+#include
+
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+
+template
+inline void CheckAndUpdateSliceAttrs(const framework::DDim in_dims,
+                                     const std::vector& axes,
+                                     std::vector* starts,
+                                     std::vector* ends,
+                                     std::vector* steps = nullptr,
+                                     std::vector* infer_flags = nullptr) {
+  for (size_t i = 0; i < axes.size(); ++i) {
+    T axis = axes[i];
+    T dim_value = in_dims[axis];
+
+    if (dim_value > 0) {
+      if (infer_flags != nullptr && (*infer_flags)[i] == -1) {
+        continue;
+      }
+      T start = (*starts)[i] < 0 ? ((*starts)[i] + dim_value) : (*starts)[i];
+      start = std::max(start, static_cast(0));
+
+      T end = (*ends)[i] < 0 ? ((*ends)[i] + dim_value) : (*ends)[i];
+      end = std::min(end, dim_value);
+
+      T step = steps == nullptr ? 1 : (*steps)[i];
+      PADDLE_ENFORCE_NE(
+          step, 0, platform::errors::InvalidArgument(
+                       "Step should not be 0, but received step = %d.", step));
+
+      if (step > 0) {
+        start = std::min(start, dim_value);
+        end = std::max(end, static_cast(0));
+        PADDLE_ENFORCE_GT(
+            end, start,
+            platform::errors::InvalidArgument(
+                "When step > 0, end should be greater than start, but "
+                "received end = %d, start = %d.",
+                end, start));
+      } else {
+        // NOTE(liym27): When step < 0, start should less and equal to
+        // dim_value-1
+        // "end is -1" means contain the 0-th element of this axis.
+        start = std::min(start, dim_value - 1);
+        end = std::max(end, static_cast(-1));
+        PADDLE_ENFORCE_GT(
+            start, end,
+            platform::errors::InvalidArgument(
+                "When step < 0, start should be greater than end, but "
+                "received start = %d, end = %d.",
+                start, end));
+      }
+
+      (*starts)[i] = start;
+      (*ends)[i] = end;
+    }
+  }
+}
+
+template
+inline framework::DDim GetSliceDims(const framework::DDim in_dims,
+                                    const std::vector& axes,
+                                    const std::vector& starts,
+                                    const std::vector& ends,
+                                    std::vector* steps = nullptr,
+                                    std::vector* infer_flags = nullptr) {
+  framework::DDim slice_dims(in_dims);
+
+  for (size_t i = 0; i < axes.size(); ++i) {
+    T axis = axes[i];
+    if (infer_flags != nullptr && (*infer_flags)[i] == -1) {
+      slice_dims[axis] = -1;
+      continue;
+    }
+
+    T start = starts[i];
+    T end = ends[i];
+    T step = steps == nullptr ?
1 : (*steps)[i];
+
+    if (step > 0) {
+      slice_dims[axis] = (end - start + step - 1) / step;
+    } else {
+      slice_dims[axis] = (end - start + step + 1) / step;
+    }
+  }
+  return slice_dims;
+}
+
+template
+inline framework::DDim GetDecreasedDims(const framework::DDim slice_dims,
+                                        const std::vector& decrease_axes,
+                                        std::vector* infer_flags = nullptr) {
+  framework::DDim decreased_dims(slice_dims);
+  if (decrease_axes.size() > 0) {
+    for (size_t i = 0; i < decrease_axes.size(); ++i) {
+      T axis = decrease_axes[i];
+      if (infer_flags && (*infer_flags)[i] != -1) {
+        PADDLE_ENFORCE_EQ(
+            decreased_dims[axis], 1,
+            platform::errors::InvalidArgument("decrease dim should be 1"));
+      }
+      decreased_dims[axis] = 0;
+    }
+
+    std::vector new_shape;
+    for (int i = 0; i < decreased_dims.size(); ++i) {
+      if (decreased_dims[i] != 0) {
+        new_shape.push_back(decreased_dims[i]);
+      }
+    }
+
+    // NOTE(liym27): Paddle does not support that the rank of Tensor is 0, and
+    // uses [1] instead.
+    if (new_shape.size() == 0) {
+      new_shape.push_back(1);
+    }
+
+    decreased_dims = framework::make_ddim(new_shape);
+  }
+  return decreased_dims;
+}
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/strided_slice_op.cc b/paddle/fluid/operators/strided_slice_op.cc
index d71be60e1f5c22dc0a43d1d035044c1b96c86c41..f8272d550b99917e0534d0c4223b7d54e6e450b2 100644
--- a/paddle/fluid/operators/strided_slice_op.cc
+++ b/paddle/fluid/operators/strided_slice_op.cc
@@ -324,6 +324,7 @@ REGISTER_OPERATOR(strided_slice_grad, ops::StridedSliceOpGrad,
 REGISTER_OP_CPU_KERNEL(
     strided_slice,
+    ops::StridedSliceKernel,
     ops::StridedSliceKernel,
     ops::StridedSliceKernel,
     ops::StridedSliceKernel,
@@ -335,6 +336,7 @@ REGISTER_OP_CPU_KERNEL(
 REGISTER_OP_CPU_KERNEL(
     strided_slice_grad,
+    ops::StridedSliceGradKernel,
     ops::StridedSliceGradKernel,
     ops::StridedSliceGradKernel,
     ops::StridedSliceGradKernel,
diff --git a/paddle/fluid/operators/strided_slice_op.cu b/paddle/fluid/operators/strided_slice_op.cu
index 68a8312f0818d418a820a742a9c4b832b5a8eb5b..f88605fbfc86dc30b16b4c0115eff2f6e9bbdc3b 100644
--- a/paddle/fluid/operators/strided_slice_op.cu
+++ b/paddle/fluid/operators/strided_slice_op.cu
@@ -18,6 +18,7 @@ limitations under the License.
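Illustrative note (a sketch only, not part of the patch): the three helpers introduced in slice_utils.h are meant to be composed in this order by slice-like ops, and the example values below are made up. A minimal sketch, written as it would appear inside paddle::operators and assuming the int64_t instantiation used by slice_op.h:

    // 2-D input of shape {4, 10}; slice axis 1 from index -8 (i.e. 2) to 6.
    framework::DDim in_dims = framework::make_ddim({4, 10});
    std::vector<int64_t> axes = {1};
    std::vector<int64_t> starts = {-8};
    std::vector<int64_t> ends = {6};

    // 1. Normalize negative / out-of-range starts and ends against in_dims.
    CheckAndUpdateSliceAttrs<int64_t>(in_dims, axes, &starts, &ends);
    // 2. Shape of the sliced tensor: {4, 4} (elements 2..5 of axis 1, step 1).
    auto slice_dims = GetSliceDims<int64_t>(in_dims, axes, starts, ends);
    // 3. Drop axes listed in decrease_axis (none here); rank never drops below 1.
    auto out_dims = GetDecreasedDims<int64_t>(slice_dims, {});

Keeping steps and infer_flags as optional pointers is what lets slice (steps == nullptr) and set_value (explicit steps) share the same normalization path, as the callers above show.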
*/ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( strided_slice, + ops::StridedSliceKernel, ops::StridedSliceKernel, ops::StridedSliceKernel, ops::StridedSliceKernel, @@ -29,7 +30,8 @@ REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL( strided_slice_grad, - ops::StridedSliceGradKernel, + ops::StridedSliceGradKernel, + ops::StridedSliceGradKernel, ops::StridedSliceGradKernel, ops::StridedSliceGradKernel, ops::StridedSliceGradKernel, diff --git a/paddle/fluid/operators/tril_triu_op.cc b/paddle/fluid/operators/tril_triu_op.cc index 8fb0b3809503ecc86e33796a4bc7f7cb2d21f8bb..3e943c62e1ce17857e78e140efeb50e627e80a4e 100644 --- a/paddle/fluid/operators/tril_triu_op.cc +++ b/paddle/fluid/operators/tril_triu_op.cc @@ -105,13 +105,15 @@ REGISTER_OPERATOR(tril_triu, ops::TrilTriuOp, ops::TrilTriuOpMaker, ops::TrilTriuGradOpMaker); REGISTER_OPERATOR(tril_triu_grad, ops::TrilTriuGradOp); REGISTER_OP_CPU_KERNEL( - tril_triu, ops::TrilTriuOpKernel, + tril_triu, ops::TrilTriuOpKernel, + ops::TrilTriuOpKernel, ops::TrilTriuOpKernel, ops::TrilTriuOpKernel, ops::TrilTriuOpKernel, ops::TrilTriuOpKernel); REGISTER_OP_CPU_KERNEL( tril_triu_grad, + ops::TrilTriuGradOpKernel, ops::TrilTriuGradOpKernel, ops::TrilTriuGradOpKernel, ops::TrilTriuGradOpKernel, diff --git a/paddle/fluid/operators/tril_triu_op.cu b/paddle/fluid/operators/tril_triu_op.cu index d04acd340597928ba0fbbbebf2dfc7eda1d698ac..9cbbdeeb2ce28453f2c22d063975fa82aae5d3b3 100644 --- a/paddle/fluid/operators/tril_triu_op.cu +++ b/paddle/fluid/operators/tril_triu_op.cu @@ -18,7 +18,7 @@ namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( - tril_triu, + tril_triu, ops::TrilTriuOpKernel, ops::TrilTriuOpKernel, ops::TrilTriuOpKernel, ops::TrilTriuOpKernel, @@ -26,6 +26,7 @@ REGISTER_OP_CUDA_KERNEL( ops::TrilTriuOpKernel); REGISTER_OP_CUDA_KERNEL( tril_triu_grad, + ops::TrilTriuGradOpKernel, ops::TrilTriuGradOpKernel, ops::TrilTriuGradOpKernel, ops::TrilTriuGradOpKernel, diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 7e983eb54ae2cdb44cf4ae5a949f0fac40ec4835..1179677fd6b9f57152cf7821f6fd088b8945c129 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -563,7 +563,7 @@ Place CUDAPinnedDeviceContext::GetPlace() const { return place_; } MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place) : CPUDeviceContext(place), p_blobmap_() { p_blobmap_.reset(new BlobMap()); - p_exec_items_.reset(new ExecMap()); + p_exec_items_.reset(new ExecShape()); p_mutex_.reset(new std::mutex()); } @@ -644,10 +644,15 @@ void MKLDNNDeviceContext::ResetBlobMap(void* ptr) { if (ptr == nullptr) { p_blobmap_->clear(); } else { - for (auto& v : (*p_exec_items_)[ptr]) { - (v.first)->erase(v.second); + // Iterate through all shapes and release + // for each shape and active executor all entries + // of this executor + for (auto& s : *p_exec_items_) { + for (auto& v : (*s.second)[ptr]) { + (v.first)->erase(v.second); + } + s.second->erase(ptr); } - p_exec_items_->erase(ptr); } } else { VLOG(3) << "Prevented Clearing DNNL cache."; @@ -655,11 +660,24 @@ void MKLDNNDeviceContext::ResetBlobMap(void* ptr) { } } +void MKLDNNDeviceContext::RemoveShapeEntriesWithExecutor(void) const { + p_exec_items_->erase(p_exec_items_->begin()); +} + void MKLDNNDeviceContext::LinkEntryWithExecutor(BlobPtr_t pblob, KeyBlob::iterator it) const { + // Take current input shape from TLS // Take current executor addess from TLS // and for this executor's items 
add the one defined with arguments - (*p_exec_items_)[tls().get_curr_exec()].push_back(std::make_pair(pblob, it)); + auto key_it = p_exec_items_ + ->insert(std::make_pair(tls().cur_input_shape_str, + std::make_shared())) + .first; + (*key_it->second)[tls().get_curr_exec()].push_back(std::make_pair(pblob, it)); + + VLOG(3) << "LinkEntryWithExecutor, shapes: " << p_exec_items_->size() + << " curr exec size: " + << (*key_it->second)[tls().get_curr_exec()].size() << "\n"; } void MKLDNNDeviceContext::BlockNextCacheClearing() { @@ -716,6 +734,7 @@ void MKLDNNDeviceContext::SetBlob(const std::string& name, VLOG(2) << "sid=" << sid << ", remove all blobs of shape: " << sBlob->begin()->first; sBlob->erase(sBlob->begin()->first); + RemoveShapeEntriesWithExecutor(); } pBlob = std::make_shared(); (*sBlob)[tls().cur_input_shape_str] = pBlob; @@ -739,7 +758,7 @@ void MKLDNNDeviceContext::SetBlob(const std::string& name, return; } -unsigned int MKLDNNDeviceContext::GetCachedObjectsNumber(void) { +unsigned int MKLDNNDeviceContext::GetCachedObjectsNumber(void) const { unsigned int num_entries = 0; for (auto const& l3 : *p_blobmap_) { for (auto const& l2 : *(l3.second)) { diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 8d9d1fd96f463c8e05e9c7e6ba7ed42672459bec..e2dbc90b5d1444b7f27ac00439a769ee3165a911 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -749,8 +749,14 @@ class MKLDNNDeviceContext : public CPUDeviceContext { using ShapeBlob = umap_key_string_t; using BlobMap = umap_value_smart_t; - using ExecMap = std::unordered_map< - void*, std::vector, KeyBlob::iterator>>>; + // Auxillary two-level structure (shape, executor) to easier control + // clearing cache objects related to specific executor + + using ExecKey = void*; + using ExecMapCacheIterPair = std::pair, KeyBlob::iterator>; + using ExecMap = + std::unordered_map>; + using ExecShape = std::unordered_map>; explicit MKLDNNDeviceContext(CPUPlace place); @@ -759,6 +765,7 @@ class MKLDNNDeviceContext : public CPUDeviceContext { // Register object to currently used executor's map void LinkEntryWithExecutor(BlobPtr_t, KeyBlob::iterator) const; + void RemoveShapeEntriesWithExecutor(void) const; // Remove all entries from the blob map void ResetBlobMap(void* ptr); @@ -773,7 +780,7 @@ class MKLDNNDeviceContext : public CPUDeviceContext { void SetBlob(const std::string& name, std::shared_ptr data) const; // Calculate number of oneDNN objects cached - unsigned int GetCachedObjectsNumber(void); + unsigned int GetCachedObjectsNumber(void) const; // Find a saved blob. 
Return nullptr if not found std::shared_ptr GetBlob(const std::string& name) const; @@ -786,7 +793,7 @@ class MKLDNNDeviceContext : public CPUDeviceContext { std::shared_ptr p_blobmap_; // Map key is pointer of executor and value is a data(iterator in map) needed // to erase - std::shared_ptr p_exec_items_; + std::shared_ptr p_exec_items_; std::shared_ptr p_mutex_; bool block_next_cache_clearing_ = false; }; diff --git a/paddle/fluid/platform/device_memory_aligment.cc b/paddle/fluid/platform/device_memory_aligment.cc index f8e031104415e848101d97d2f66217847630c923..185646e7327006ef649b356de246810d69860b44 100644 --- a/paddle/fluid/platform/device_memory_aligment.cc +++ b/paddle/fluid/platform/device_memory_aligment.cc @@ -26,9 +26,11 @@ size_t Alignment(size_t size, const platform::Place &place) { #elif defined(PADDLE_WITH_XPU) // TODO(wangxi): add XpuMinChunkSize alignment = alignment; +#elif defined(PADDLE_WITH_ASCEND_CL) + alignment = NPUMinChunkSize(); #else PADDLE_THROW(platform::errors::PreconditionNotMet( - "Fluid is not compiled with CUDA.")); + "Fluid is not compiled with CUDA or NPU.")); #endif } size_t remaining = size % alignment; diff --git a/paddle/fluid/platform/device_memory_aligment.h b/paddle/fluid/platform/device_memory_aligment.h index a151e434833587549e35c3ccfe1d8d8f43469a76..e0f2f0f11c9c3ff697821813bd3ba6ac2d6a36ec 100644 --- a/paddle/fluid/platform/device_memory_aligment.h +++ b/paddle/fluid/platform/device_memory_aligment.h @@ -19,6 +19,8 @@ limitations under the License. */ #include "paddle/fluid/platform/place.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/gpu_info.h" +#elif defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/npu_info.h" #endif namespace paddle { diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index d3890de89a5d140bfa09d04909f703f4ca771a05..c63ea3fa8573b8a7fd739931869c8f53259d8a77 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -775,13 +775,13 @@ inline std::string GetExternalErrorMsg(T status) { } } #else - char buf[100]; + char buf[512]; MEMORY_BASIC_INFORMATION mbi; HMODULE h_module = (::VirtualQuery(GetCurrentTraceBackString, &mbi, sizeof(mbi)) != 0) ? (HMODULE)mbi.AllocationBase : NULL; - GetModuleFileName(h_module, buf, 100); + GetModuleFileName(h_module, buf, 512); std::string strModule(buf); const size_t last_slash_idx = strModule.find_last_of("\\"); std::string compare_path = strModule.substr(strModule.length() - 7); diff --git a/paddle/fluid/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc index 842d4cc139281aab48131759f63003b3fe3890c2..95a852ad6e92a3ec2f8ecc08f5378ed91301f3c3 100644 --- a/paddle/fluid/platform/enforce_test.cc +++ b/paddle/fluid/platform/enforce_test.cc @@ -417,7 +417,7 @@ TEST(enforce, cuda_success) { "An unsupported value or parameter was passed to the function (a " "negative vector size, for example).To correct: ensure that all the " "parameters being passed have valid values")); - /* + #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) EXPECT_TRUE(CheckCudaStatusSuccess(ncclSuccess)); EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError, "NCCL error")); @@ -430,7 +430,6 @@ TEST(enforce, cuda_success) { "The call to NCCL is incorrect. 
This is " "usually reflecting a programming error")); #endif -*/ } #endif #endif diff --git a/paddle/fluid/platform/gpu_launch_config.h b/paddle/fluid/platform/gpu_launch_config.h index 6c265677d63e99c173b7fdce8de362dc9b381352..4da91b4e764a5285b005ebc459c4dfa4e52df9cd 100644 --- a/paddle/fluid/platform/gpu_launch_config.h +++ b/paddle/fluid/platform/gpu_launch_config.h @@ -37,6 +37,7 @@ struct GpuLaunchConfig { dim3 theory_thread_count = dim3(1, 1, 1); dim3 thread_per_block = dim3(1, 1, 1); dim3 block_per_grid = dim3(1, 1, 1); + int compute_capability = 0; }; inline GpuLaunchConfig GetGpuLaunchConfig1D( @@ -67,11 +68,14 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D( std::min(max_threads, context.GetMaxThreadsPerBlock()); const int block_count = std::min(DivUp(physical_thread_count, thread_per_block), sm); + // Get compute_capability + const int capability = context.GetComputeCapability(); GpuLaunchConfig config; config.theory_thread_count.x = theory_thread_count; config.thread_per_block.x = thread_per_block; config.block_per_grid.x = block_count; + config.compute_capability = capability; return config; } diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 5fcb1e30fbe677a8f87d3d5b3ad2228269e54f92..5e5475da89f9848c8a6087e45d61b0fa16d9579f 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -56,6 +56,7 @@ set(PYBIND_SRCS ir.cc inference_api.cc compatible.cc + io.cc generator_py.cc) if(WITH_ASCEND) diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index 8a5ad5852aedf5b157876c5d892d2ac4f42c022d..b2572e5aa4ba150c788ff2f0f728230f152aa76c 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -511,6 +511,7 @@ void BindAnalysisConfig(py::module *m) { py::arg("disable_trt_plugin_fp16") = false) .def("enable_tensorrt_oss", &AnalysisConfig::EnableTensorRtOSS) .def("tensorrt_oss_enabled", &AnalysisConfig::tensorrt_oss_enabled) + .def("exp_disable_tensorrt_ops", &AnalysisConfig::Exp_DisableTensorRtOPs) .def("enable_tensorrt_dla", &AnalysisConfig::EnableTensorRtDLA, py::arg("dla_core") = 0) .def("tensorrt_dla_enabled", &AnalysisConfig::tensorrt_dla_enabled) diff --git a/paddle/fluid/pybind/io.cc b/paddle/fluid/pybind/io.cc new file mode 100644 index 0000000000000000000000000000000000000000..fc49f76305461f2f99ebad8f1c4a6a34cb1e5382 --- /dev/null +++ b/paddle/fluid/pybind/io.cc @@ -0,0 +1,111 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/pybind/io.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/pybind/pybind_boost_headers.h" + +namespace py = pybind11; +namespace paddle { +namespace pybind { + +void BindIO(pybind11::module *m) { + m->def("save_lod_tensor", [](const paddle::framework::LoDTensor &tensor, + const std::string &str_file_name) { + std::ofstream fout(str_file_name, std::ios::binary); + PADDLE_ENFORCE_EQ(static_cast(fout), true, + platform::errors::Unavailable( + "Cannot open %s to save variables.", str_file_name)); + paddle::framework::SerializeToStream(fout, tensor); + + int64_t tellp = fout.tellp(); + fout.close(); + return tellp; + }); + + m->def("load_lod_tensor", [](paddle::framework::LoDTensor &tensor, + const std::string &str_file_name) { + std::ifstream fin(str_file_name, std::ios::binary); + PADDLE_ENFORCE_EQ(static_cast(fin), true, + platform::errors::Unavailable( + "Cannot open %s to load variables.", str_file_name)); + + paddle::framework::DeserializeFromStream(fin, &tensor); + int64_t tellg = fin.tellg(); + fin.close(); + return tellg; + }); + + m->def("save_selected_rows", + [](const paddle::framework::SelectedRows &selected_rows, + const std::string &str_file_name) { + std::ofstream fout(str_file_name, std::ios::binary); + PADDLE_ENFORCE_EQ( + static_cast(fout), true, + platform::errors::Unavailable( + "Cannot open %s to save SelectedRows.", str_file_name)); + + paddle::framework::SerializeToStream(fout, selected_rows); + int64_t tellp = fout.tellp(); + fout.close(); + return tellp; + }); + + m->def("load_selected_rows", + [](paddle::framework::SelectedRows &selected_rows, + const std::string &str_file_name) { + std::ifstream fin(str_file_name, std::ios::binary); + PADDLE_ENFORCE_EQ( + static_cast(fin), true, + platform::errors::Unavailable( + "Cannot open %s to load SelectedRows.", str_file_name)); + + paddle::framework::DeserializeFromStream(fin, &selected_rows); + int64_t tellg = fin.tellg(); + fin.close(); + return tellg; + }); + + m->def("save_lod_tensor_to_memory", + [](const paddle::framework::LoDTensor &tensor) -> py::bytes { + std::ostringstream ss; + paddle::framework::SerializeToStream(ss, tensor); + return ss.str(); + }); + + m->def("load_lod_tensor_from_memory", [](paddle::framework::LoDTensor &tensor, + const std::string &tensor_bytes) { + std::istringstream fin(tensor_bytes, std::ios::in | std::ios::binary); + paddle::framework::DeserializeFromStream(fin, &tensor); + }); + + m->def("save_selected_rows_to_memory", + [](const paddle::framework::SelectedRows &selected_rows) -> py::bytes { + std::ostringstream ss; + paddle::framework::SerializeToStream(ss, selected_rows); + return ss.str(); + }); + + m->def("load_selected_rows_from_memory", + [](paddle::framework::SelectedRows &selected_rows, + const std::string &selected_rows_bytes) { + std::istringstream fin(selected_rows_bytes, + std::ios::in | std::ios::binary); + paddle::framework::DeserializeFromStream(fin, &selected_rows); + }); +} +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/io.h b/paddle/fluid/pybind/io.h new file mode 100644 index 0000000000000000000000000000000000000000..dfe3154cb95da529536c0022fc82169d476f3913 --- /dev/null +++ b/paddle/fluid/pybind/io.h @@ -0,0 +1,24 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
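Illustrative note (a sketch only, not part of the patch): the new save_lod_tensor_to_memory / load_lod_tensor_from_memory bindings are thin wrappers over the same stream serialization used for files above. A minimal C++ round trip, assuming the includes already pulled in by io.cc and a small CPU tensor created just for the example:

    paddle::framework::LoDTensor src, dst;
    src.Resize(paddle::framework::make_ddim({2, 3}));
    src.mutable_data<float>(paddle::platform::CPUPlace());

    std::ostringstream os;
    paddle::framework::SerializeToStream(os, src);        // tensor -> bytes
    std::istringstream is(os.str(), std::ios::in | std::ios::binary);
    paddle::framework::DeserializeFromStream(is, &dst);   // bytes -> tensor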
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/pybind/pybind_boost_headers.h" + +namespace paddle { +namespace pybind { +void BindIO(pybind11::module* m); +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index bf3c77843219c75f9cf4a75f340eaa71f972991d..6278a23cea6440569dc7c52e6525d58d9075ef89 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -65,6 +65,7 @@ std::map> op_ins_map = { {"box_coder", {"PriorBox", "PriorBoxVar", "TargetBox"}}, {"momentum", {"Param", "Grad", "Velocity", "LearningRate"}}, {"rnn", {"Input", "PreState", "WeightList", "SequenceLength"}}, + {"run_program", {"X", "Params"}}, }; // NOTE(zhiqiu): Like op_ins_map. @@ -98,6 +99,7 @@ std::map> op_outs_map = { {"rnn", {"DropoutState", "Reserve", "Out", "State"}}, {"lamb", {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"}}, + {"run_program", {"DOut"}}, }; // NOTE(zhiqiu): Commonly, the outputs in auto-generated OP function are @@ -148,6 +150,7 @@ std::map> op_passing_outs_map = { {"lamb", {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"}}, {"rnn", {"DropoutState"}}, + {"run_program", {"Out", "DOut", "OutScope"}}, }; // NOTE(pangyoki): Tensor View Strategy. @@ -173,7 +176,7 @@ std::set inplace_op_duplicable_ins_set = { // clang-format off const char* OUT_INITIALIZER_TEMPLATE = - R"({"%s", {std::shared_ptr(new imperative::VarBase(tracer->GenerateUniqueName()))}})"; + R"({"%s", {std::shared_ptr(new imperative::VarBase("auto_"+std::to_string(VarBaseUniqueNameID++)+"_"))}})"; const char* OUT_DUPLICABLE_INITIALIZER_TEMPLATE = R"({"%s", ConstructDuplicableOutput(%s)})"; const char* INPUT_INITIALIZER_TEMPLATE = R"({"%s", {%s}})"; @@ -255,12 +258,11 @@ R"( ConstructAttrMapFromPyArgs("%s", %d, &attrs, args); { py::gil_scoped_release release; - auto tracer = imperative::GetCurrentTracer(); %s imperative::NameVarBaseMap outs = %s; imperative::NameVarBaseMap ins = %s; %s - tracer->TraceOp("%s", ins, outs, attrs, {%s}); + imperative::GetCurrentTracer()->TraceOp("%s", ins, outs, attrs, {%s}); return %s; } })"; @@ -585,7 +587,8 @@ int main(int argc, char* argv[]) { out << "namespace py = pybind11;" << "\n"; out << "namespace paddle {\n" - << "namespace pybind {\n"; + << "namespace pybind {\n\n"; + out << "std::atomic VarBaseUniqueNameID{0};\n"; out << paddle::string::join_strings(std::get<0>(op_funcs), '\n'); out << "\n\n"; diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 67c9b5df5ca74bc666ff90e0e6e939b8c13949df..830ca049beef555ffda81f92872fcbf83f75881e 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -69,6 +69,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/monitor.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/pybind/io.h" #ifdef PADDLE_WITH_ASCEND #include "paddle/fluid/pybind/ascend_wrapper_py.h" #endif @@ -498,70 +499,6 @@ PYBIND11_MODULE(core_noavx, m) { #endif return tensor; }); - m.def("_save_lod_tensor", [](const LoDTensor &tensor, - const std::string &str_file_name) { - std::ofstream fout(str_file_name, std::ios::binary); - PADDLE_ENFORCE_EQ(static_cast(fout), true, - platform::errors::Unavailable( - "Cannot open %s to save variables.", str_file_name)); - SerializeToStream(fout, tensor); - - int64_t tellp = fout.tellp(); - fout.close(); - return tellp; - }); - m.def("_load_lod_tensor", [](LoDTensor &tensor, - const std::string &str_file_name) { - std::ifstream fin(str_file_name, std::ios::binary); - PADDLE_ENFORCE_EQ(static_cast(fin), true, - platform::errors::Unavailable( - "Cannot open %s to load variables.", str_file_name)); - - DeserializeFromStream(fin, &tensor); - int64_t tellg = fin.tellg(); - fin.close(); - return tellg; - }); - m.def("_save_selected_rows", [](const SelectedRows &selected_rows, - const std::string &str_file_name) { - std::ofstream fout(str_file_name, std::ios::binary); - PADDLE_ENFORCE_EQ( - static_cast(fout), true, - platform::errors::Unavailable("Cannot open %s to save SelectedRows.", - str_file_name)); - - SerializeToStream(fout, selected_rows); - int64_t tellp = fout.tellp(); - fout.close(); - return tellp; - }); - m.def("_load_selected_rows", - [](SelectedRows &selected_rows, const std::string &str_file_name) { - std::ifstream fin(str_file_name, std::ios::binary); - PADDLE_ENFORCE_EQ( - static_cast(fin), true, - platform::errors::Unavailable( - "Cannot open %s to load SelectedRows.", str_file_name)); - - DeserializeFromStream(fin, &selected_rows); - int64_t tellg = fin.tellg(); - fin.close(); - return tellg; - }); - m.def("_save_static_dict", - [](const std::string &str_file_name, const py::handle &vec_var_list, - const Scope &scope) { - std::vector vec_name_list = GetNameList(vec_var_list); - SaveStaticNameListToDisk(str_file_name, vec_name_list, scope); - }); - - m.def("_load_static_dict", - [](const std::string &str_file_name, const py::handle &vec_var_list, - const Scope &scope, const Executor *executor) { - std::vector vec_name_list = GetNameList(vec_var_list); - CreateVariableIfNotExit(vec_var_list, scope, executor); - LoadStaticNameListFromDisk(str_file_name, vec_name_list, scope); - }); m.def("_create_loaded_parameter", [](const py::handle &vec_var_list, const Scope &scope, @@ -569,26 +506,6 @@ PYBIND11_MODULE(core_noavx, m) { CreateVariableIfNotExit(vec_var_list, scope, executor); }); - m.def("_save_dygraph_dict", [](const std::string &str_file_name, - const PyNameVarBaseMap &state_dict) { - auto vec_var_base_list = GetVarBaseList(state_dict); - - SaveDygraphVarBaseListToDisk(str_file_name, vec_var_base_list); - }); - - m.def("_load_dygraph_dict", [](const std::string &str_file_name) { - auto load_tensor = LoadDygraphVarBaseListFromDisk(str_file_name); - - std::unordered_map> - map_output; - - for (size_t i = 0; i < load_tensor.size(); ++i) { - map_output.emplace(load_tensor[i]->Name(), load_tensor[i]); - } - - return map_output; - }); - m.def("save_op_version_info", [](framework::ProgramDesc &desc) { framework::compatible::pb::OpVersionMap pb_vmap{desc.OpVersionMap()}; framework::compatible::SaveOpVersions( @@ -3154,6 +3071,7 @@ All parameter, weight, gradient are variables in Paddle. 
.def("device_count", &ParallelExecutor::DeviceCount); BindFleetWrapper(&m); + BindIO(&m); #ifdef PADDLE_WITH_PSLIB BindHeterWrapper(&m); diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index 8c323490cc964c5e3b69d6b512fdee22041d9803..c4a93f0d4a1e9f689b1510fe037e2d0e397e01d1 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -22,11 +22,12 @@ setlocal enabledelayedexpansion rem -------clean up environment----------- set work_dir=%cd% -set cache_dir=%work_dir:Paddle=cache% +if not defined cache_dir set cache_dir=%work_dir:Paddle=cache% if not exist %cache_dir%\tools ( git clone https://github.com/zhouwei25/tools.git %cache_dir%\tools ) taskkill /f /im cmake.exe 2>NUL +taskkill /f /im ninja.exe 2>NUL taskkill /f /im MSBuild.exe 2>NUL taskkill /f /im cl.exe 2>NUL taskkill /f /im lib.exe 2>NUL @@ -217,7 +218,8 @@ set CUDA_ARCH_NAME=All call :cmake || goto cmake_error call :build || goto build_error -call :zip_file || goto zip_file_error +call :zip_cc_file || goto zip_cc_file_error +call :zip_c_file || goto zip_c_file_error goto:success rem "Other configurations are added here" @@ -689,7 +691,7 @@ goto:eof exit /b 1 rem --------------------------------------------------------------------------------------------- -:zip_file +:zip_cc_file tree /F %cd%\paddle_inference_install_dir\paddle if exist paddle_inference.zip del paddle_inference.zip python -c "import shutil;shutil.make_archive('paddle_inference', 'zip', root_dir='paddle_inference_install_dir')" @@ -701,10 +703,27 @@ for /F %%i in ("%libsize%") do ( ) goto:eof -:zip_file_error +:zip_cc_file_error echo Tar inference library failed! exit /b 1 +rem --------------------------------------------------------------------------------------------- +:zip_c_file +tree /F %cd%\paddle_inference_c_install_dir\paddle +if exist paddle_inference_c.zip del paddle_inference_c.zip +python -c "import shutil;shutil.make_archive('paddle_inference_c', 'zip', root_dir='paddle_inference_c_install_dir')" +%cache_dir%\tools\busybox64.exe du -h -k paddle_inference_c.zip > lib_size.txt +set /p libsize=< lib_size.txt +for /F %%i in ("%libsize%") do ( + set /a libsize_m=%%i/1024 + echo "Windows Paddle_Inference CAPI ZIP Size: !libsize_m!M" +) +goto:eof + +:zip_c_file_error +echo Tar inference capi library failed! +exit /b 1 + :timestamp setlocal enabledelayedexpansion @ECHO OFF @@ -763,6 +782,7 @@ echo ======================================== echo Clean up environment at the end ... 
echo ======================================== taskkill /f /im cmake.exe 2>NUL +taskkill /f /im ninja.exe 2>NUL taskkill /f /im MSBuild.exe 2>NUL taskkill /f /im git.exe 2>NUL taskkill /f /im cl.exe 2>NUL diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 47187871cf4c800f391ebfca7cb0c29fd1c85909..96dc8c67969458041a93490d75ce4007f3ae9f33 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -426,6 +426,13 @@ EOF buildSize=$(du -h --max-depth=0 ${PADDLE_ROOT}/build/paddle_inference.tgz |awk '{print $1}') echo "Paddle_Inference Size: $buildSize" echo "ipipe_log_param_Paddle_Inference_Size: $buildSize" >> ${PADDLE_ROOT}/build/build_summary.txt + elif [ "$1" == "paddle_inference_c" ]; then + cd ${PADDLE_ROOT}/build + cp -r paddle_inference_c_install_dir paddle_inference_c + tar -czf paddle_inference_c.tgz paddle_inference_c + buildSize=$(du -h --max-depth=0 ${PADDLE_ROOT}/build/paddle_inference_c.tgz |awk '{print $1}') + echo "Paddle_Inference Capi Size: $buildSize" + echo "ipipe_log_param_Paddle_Inference_capi_Size: $buildSize" >> ${PADDLE_ROOT}/build/build_summary.txt else SYSTEM=`uname -s` if [ "$SYSTEM" == "Darwin" ]; then @@ -1941,6 +1948,7 @@ EOF echo "ipipe_log_param_Build_Time: $[ $endTime_s - $startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt build_size "paddle_inference" + build_size "paddle_inference_c" } function tar_fluid_lib() { @@ -2001,12 +2009,16 @@ function build_document_preview() { sh /paddle/tools/document_preview.sh ${PORT} } - -function example() { +# origin name: example +function exec_samplecode_test() { pip install ${PADDLE_ROOT}/build/python/dist/*.whl paddle version cd ${PADDLE_ROOT}/tools - python sampcd_processor.py cpu;example_error=$? + if [ "$1" = "cpu" ] ; then + python sampcd_processor.py cpu; example_error=$? + elif [ "$1" = "gpu" ] ; then + python sampcd_processor.py --threads=16 --full-test gpu; example_error=$? + fi if [ "$example_error" != "0" ];then echo "Code instance execution failed" >&2 exit 5 @@ -2119,7 +2131,7 @@ function main() { check_sequence_op_unittest generate_api_spec ${PYTHON_ABI:-""} "PR" set +e - example_info=$(example) + example_info=$(exec_samplecode_test cpu) example_code=$? summary_check_problems $check_style_code $example_code "$check_style_info" "$example_info" assert_api_spec_approvals @@ -2278,7 +2290,11 @@ function main() { build_document_preview ;; api_example) - example + example_info=$(exec_samplecode_test cpu) + example_code=$? 
+ check_style_code=0 + check_style_info= + summary_check_problems $check_style_code $example_code "$check_style_info" "$example_info" ;; test_op_benchmark) test_op_benchmark diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 7bac330376c44fb9632258b81ccb00255ab33a7c..8730ed955224c94daed3ae3ef25688bd99af99b8 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -21,8 +21,7 @@ except ImportError: import paddle from the source directory; please install paddlepaddle*.whl firstly.''' ) -import paddle.batch -batch = batch.batch +from .batch import batch # noqa: F401 from .fluid import monkey_patch_variable from .fluid.dygraph import monkey_patch_math_varbase monkey_patch_variable() @@ -72,6 +71,7 @@ from .tensor.attribute import real # noqa: F401 from .tensor.attribute import imag # noqa: F401 from .tensor.creation import to_tensor # noqa: F401 from .tensor.creation import diag # noqa: F401 +from .tensor.creation import diagflat # noqa: F401 from .tensor.creation import eye # noqa: F401 from .tensor.creation import linspace # noqa: F401 from .tensor.creation import ones # noqa: F401 @@ -135,7 +135,6 @@ from .tensor.manipulation import squeeze # noqa: F401 from .tensor.manipulation import squeeze_ # noqa: F401 from .tensor.manipulation import stack # noqa: F401 from .tensor.manipulation import strided_slice # noqa: F401 -from .tensor.manipulation import transpose # noqa: F401 from .tensor.manipulation import unique # noqa: F401 from .tensor.manipulation import unsqueeze # noqa: F401 from .tensor.manipulation import unsqueeze_ # noqa: F401 @@ -191,7 +190,6 @@ from .tensor.math import floor_mod # noqa: F401 from .tensor.math import multiply # noqa: F401 from .tensor.math import add # noqa: F401 from .tensor.math import subtract # noqa: F401 -from .tensor.math import atan # noqa: F401 from .tensor.math import logsumexp # noqa: F401 from .tensor.math import inverse # noqa: F401 from .tensor.math import log1p # noqa: F401 @@ -206,6 +204,8 @@ from .tensor.math import isnan # noqa: F401 from .tensor.math import prod # noqa: F401 from .tensor.math import broadcast_shape # noqa: F401 from .tensor.math import conj # noqa: F401 +from .tensor.math import neg # noqa: F401 +from .tensor.math import lgamma # noqa: F401 from .tensor.random import multinomial # noqa: F401 from .tensor.random import standard_normal # noqa: F401 @@ -244,9 +244,8 @@ from .framework import save # noqa: F401 from .framework import load # noqa: F401 from .framework import DataParallel # noqa: F401 -from .framework import set_default_dtype #DEFINE_ALIAS -from .framework import get_default_dtype #DEFINE_ALIAS -from .framework import set_grad_enabled #DEFINE_ALIAS +from .framework import set_default_dtype # noqa: F401 +from .framework import get_default_dtype # noqa: F401 from .tensor.search import index_sample # noqa: F401 from .tensor.stat import mean # noqa: F401 @@ -281,7 +280,7 @@ import paddle.vision # noqa: F401 from .tensor.random import check_shape # noqa: F401 disable_static() -__all__ = [ #noqa +__all__ = [ # noqa 'dtype', 'uint8', 'int8', @@ -301,6 +300,7 @@ __all__ = [ #noqa 'add', 'subtract', 'diag', + 'diagflat', 'isnan', 'scatter_nd_add', 'unstack', @@ -323,7 +323,6 @@ __all__ = [ #noqa 'cos', 'tan', 'mean', - 'XPUPlace', 'mv', 'in_dynamic_mode', 'min', @@ -360,7 +359,6 @@ __all__ = [ #noqa 'to_tensor', 'gather_nd', 'isinf', - 'set_device', 'uniform', 'floor_divide', 'remainder', @@ -384,8 +382,6 @@ __all__ = [ #noqa 'rand', 'less_equal', 'triu', - 'is_compiled_with_cuda', - 
'is_compiled_with_rocm', 'sin', 'dist', 'unbind', @@ -414,14 +410,14 @@ __all__ = [ #noqa 'bernoulli', 'summary', 'sinh', - 'is_compiled_with_xpu', - 'is_compiled_with_npu', 'round', 'DataParallel', 'argmin', 'prod', 'broadcast_shape', 'conj', + 'neg', + 'lgamma', 'square', 'divide', 'ceil', @@ -437,7 +433,6 @@ __all__ = [ #noqa 'not_equal', 'sum', 'tile', - 'get_device', 'greater_equal', 'isfinite', 'create_parameter', @@ -470,7 +465,6 @@ __all__ = [ #noqa 'scatter_nd', 'set_default_dtype', 'expand_as', - 'get_cudnn_version', 'stack', 'sqrt', 'cholesky', @@ -484,7 +478,6 @@ __all__ = [ #noqa 'logical_not', 'add_n', 'minimum', - 'ComplexTensor', 'scatter', 'scatter_', 'floor', @@ -493,5 +486,6 @@ __all__ = [ #noqa 'log2', 'log10', 'concat', - 'check_shape' + 'check_shape', + 'standard_normal' ] diff --git a/python/paddle/amp/__init__.py b/python/paddle/amp/__init__.py index 32587938512c44df82cfa7353ac45a6cc3094186..64992752b2e8d8fcc90308b3f61c6a55abd01bc5 100644 --- a/python/paddle/amp/__init__.py +++ b/python/paddle/amp/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .auto_cast import auto_cast -from .grad_scaler import GradScaler +from .auto_cast import auto_cast # noqa: F401 +from .grad_scaler import GradScaler # noqa: F401 __all__ = ['auto_cast', 'GradScaler'] diff --git a/python/paddle/amp/auto_cast.py b/python/paddle/amp/auto_cast.py index b83f81b27d1a0745c7a2f3339bc3939eb2f19490..974f718c2d4e2319c2f74783e285a4eb9365c80e 100644 --- a/python/paddle/amp/auto_cast.py +++ b/python/paddle/amp/auto_cast.py @@ -14,7 +14,7 @@ from paddle.fluid.dygraph.amp import amp_guard -__all__ = ['auto_cast'] +__all__ = [] def auto_cast(enable=True, custom_white_list=None, custom_black_list=None): diff --git a/python/paddle/amp/grad_scaler.py b/python/paddle/amp/grad_scaler.py index 72a67a92c495863aba62bdaa93811e59780ed846..770b660a9e11ff4fad06deec7b2f4bbbdf1964a8 100644 --- a/python/paddle/amp/grad_scaler.py +++ b/python/paddle/amp/grad_scaler.py @@ -14,7 +14,7 @@ from paddle.fluid.dygraph.amp import AmpScaler -__all__ = ['GradScaler'] +__all__ = [] class GradScaler(AmpScaler): diff --git a/python/paddle/autograd/__init__.py b/python/paddle/autograd/__init__.py index 71110e95817879fa55bcfa98293139a29b79997a..569619f065a051d071eb8be6b8d8f63049b20d2f 100644 --- a/python/paddle/autograd/__init__.py +++ b/python/paddle/autograd/__init__.py @@ -12,10 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -from ..fluid.dygraph.base import grad #DEFINE_ALIAS - -from . import backward_mode -from .backward_mode import backward -from .py_layer import PyLayer, PyLayerContext +from ..fluid.dygraph.base import grad # noqa: F401 +from . 
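# A minimal usage sketch (illustrative, not part of the patch) for the tensor APIs newly
# exported from the top-level paddle namespace in the __init__.py change above
# (diagflat, neg, lgamma); assumes an installed paddle build.
import paddle

x = paddle.to_tensor([1.0, 2.0, 3.0])
print(paddle.diagflat(x))   # 3x3 tensor with [1., 2., 3.] on the main diagonal
print(paddle.neg(x))        # [-1., -2., -3.]
print(paddle.lgamma(x))     # elementwise log|Gamma(x)|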
import backward_mode # noqa: F401 +from .backward_mode import backward # noqa: F401 +from .py_layer import PyLayer, PyLayerContext # noqa: F401 __all__ = ['grad', 'backward', 'PyLayer', 'PyLayerContext'] diff --git a/python/paddle/autograd/backward_mode.py b/python/paddle/autograd/backward_mode.py index 96e4336abaa6fa9ca5f23a56c551b8002c347888..6efbe777d537cadbe07a3bf21d807799e1227439 100644 --- a/python/paddle/autograd/backward_mode.py +++ b/python/paddle/autograd/backward_mode.py @@ -15,7 +15,7 @@ from paddle.fluid import core from paddle.fluid import framework import paddle -__all__ = ['backward'] +__all__ = [] @framework.dygraph_only diff --git a/python/paddle/autograd/py_layer.py b/python/paddle/autograd/py_layer.py index 35e2cd24391775c6e9144d555e68ab12295385b6..5a22d22151a1cd12b68fc3672faec965f399d5fd 100644 --- a/python/paddle/autograd/py_layer.py +++ b/python/paddle/autograd/py_layer.py @@ -15,7 +15,7 @@ import paddle from paddle.fluid.framework import dygraph_only from paddle.fluid import core -__all__ = ['PyLayer', 'PyLayerContext'] +__all__ = [] class PyLayerContext(object): diff --git a/python/paddle/batch.py b/python/paddle/batch.py index f6d2d8eb288744acab6d1c2f9d2a9db9a3087f58..f787f603f7e3ae0a6aa205596add48d192f54451 100644 --- a/python/paddle/batch.py +++ b/python/paddle/batch.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -__all__ = ['batch'] +__all__ = [] def batch(reader, batch_size, drop_last=False): @@ -35,11 +35,11 @@ def batch(reader, batch_size, drop_last=False): Examples: .. code-block:: python - import paddle.fluid as fluid + import paddle def reader(): for i in range(10): yield i - batch_reader = fluid.io.batch(reader, batch_size=2) + batch_reader = paddle.batch(reader, batch_size=2) for data in batch_reader(): print(data) @@ -60,7 +60,7 @@ def batch(reader, batch_size, drop_last=False): if len(b) == batch_size: yield b b = [] - if drop_last == False and len(b) != 0: + if drop_last is False and len(b) != 0: yield b # Batch size check diff --git a/python/paddle/compat.py b/python/paddle/compat.py index 7c753815c5ccd32cfa65668b71b22e19b381b2b6..886a787623ed18cd4b7a56f7ac6661f5f7bc6800 100644 --- a/python/paddle/compat.py +++ b/python/paddle/compat.py @@ -15,18 +15,11 @@ import six import math -__all__ = [ - 'long_type', - 'to_text', - 'to_bytes', - 'round', - 'floor_division', - 'get_exception_message', -] +__all__ = [] if six.PY2: int_type = int - long_type = long + long_type = long # noqa: F821 else: int_type = int long_type = int diff --git a/python/paddle/dataset/image.py b/python/paddle/dataset/image.py index c20672c2ce1577a2f992c682710a6e61da947b45..493a94e45d462a4669a1832a9def775da0234730 100644 --- a/python/paddle/dataset/image.py +++ b/python/paddle/dataset/image.py @@ -93,7 +93,7 @@ def batch_images_from_tar(data_file, :rtype: string """ batch_dir = data_file + "_batch" - out_path = "%s/%s" % (batch_dir, dataset_name) + out_path = "%s/%s_%s" % (batch_dir, dataset_name, os.getpid()) meta_file = "%s/%s.txt" % (batch_dir, dataset_name) if os.path.exists(out_path): diff --git a/python/paddle/device.py b/python/paddle/device.py index fce01d0d6751dca0076129ee5f9f4043b51ef09b..93e439ecf0aa420f178d646047a1b72c17189a65 100644 --- a/python/paddle/device.py +++ b/python/paddle/device.py @@ -18,21 +18,16 @@ import os from paddle.fluid import core from paddle.fluid import framework from paddle.fluid.dygraph.parallel import ParallelEnv -from paddle.fluid.framework import 
is_compiled_with_cuda #DEFINE_ALIAS -from paddle.fluid.framework import is_compiled_with_rocm #DEFINE_ALIAS +from paddle.fluid.framework import is_compiled_with_cuda # noqa: F401 +from paddle.fluid.framework import is_compiled_with_rocm # noqa: F401 -__all__ = [ + +__all__ = [ # npqa 'get_cudnn_version', 'set_device', 'get_device', 'XPUPlace', 'is_compiled_with_xpu', - # 'cpu_places', - # 'CPUPlace', - # 'cuda_pinned_places', - # 'cuda_places', - # 'CUDAPinnedPlace', - # 'CUDAPlace', 'is_compiled_with_cuda', 'is_compiled_with_rocm', 'is_compiled_with_npu' @@ -68,7 +63,7 @@ def is_compiled_with_xpu(): .. code-block:: python import paddle - support_xpu = paddle.device.is_compiled_with_xpu() + support_xpu = paddle.is_compiled_with_xpu() """ return core.is_compiled_with_xpu() @@ -82,9 +77,10 @@ def XPUPlace(dev_id): Examples: .. code-block:: python - + # required: xpu + import paddle - place = paddle.device.XPUPlace(0) + place = paddle.XPUPlace(0) """ return core.XPUPlace(dev_id) @@ -127,15 +123,13 @@ def _convert_to_place(device): place = core.CPUPlace() elif lower_device == 'gpu': if not core.is_compiled_with_cuda(): - raise ValueError( - "The device should not be 'gpu', " \ - "since PaddlePaddle is not compiled with CUDA") + raise ValueError("The device should not be 'gpu', " + "since PaddlePaddle is not compiled with CUDA") place = core.CUDAPlace(ParallelEnv().dev_id) elif lower_device == 'xpu': if not core.is_compiled_with_xpu(): - raise ValueError( - "The device should not be 'xpu', " \ - "since PaddlePaddle is not compiled with XPU") + raise ValueError("The device should not be 'xpu', " + "since PaddlePaddle is not compiled with XPU") selected_xpus = os.getenv("FLAGS_selected_xpus", "0").split(",") device_id = int(selected_xpus[0]) place = core.XPUPlace(device_id) @@ -149,7 +143,7 @@ def _convert_to_place(device): if avaliable_gpu_device: if not core.is_compiled_with_cuda(): raise ValueError( - "The device should not be {}, since PaddlePaddle is " \ + "The device should not be {}, since PaddlePaddle is " "not compiled with CUDA".format(avaliable_gpu_device)) device_info_list = device.split(':', 1) device_id = device_info_list[1] @@ -158,7 +152,7 @@ def _convert_to_place(device): if avaliable_xpu_device: if not core.is_compiled_with_xpu(): raise ValueError( - "The device should not be {}, since PaddlePaddle is " \ + "The device should not be {}, since PaddlePaddle is " "not compiled with XPU".format(avaliable_xpu_device)) device_info_list = device.split(':', 1) device_id = device_info_list[1] diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index e3b8d783b2ea5d7d555588edfda10dcb3d3115ff..0ffb1d9f881ba159cf6a96359ba4d1a7258103d7 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -894,8 +894,25 @@ def _mp_allreduce(tensor, "use_model_parallel", use_model_parallel) else: raise ValueError("Unknown parameter: {}.".format(op)) - else: - raise NotImplementedError("No support _mp_allreduce in dygraph mode.") + + op_type = 'c_allreduce_sum' + helper = LayerHelper(op_type, **locals()) + out = helper.create_variable_for_type_inference(dtype=tensor.dtype) + + check_variable_and_dtype( + tensor, 'tensor', ['float16', 'float32', 'float64', 'int32', 'int64'], + op_type) + + helper.append_op( + type=op_type, + inputs={'X': tensor}, + outputs={'Out': out}, + attrs={ + 'ring_id': ring_id, + 'use_calc_stream': use_calc_stream, + 'use_model_parallel': use_model_parallel, + }) + return out def 
_c_lookup_table(table, index, start_index=0, name=None): @@ -915,6 +932,19 @@ def _c_lookup_table(table, index, start_index=0, name=None): if in_dygraph_mode(): return core.ops.c_embedding(table, index, "start_index", start_index) + op_type = 'c_embedding' + helper = LayerHelper(op_type, **locals()) + dtype = helper.input_dtype(input_param_name='table') + check_variable_and_dtype(index, 'input', ['int32', 'int64'], op_type) + tmp = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type='c_embedding', + inputs={'Ids': index, + 'W': table}, + outputs={'Out': tmp}, + attrs={"start_index": start_index}) + return tmp + class _Linear(layers.Layer): """ @@ -954,6 +984,35 @@ class _Linear(layers.Layer): self.weight.shape[0], self.weight.shape[1], self._dtype, name_str) +def _c_softmax_with_cross_entropy(logits, + label, + group=None, + return_softmax=False): + if group is not None and not group.is_member(): + return + ring_id = 0 if group is None else group.id + global_rank = _get_global_env().rank + rank = global_rank if group is None else group.get_group_rank(global_rank) + nranks = _get_global_env().world_size if group is None else group.nranks + + input_dims = len(list(logits.shape)) + label_dims = len(list(label.shape)) + if input_dims - 1 != label_dims and input_dims != label_dims: + raise ValueError( + 'Expected nput_dims - 1 = label_dims or input_dims == label_dims\ + (got nput_dims{}, label_dims{})'.format(input_dims, label_dims)) + if input_dims - 1 == label_dims: + label = paddle.unsqueeze(label, axis=-1) + + if in_dygraph_mode(): + softmax, loss = core.ops.c_softmax_with_cross_entropy( + logits, label, 'ring_id', ring_id, 'rank', rank, 'nranks', nranks) + if not return_softmax: + return loss + else: + return loss, softmax + + def _linear(x, weight, bias=None, name=None): """ Fuction Linear @@ -1107,47 +1166,34 @@ def _parallel_embedding(x, return ring_id = 0 if group is None else group.id - origin_num_embeddings = origin_size[0] - embedding = paddle.nn.Embedding( - per_part_embeddings, - origin_size[1], - padding_idx=per_part_embeddings - 1, - sparse=False, - weight_attr=param_attr, - name=name) - - origin_input_shape = x.shape - if len(origin_input_shape) == 2: - x = paddle.unsqueeze(x, axis=-1) - else: - assert origin_input_shape[-1] == 1, ( - "The last dimension size of x must be 1.") - x_shard = paddle.shard_index(x, origin_num_embeddings, num_partitions, - inner_rank, per_part_embeddings - 1) - if len(origin_input_shape) == 2: - x_shard = paddle.squeeze(x_shard, axis=-1) - emb_out = embedding(x_shard) + helper = LayerHelper("_parallel_embedding", **locals()) + + per_part_size = per_part_embeddings + rank = inner_rank + + vocab_start_index = rank * per_part_size + dtype = helper.get_default_dtype() + size = [per_part_size, origin_size[1]] + + weight = helper.create_parameter( + attr=param_attr, shape=size, dtype=dtype, is_bias=False) + + if num_partitions == 1: + return paddle.nn.functional.embedding( + x, weight=weight, padding_idx=None, sparse=False, name=name) + startup_block = paddle.static.default_startup_program().global_block() main_block = paddle.static.default_main_program().global_block() - startup_block.vars[embedding.weight.name].is_distributed = True - main_block.vars[embedding.weight.name].is_distributed = True - out = main_block.create_var( - shape=emb_out.shape, - dtype=emb_out.dtype, - type=emb_out.type, - lod_level=emb_out.lod_level, - persistable=False, - is_data=False, - need_check_feed=emb_out.desc.need_check_feed()) - main_block.append_op( 
- type='c_allreduce_sum', - inputs={'X': emb_out}, - outputs={'Out': out}, - attrs={ - 'ring_id': ring_id, - 'use_calc_stream': True, - 'use_model_parallel': True - }) + startup_block.vars[weight.name].is_distributed = True + main_block.vars[weight.name].is_distributed = True + + output_parallel = paddle.distributed.collective._c_lookup_table( + weight, x, start_index=vocab_start_index, name=name) + out = paddle.distributed.collective._mp_allreduce( + output_parallel, + group=group, + use_calc_stream=True, + use_model_parallel=True) return out @@ -1259,11 +1305,11 @@ def split(x, if operation == "embedding": assert axis == 0, ("We only support to split the weight of embedding " "along the first axis now.") - per_part_size = (size[0] + num_partitions - 1) // num_partitions - last_part_size = size[0] - per_part_size * (num_partitions - 1) - if inner_rank == num_partitions - 1: per_part_size = last_part_size - per_part_size += 1 # make the last row as the padding index + assert size[0] % num_partitions == 0, \ + "The length of the vocabulary must be divisible by num_partitions " \ + "but received vocabulary={} num_partitions={}".format(size[0], num_partitions) + per_part_size = size[0] // num_partitions emb_out = _parallel_embedding( x, per_part_size, diff --git a/python/paddle/distributed/fleet/ascend_utils.py b/python/paddle/distributed/fleet/ascend_utils.py index 27437c50fad66a3438c2b6bab46e30631d2e93bd..2f6c210165ec15c0b73efd370a399b386b84f484 100644 --- a/python/paddle/distributed/fleet/ascend_utils.py +++ b/python/paddle/distributed/fleet/ascend_utils.py @@ -80,8 +80,9 @@ def _get_ascend_rankfile(rank_table_file_path): nodes = os.getenv("DLS_TASK_NUMBER", None) assert nodes is not None, "DLS_TASK_NUMBER didn't set!" for node in range(int(nodes)): - node_ip = os.getenv(f"VC_CUSTOM{node}_HOSTS", None) - assert node_ip is not None, f"VC_CUSTOM{node}_HOSTS didn't set!" 
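# A minimal numpy sketch (illustrative, not part of the patch) of the row-parallel embedding
# scheme the rewritten _parallel_embedding above implements: each rank owns a contiguous
# slice of the vocabulary, ids outside the local slice contribute zeros (c_embedding with
# start_index), and an allreduce-sum combines the partial lookups.
import numpy as np

vocab_size, hidden, num_parts = 8, 4, 2
table = np.arange(vocab_size * hidden, dtype=np.float32).reshape(vocab_size, hidden)
ids = np.array([0, 5, 3])

per_part = vocab_size // num_parts           # vocab must divide evenly, as asserted in split()
partials = []
for rank in range(num_parts):
    start = rank * per_part                  # this rank's "start_index"
    shard = table[start:start + per_part]
    local = np.zeros((len(ids), hidden), dtype=np.float32)
    mask = (ids >= start) & (ids < start + per_part)
    local[mask] = shard[ids[mask] - start]   # out-of-range ids stay zero
    partials.append(local)

out = sum(partials)                          # stands in for the mp allreduce-sum
assert np.allclose(out, table[ids])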
+ node_ip = os.getenv("VC_CUSTOM{}_HOSTS".format(node), None) + assert node_ip is not None, "VC_CUSTOM{}_HOSTS didn't set!".format( + node) node_ips.append(node_ip) return node_ips, device_count node_ips.append(server['server_id']) diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py index 0a989fe90f96a6f44659070658e2bc3c4fd8d5c9..e44a0e0459d31fdb0a56d02394cf4c66d480a499 100644 --- a/python/paddle/distributed/fleet/base/distributed_strategy.py +++ b/python/paddle/distributed/fleet/base/distributed_strategy.py @@ -14,7 +14,7 @@ import paddle from paddle.distributed.fleet.proto import distributed_strategy_pb2 -from paddle.fluid.framework import Variable, set_flags, core +from paddle.fluid.framework import Variable, set_flags, core, _global_flags from paddle.fluid.wrapped_decorator import wrap_decorator import google.protobuf.text_format import google.protobuf @@ -121,18 +121,18 @@ class DistributedStrategy(object): # Set the default values of the following flags to the ones set by users key = 'FLAGS_cudnn_batchnorm_spatial_persistent' - if core.globals().is_public(key): + if _global_flags().is_public(key): self.strategy.cudnn_batchnorm_spatial_persistent = bool( - core.globals()[key]) + _global_flags()[key]) key = 'FLAGS_conv_workspace_size_limit' - if core.globals().is_public(key): - self.strategy.conv_workspace_size_limit = int(core.globals()[key]) + if _global_flags().is_public(key): + self.strategy.conv_workspace_size_limit = int(_global_flags()[key]) key = 'FLAGS_cudnn_exhaustive_search' - if core.globals().is_public(key): - self.strategy.cudnn_exhaustive_search = bool(core.globals()[key]) + if _global_flags().is_public(key): + self.strategy.cudnn_exhaustive_search = bool(_global_flags()[key]) key = 'FLAGS_sync_nccl_allreduce' - if core.globals().is_public(key): - self.strategy.sync_nccl_allreduce = bool(core.globals()[key]) + if _global_flags().is_public(key): + self.strategy.sync_nccl_allreduce = bool(_global_flags()[key]) self.__lock_attr = True @@ -286,7 +286,7 @@ class DistributedStrategy(object): self.a_sync_configs = {"k_steps": 0} else: raise ValueError( - "The type of `flag` is invalid, expected type is bool, but received %s". + "The type of `flag` is invalid, expected type is bool, but received {}". format(type(flag))) @property @@ -853,6 +853,27 @@ class DistributedStrategy(object): "WARNING: without_graph_optimization should have value of bool type" ) + @property + def fuse_grad_size_in_num(self): + """ + This based on raw_program_optimizer program and allreduce the num of the fused op + Examples: + .. 
code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.fuse_grad_size_in_num = 2 + """ + return self.strategy.fuse_grad_size_in_num + + @fuse_grad_size_in_num.setter + @is_strict_auto + def fuse_grad_size_in_num(self, num): + if isinstance(num, int): + self.strategy.fuse_grad_size_in_num = num + else: + print( + "WARNING: fuse_grad_size_in_num should have value of int32 type") + @property def pipeline(self): """ @@ -1561,8 +1582,8 @@ class DistributedStrategy(object): ] for i, key in enumerate(keys): - if core.globals().is_public(key): - core.globals()[key] = values[i] + if _global_flags().is_public(key): + _global_flags()[key] = values[i] def _is_strict_auto(self): global non_auto_func_called diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py index c69b21538b61ad207db385afa99cbbc1448a2b71..ee5eb807fad7019cd2b7dff85170325a6d8230d1 100644 --- a/python/paddle/distributed/fleet/launch_utils.py +++ b/python/paddle/distributed/fleet/launch_utils.py @@ -83,7 +83,7 @@ class Cluster(object): def __ne__(self, cluster): return not self.__eq__(cluster) - def update_pods(cluster): + def update_pods(self, cluster): self.pods = copy.copy(cluster.pods) def trainers_nranks(self): @@ -195,7 +195,7 @@ class Pod(object): self.id != pod.id or \ self.addr != pod.addr or \ self.port != pod.port: - logger.debug("pod {} != pod".format(self, pod)) + logger.debug("pod {} != {}".format(self, pod)) return False if len(self.trainers) != len(pod.trainers): diff --git a/python/paddle/distributed/fleet/meta_optimizers/common.py b/python/paddle/distributed/fleet/meta_optimizers/common.py index 707284a784c38e5ac7b0f3b8248ca03b6c4506bb..9e891062bcbccbca4f34d8a2e211ca5f3ece44a3 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/common.py +++ b/python/paddle/distributed/fleet/meta_optimizers/common.py @@ -77,9 +77,12 @@ class CollectiveHelper(object): wait_port, global_ring_id=None, sync=True): - nranks = len(endpoints) - other_endpoints = endpoints[:] - other_endpoints.remove(current_endpoint) + # if current_endpoint is None, it means just for sync, + # no group is created. 
+ if current_endpoint: + nranks = len(endpoints) + other_endpoints = endpoints[:] + other_endpoints.remove(current_endpoint) if rank == 0 and wait_port: wait_server_ready(other_endpoints) @@ -117,6 +120,12 @@ class CollectiveHelper(object): attrs={OP_ROLE_KEY: OpRole.Forward}) block = program.global_block() + if current_endpoint is None: + assert endpoints is None + assert sync + _add_sync_by_allreduce(block) + return + if core.is_compiled_with_cuda(): comm_id_var = block.create_var( name=unique_name.generate('nccl_id'), diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py index c2d79a62c7663a01d5cd1e7ca9ac705612e1db03..bceabeee3c3dce9f355bb9a31a037a13cca4edd3 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py @@ -14,6 +14,7 @@ from __future__ import print_function import sys +import paddle from paddle.optimizer import Optimizer from paddle.fluid.clip import ClipGradByGlobalNorm from ...utils.hybrid_parallel_util import fused_allreduce_gradients @@ -22,6 +23,8 @@ from paddle.fluid.dygraph import base as imperative_base from paddle.fluid import framework from paddle.fluid.framework import Variable from ...utils.log_util import logger +from paddle.fluid import core +from paddle.fluid import layers __all__ = [] diff --git a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py index a0bf4cc5bc0975d7d3b88039d3a5603f28584a1a..481b90910def175838c4baedec9e25c9363bc943 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py @@ -138,6 +138,9 @@ class PipelineOptimizer(MetaOptimizerBase): first_node = pair[0] + start_index second_node = pair[1] + start_index if self.rank != first_node and self.rank != second_node: + collective_helper._init_communicator( + self.startup_program, None, None, None, None, False, + self.global_ring_id, True) continue pipeline_endpoints = [ self.endpoints[first_node], self.endpoints[second_node] diff --git a/python/paddle/distributed/fleet/meta_optimizers/raw_program_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/raw_program_optimizer.py index 243f6efe53185d504832ff3e5cd89b5322fc53e0..1333f794cc97e3c28d0aa5bad4f408b62384c270 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/raw_program_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/raw_program_optimizer.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -14,9 +14,12 @@ from __future__ import print_function from __future__ import division import os +import collections +import numpy as np import paddle.fluid as fluid from paddle.fluid import core, unique_name +from paddle.fluid.dygraph import Layer, LayerList from ..base.private_helper_function import wait_server_ready from .meta_optimizer_base import MetaOptimizerBase from .common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY, CollectiveHelper, is_loss_grad_op, is_backward_op, is_optimizer_op @@ -38,6 +41,9 @@ class RawProgramOptimizer(MetaOptimizerBase): super(RawProgramOptimizer, self)._set_basic_info( loss, role_maker, user_defined_optimizer, user_defined_strategy) self.without_graph_optimization = user_defined_strategy.without_graph_optimization + self.fuse_all_reduce_ops = user_defined_strategy.fuse_all_reduce_ops + if self.fuse_all_reduce_ops: + self.fuse_grad_size_in_num = user_defined_strategy.fuse_grad_size_in_num def _can_apply(self): if not self.role_maker._is_collective: @@ -113,7 +119,8 @@ class RawProgramOptimizer(MetaOptimizerBase): optimize_ops, params_grads = self.inner_opt.minimize( loss, startup_program, parameter_list, no_grad_set) - + if self.nranks == 1: + return optimize_ops, params_grads self._init_process_group() self.main_program = program @@ -123,7 +130,11 @@ class RawProgramOptimizer(MetaOptimizerBase): def _transpile_main_program(self, loss): self._insert_loss_grad_ops(loss) - self._insert_allreduce_ops() + if self.fuse_all_reduce_ops and core.is_compiled_with_npu(): + self._calc_stream = True + self._allreduce_fusion_program() + else: + self._insert_allreduce_ops() def _insert_loss_grad_ops(self, loss): """ @@ -194,3 +205,260 @@ class RawProgramOptimizer(MetaOptimizerBase): attrs={'ring_id': ring_id, OP_ROLE_KEY: OpRole.Backward}) break + + # TODO(Liu yuang): ADD CUDA allreduce_fusion fuction. + # This function helps reduce the input of allreduce by integrating can save communication time. 
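# A minimal configuration sketch (illustrative, not part of the patch) showing how the fused
# allreduce path added to RawProgramOptimizer is reached: the optimizer is selected via
# without_graph_optimization, and fusion is controlled by fuse_all_reduce_ops together with
# the new fuse_grad_size_in_num knob (the fused path is taken on NPU builds, per
# _transpile_main_program above).
import paddle.distributed.fleet as fleet

strategy = fleet.DistributedStrategy()
strategy.without_graph_optimization = True   # use the raw-program (graph-free) optimizer
strategy.fuse_all_reduce_ops = True          # fuse gradients before allreduce
strategy.fuse_grad_size_in_num = 8           # at most 8 gradients per coalesced buffer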
+ def _allreduce_fusion_program(self): + block = self.main_program.global_block() + ring_id = self.global_ring_id + record_idx, allreduce_input_vars, allreduce_output_vars = [], [], [] + block_ops = len(list(enumerate(block.ops))) + + for idx, op in reversed(list(enumerate(block.ops))): + if is_backward_op(op) and \ + OP_ROLE_VAR_KEY in op.attr_names: + op_role_var = op.attr(OP_ROLE_VAR_KEY) + if len(op_role_var) == 0: + continue + assert len(op_role_var) % 2 == 0 + for i in range(0, len(op_role_var), 2): + param_name = op_role_var[i] + param = block.var(param_name) + grad_name = op_role_var[i + 1] + grad = block.var(grad_name) + if param.is_distributed: + continue + if ".cast_fp16@GRAD" in grad_name: + param_name = param_name + ".cast_fp16" + if not block.has_var(param_name): + raise ValueError("op cast name error {}".format( + op.type)) + else: + param = block.var(param_name) + + if len(allreduce_output_vars) == 0: + allreduce_output_vars.append([grad]) + allreduce_input_vars.append([param]) + if self.fuse_grad_size_in_num == 1: + record_idx.append([idx, idx]) + continue + record_idx.append([-2, idx]) + elif len(allreduce_output_vars[ + -1]) == self.fuse_grad_size_in_num: + allreduce_output_vars.append([grad]) + allreduce_input_vars.append([param]) + if self.fuse_grad_size_in_num == 1: + record_idx.append([idx, idx]) + continue + if idx != block_ops - 1: + record_idx.append([-2, idx]) + else: + allreduce_output_vars[-1].append(grad) + allreduce_input_vars[-1].append(param) + record_idx[-1][0] = idx + + if record_idx[-1][0] == -2: + record_idx[-1][0] = record_idx[-1][1] + + assert len(allreduce_output_vars) == len( + record_idx + ), "It has different lens between the allreduce_output_vars and record_idx." + + if not allreduce_output_vars or not allreduce_input_vars: + return + + self.vars = collections.OrderedDict() + index, offset_pos, pos, offset = 0, 0, 0, 0 + start, end = record_idx[index] + men_list = [end, start] + + # Here we need to explain the flag. When integrating OP, we will encounter different groups of the same Op. + # Because we insert coalesce tensor in reverse ops, + # we need to use flag to record whether the current OP has been inserted into coalesce tensor。 + # For example: + # [(3, 2), (2, 2), (1, 0)], (3, 2), (2, 2) using same op, but in different groups. 
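# A simplified standalone sketch (illustrative, not part of the patch) of the grouping policy
# used above: gradients are collected in reverse op order and packed into buckets of at most
# fuse_grad_size_in_num entries; each bucket later becomes one coalesce_tensor plus one
# c_allreduce_sum instead of one allreduce per gradient.
def bucket_grads(grad_names, fuse_grad_size_in_num):
    buckets = []
    for name in reversed(grad_names):
        if not buckets or len(buckets[-1]) == fuse_grad_size_in_num:
            buckets.append([name])      # start a new bucket
        else:
            buckets[-1].append(name)    # current bucket still has room
    return buckets

print(bucket_grads(["w1@GRAD", "w2@GRAD", "w3@GRAD", "w4@GRAD", "w5@GRAD"], 2))
# [['w5@GRAD', 'w4@GRAD'], ['w3@GRAD', 'w2@GRAD'], ['w1@GRAD']]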
+ + for idx, op in reversed(list(enumerate(block.ops))): + if idx == start: + pos = 0 + flag = True if end == men_list[-1] else False + offset = offset_pos if flag else 0 + done_output_vars, done_input_vars = self._split_fuction( + allreduce_output_vars[index], allreduce_input_vars[index]) + for id_, done_output_var in enumerate(done_output_vars): + if flag: + tmp_var = block.create_var( + name=unique_name.generate( + 'FusedOutput_{}_{}'.format(start, id_ + + offset)), + dtype=done_output_var[0].dtype, + persistable=False, + stop_gradient=True) + self.vars['FusedOutput_{}_{}'.format(start, id_ + + offset)] = tmp_var + + block._insert_op( + idx + id_ + offset, + type="coalesce_tensor", + inputs={"Input": done_input_vars[id_]}, + outputs={ + "Output": done_output_var, + "FusedOutput": tmp_var + }, + attrs={ + "copy_data": False, + "use_align": True, + "dtype": done_output_var[0].dtype + }) + pos += 1 + else: + tmp_var = block.create_var( + name=unique_name.generate( + 'FusedOutput_{}_{}'.format(start, id_)), + dtype=done_output_var[0].dtype, + persistable=False, + stop_gradient=True) + self.vars['FusedOutput_{}_{}'.format(start, + id_)] = tmp_var + + block._insert_op( + idx + id_, + type="coalesce_tensor", + inputs={"Input": done_input_vars[id_]}, + outputs={ + "Output": done_output_var, + "FusedOutput": tmp_var + }, + attrs={ + "copy_data": False, + "use_align": True, + "dtype": done_output_var[0].dtype + }) + pos += 1 + offset_pos = pos + + # TODO(Liu yuang): ADD CUDA and NPU's EVENT and c_allreduce_sum. + for id_ in range(len(done_output_vars)): + if flag: + block._insert_op( + end + id_ + pos + 1, + type='c_allreduce_sum', + inputs={ + 'X': self.vars['FusedOutput_{}_{}'.format( + start, id_ + offset)] + }, + outputs={ + 'Out': self.vars['FusedOutput_{}_{}'.format( + start, id_ + offset)] + }, + attrs={ + 'ring_id': ring_id, + 'use_calc_stream': True + if self._calc_stream else False, + OP_ROLE_KEY: OpRole.Backward + }) + else: + block._insert_op( + end + id_ + pos + 1, + type='c_allreduce_sum', + inputs={ + 'X': self.vars['FusedOutput_{}_{}'.format(start, + id_)] + }, + outputs={ + 'Out': self.vars['FusedOutput_{}_{}'.format( + start, id_)] + }, + attrs={ + 'ring_id': ring_id, + 'use_calc_stream': True + if self._calc_stream else False, + OP_ROLE_KEY: OpRole.Backward + }) + index += 1 + men_list.append(end) + men_list.append(start) + if len(record_idx) == index: + start = end = -1 + continue + start, end = record_idx[index] + + if not self._calc_stream: + for idx, op in enumerate(block.ops): + if is_optimizer_op(op): + block._insert_op( + idx, + type='c_sync_comm_stream', + inputs={'X': block.create_var()}, + outputs={'Out': block.create_var()}, + attrs={ + 'ring_id': ring_id, + OP_ROLE_KEY: OpRole.Backward + }) + break + + # Integrate grads of the same type to form a combination. If skip_comb is selected, will return grads of the same group. 
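# A simplified standalone sketch (illustrative, not part of the patch) of the end result of
# the dtype grouping described here: with skip_comb, all fp16 gradients end up in one fused
# group and all fp32 gradients in another, so mixed-precision buffers are never coalesced
# together.
def split_by_dtype(dtypes):
    fp16, fp32 = [], []
    for i, dtype in enumerate(dtypes):
        (fp16 if dtype == "fp16" else fp32).append(i)
    return [g for g in (fp16, fp32) if g]

print(split_by_dtype(["fp16", "fp16", "fp32", "fp16"]))  # [[0, 1, 3], [2]]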
+ # For example:[(fp16, fp16), (fp32), (fp16)] -> [(fp16, fp16, fp16), (fp32)] + def _split_fuction(self, + allreduce_output_vars, + allreduce_input_vars, + skip_comb=True): + input_vars, final_input_vars, output_vars, final_output_vars = [], [], [], [] + if len(allreduce_output_vars) - 1 == 0: + final_output_vars.append(allreduce_output_vars) + final_input_vars.append(allreduce_input_vars) + return final_output_vars, final_input_vars + + for idx in range(len(allreduce_input_vars) - 1): + if allreduce_input_vars[idx].dtype == allreduce_input_vars[idx + + 1].dtype: + input_vars.append(allreduce_input_vars[idx]) + if idx == len(allreduce_input_vars) - 2: + input_vars.append(allreduce_input_vars[idx + 1]) + final_input_vars.append(input_vars) + else: + input_vars.append(allreduce_input_vars[idx]) + final_input_vars.append(input_vars) + input_vars = [] + if idx == len(allreduce_input_vars) - 2: + input_vars.append(allreduce_input_vars[idx + 1]) + final_input_vars.append(input_vars) + + for idx in range(len(allreduce_output_vars) - 1): + if allreduce_output_vars[idx].dtype == allreduce_output_vars[ + idx + 1].dtype: + output_vars.append(allreduce_output_vars[idx]) + if idx == len(allreduce_output_vars) - 2: + output_vars.append(allreduce_output_vars[idx + 1]) + final_output_vars.append(output_vars) + else: + output_vars.append(allreduce_output_vars[idx]) + final_output_vars.append(output_vars) + output_vars = [] + if idx == len(allreduce_output_vars) - 2: + output_vars.append(allreduce_output_vars[idx + 1]) + final_output_vars.append(output_vars) + if skip_comb: + input_fp16_vars, input_fp32_vars, output_fp16_vars, output_fp32_vars = [], [], [], [] + for final_input_var in final_input_vars: + if final_input_var[0].dtype == core.VarDesc.VarType.FP16: + input_fp16_vars.extend(final_input_var) + else: + input_fp32_vars.extend(final_input_var) + + for final_output_var in final_output_vars: + if final_output_var[0].dtype == core.VarDesc.VarType.FP16: + output_fp16_vars.extend(final_output_var) + else: + output_fp32_vars.extend(final_output_var) + final_output_vars, final_input_vars = [], [] + if output_fp16_vars: + final_output_vars.append(output_fp16_vars) + if output_fp32_vars: + final_output_vars.append(output_fp32_vars) + if input_fp16_vars: + final_input_vars.append(input_fp16_vars) + if input_fp32_vars: + final_input_vars.append(input_fp32_vars) + + return final_output_vars, final_input_vars diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py index 285647352dfbb0c8faf56900e3fd16ab5700950f..85f114d7f71413688ee8793635af79ebb2cda850 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py @@ -368,7 +368,8 @@ def insert_reduce_ops(block, for var in reduce_vars: root_id = get_grad_device(var, shard) - assert root_id >= 0, "root id should be a positive int".format(var) + assert root_id >= 0, "root id should be a positive int, but now root id is {}".format( + root_id) block._insert_op_without_sync( insert_idx, type='c_reduce_sum', diff --git a/python/paddle/distributed/fleet/meta_parallel/__init__.py b/python/paddle/distributed/fleet/meta_parallel/__init__.py index 894771a3d5005f9b803d4a5842d266a0247c9279..0750c2c250e2bb90f7384d644661d6eb059bb22a 100644 --- a/python/paddle/distributed/fleet/meta_parallel/__init__.py +++ b/python/paddle/distributed/fleet/meta_parallel/__init__.py @@ -15,6 +15,7 @@ from 
.parallel_layers import VocabParallelEmbedding # noqa: F401 from .parallel_layers import ColumnParallelLinear # noqa: F401 from .parallel_layers import RowParallelLinear # noqa: F401 +from .parallel_layers import ParallelCrossEntropy # noqa: F401 from .parallel_layers import LayerDesc # noqa: F401 from .parallel_layers import PipelineLayer # noqa: F401 from .parallel_layers import RNGStatesTracker # noqa: F401 diff --git a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/__init__.py b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/__init__.py index 6a33611403ace0f3ceefeea8b108c6ff8fa2d885..72da962b8914eb2d5eb92e40960c60a4703c6d52 100644 --- a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/__init__.py +++ b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/__init__.py @@ -15,6 +15,7 @@ from .mp_layers import VocabParallelEmbedding # noqa: F401 from .mp_layers import ColumnParallelLinear # noqa: F401 from .mp_layers import RowParallelLinear # noqa: F401 +from .mp_layers import ParallelCrossEntropy # noqa: F401 from .pp_layers import LayerDesc # noqa: F401 from .pp_layers import PipelineLayer # noqa: F401 from .random import RNGStatesTracker # noqa: F401 diff --git a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py index 91f9868f96ef90278b4d9340bf9d52c957ff5b1f..f091c890f6854246d2f76a41403ae518e39e1b8d 100644 --- a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py +++ b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py @@ -18,6 +18,7 @@ from .random import get_rng_state_tracker from paddle.nn import functional as F from paddle import framework from ...base import topology as tp +from paddle.autograd import PyLayer __all__ = [] @@ -243,3 +244,19 @@ class RowParallelLinear(Layer): output = output_ + self.bias if self.bias is not None else output_ return output + + +class ParallelCrossEntropy(Layer): + def __init__(self, name=None): + super(ParallelCrossEntropy, self).__init__() + self.name = name + self.model_parallel_group = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_group( + ) + self.world_size = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_world_size( + ) + self.rank = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_rank() + + def forward(self, input, label): + loss = paddle.distributed.collective._c_softmax_with_cross_entropy( + input, label, group=self.model_parallel_group) + return loss diff --git a/python/paddle/distributed/fleet/utils/fs.py b/python/paddle/distributed/fleet/utils/fs.py index 087942e70a226379778c4df1ceacb4e422b75e5b..f9cedba7773fbf3b9b3f497258bd2011556e1816 100644 --- a/python/paddle/distributed/fleet/utils/fs.py +++ b/python/paddle/distributed/fleet/utils/fs.py @@ -841,8 +841,7 @@ class HDFSClient(FS): fs_src_path)) if self.is_exist(fs_dst_path): - raise FSFileExistsError("{} exists already".format( - fs_src_path, fs_dst_path, fs_dst_path)) + raise FSFileExistsError("{} exists already".format(fs_dst_path)) return self._try_mv(fs_src_path, fs_dst_path) diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index bc042e722947a0f0293f655a5d58f7823c0d0d03..efe747408428a68772726c28af469b975836511e 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -29,9 +29,7 @@ from paddle.fluid.dygraph import parallel_helper from paddle.fluid.dygraph.parallel import ParallelEnv from 
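# A minimal numpy sketch (illustrative, not part of the patch) of what ParallelCrossEntropy /
# _c_softmax_with_cross_entropy above computes when logits are split along the class
# dimension across model-parallel ranks: per-shard max and sum-exp are combined (via
# collectives in the real op) and the loss matches the single-device result.
import numpy as np

logits = np.array([[2.0, 0.5, -1.0, 3.0]])   # one sample, 4 classes
label = 3
shards = np.split(logits, 2, axis=1)         # simulate 2 ranks, 2 classes each

g_max = max(s.max() for s in shards)                      # allreduce(max)
g_sum = sum(np.exp(s - g_max).sum() for s in shards)      # allreduce(sum)
loss = np.log(g_sum) + g_max - logits[0, label]           # -log softmax[label]

ref = -np.log(np.exp(logits[0, label]) / np.exp(logits[0]).sum())
assert np.isclose(loss, ref)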
paddle.distributed.fleet.base.private_helper_function import wait_server_ready # noqa: F401 -__all__ = [ #noqa - "init_parallel_env" -] +__all__ = [] ParallelStrategy = core.ParallelStrategy @@ -152,7 +150,6 @@ def init_parallel_env(): init_gloo = int(os.getenv("PADDLE_WITH_GLOO", "0")) if init_gloo: ep_rank_0 = parallel_env.trainer_endpoints[0].split(":") - ep_rank = parallel_env.trainer_endpoints[parallel_env.rank].split(":") manager = Manager() # glboal dict to store status http_server_d = manager.dict() diff --git a/python/paddle/distributed/utils.py b/python/paddle/distributed/utils.py index e84025c2eb6d204be470b586e1adb6d739b76f52..447c059537ba3f2ba728f1acc76748f8a9154fca 100644 --- a/python/paddle/distributed/utils.py +++ b/python/paddle/distributed/utils.py @@ -25,6 +25,7 @@ import subprocess from contextlib import closing import socket from paddle.fluid import core +from distutils.util import strtobool __all__ = [ #noqa 'get_host_name_ip', @@ -166,7 +167,7 @@ class Cluster(object): def __ne__(self, cluster): return not self.__eq__(cluster) - def update_pods(cluster): + def update_pods(self, cluster): self.pods = copy.copy(cluster.pods) def trainers_nranks(self): @@ -264,7 +265,7 @@ class Pod(object): self.id != pod.id or \ self.addr != pod.addr or \ self.port != pod.port: - logger.debug("pod {} != pod".format(self, pod)) + logger.debug("pod {} != {}".format(self, pod)) return False if len(self.trainers) != len(pod.trainers): @@ -384,7 +385,7 @@ def add_arguments(argname, type, default, help, argparser, **kwargs): add_argument("name", str, "Jonh", "User name.", parser) args = parser.parse_args() """ - type = distutils.util.strtobool if type == bool else type + type = strtobool if type == bool else type argparser.add_argument( "--" + argname, default=default, diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index 25412a86a8b940b9cba7210fbd17271955295bd1..708167a0273996fbb67eddec711ccff2aca5e759 100755 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -456,7 +456,7 @@ def _addup_repetitive_outputs_(op_descs, block_idx): In these cases, the variable should be the accumulation of all the outputs. `sum_op`s are added to implement the accumulate. 
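# A small standalone illustration (not part of the patch) of why add_arguments above maps
# type=bool to strtobool: argparse would otherwise call bool() on the raw string, and
# bool("False") is True, so "--flag False" could never turn a flag off.
from distutils.util import strtobool

print(bool("False"))       # True  -- every non-empty string is truthy
print(strtobool("False"))  # 0
print(strtobool("true"))   # 1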
""" - _MAX_ADD_NUM_ = core.globals()['FLAGS_max_inplace_grad_add'] + _MAX_ADD_NUM_ = framework._global_flags()['FLAGS_max_inplace_grad_add'] #pending_sum_ops = [] pending_sum_ops = collections.OrderedDict() var_rename_count = collections.defaultdict(int) diff --git a/python/paddle/fluid/contrib/mixed_precision/decorator.py b/python/paddle/fluid/contrib/mixed_precision/decorator.py index 3cb9fe75559b1615f2ed1a01bd31742c2996e090..d5d2e7a0d963963d217ab82cefef93a71aca814a 100644 --- a/python/paddle/fluid/contrib/mixed_precision/decorator.py +++ b/python/paddle/fluid/contrib/mixed_precision/decorator.py @@ -303,14 +303,23 @@ class OptimizerWithMixedPrecision(object): if self._is_distributed: # if distributed, split check_finite_and_unscale to overlap # unscale with communication - for p, g in params_grads: - with self._train_program._optimized_guard([p, g]): + if core.is_compiled_with_npu(): + with self._train_program._optimized_guard(grads): _, found_inf = check_finite_and_unscale( - [g, ], + grads, self._loss_scaling, name="find_infinite_scale", float_status=self._float_status) found_infs.append(found_inf) + else: + for p, g in params_grads: + with self._train_program._optimized_guard([p, g]): + _, found_inf = check_finite_and_unscale( + [g, ], + self._loss_scaling, + name="find_infinite_scale", + float_status=self._float_status) + found_infs.append(found_inf) elif self._use_pure_fp16: if fp32_grads: with self._train_program._optimized_guard(fp32_grads): diff --git a/python/paddle/fluid/contrib/model_stat.py b/python/paddle/fluid/contrib/model_stat.py index ca4bfac5ba5a14065af002b62f9987f5177fbd7a..11ab8800f287f415e4088ac47b4e4c48c066c4dd 100644 --- a/python/paddle/fluid/contrib/model_stat.py +++ b/python/paddle/fluid/contrib/model_stat.py @@ -150,6 +150,7 @@ def _format_summary(collected_ops_list): ''' _verify_dependent_package() + from prettytable import PrettyTable summary_table = PrettyTable( ["No.", "TYPE", "INPUT", "OUTPUT", "PARAMs", "FLOPs"]) summary_table.align = 'r' diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py index 66b11d1f17ad412de616f7053665a2045c09359e..600ce6397e1af37474a47c117c61dad0774da5fd 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py @@ -251,24 +251,25 @@ class ImperativeQuantizeInputs(object): super(ImperativeQuantizeInputs, self).__init__() self._quantizable_layer_type = tuple( - utils.quant_input_layers_map[layer] - if layer in utils.quant_input_layers_map else layer + utils.layer_name_map[layer] + if layer in utils.layer_name_map else layer for layer in quantizable_layer_type) for layer in self._quantizable_layer_type: - assert not isinstance(layer, str), \ + assert not isinstance(layer, str) \ + and layer in utils.fake_quant_input_layers, \ "%s is unspported to be quantized." % layer quantize_type = { 'abs_max', 'moving_average_abs_max', 'channel_wise_abs_max' } - assert weight_quantize_type in quantize_type, \ + assert weight_quantize_type != 'moving_average_abs_max' \ + and weight_quantize_type in quantize_type, \ "Unsupported weight_quantize_type: %s. It can only " \ - "be abs_max or moving_average_abs_max or " \ - "channel_wise_abs_max." % weight_quantize_type - assert activation_quantize_type != 'channel_wise_abs_max' \ - and activation_quantize_type in quantize_type, \ + "be abs_max or channel_wise_abs_max." 
% weight_quantize_type + # TODO (jc): activation_quantize_type supports range_abs_max + assert activation_quantize_type == 'moving_average_abs_max', \ "Unsupported activation_quantize_type: %s. It can " \ - "only be abs_max or moving_average_abs_max now." \ + "only be moving_average_abs_max now." \ % activation_quantize_type bits_check = lambda bits: isinstance(bits, int) \ @@ -305,30 +306,22 @@ class ImperativeQuantizeInputs(object): assert isinstance(model, dygraph.Layer), \ "The model must be the instance of dygraph.Layer." - for name, layer in model.named_sublayers(): - if not isinstance(layer, self._quantizable_layer_type) \ - or (hasattr(layer, "skip_quant") \ - and layer.skip_quant == True): + for name, cur_layer in model.named_sublayers(): + if not isinstance(cur_layer, self._quantizable_layer_type) \ + or (hasattr(cur_layer, "skip_quant") \ + and cur_layer.skip_quant == True): continue - # TODO(jc): optimize this module - last_idx = 0 - idx = 0 - obj = model - while idx < len(name): - if (name[idx] == '.'): - if hasattr(obj, name[last_idx:idx]): - obj = getattr(obj, name[last_idx:idx]) - last_idx = idx + 1 - idx += 1 - target = name[last_idx:idx] - - quant_layer = self._get_input_quantized_layer(layer) - setattr(obj, target, quant_layer) + parent_layer, sub_name = \ + utils.find_parent_layer_and_sub_name(model, name) + + cur_quant_layer = self._get_input_quantized_layer(cur_layer) + setattr(parent_layer, sub_name, cur_quant_layer) def _get_input_quantized_layer(self, layer): quant_layer_name = None - for key, value in utils.quant_input_layers_map.items(): + + for key, value in utils.layer_name_map.items(): if isinstance(layer, value): quant_layer_name = 'Quantized' + key break @@ -336,10 +329,6 @@ class ImperativeQuantizeInputs(object): "The layer %s is unsupported to be quantized." \ % layer.full_name() - layer_with_weight = ['QuantizedConv2D', 'QuantizedLinear'] - if quant_layer_name not in layer_with_weight: - quant_layer_name = 'QuantizedNoweightLayer' - return quant_nn.__dict__[quant_layer_name](layer, **self._kwargs) @@ -374,25 +363,21 @@ class ImperativeQuantizeOutputs(object): assert isinstance(model, dygraph.Layer), \ "The model must be the instance of dygraph.Layer." - for name, layer in model.named_sublayers(): - if not self._is_target_layer(layer): + for cur_name, cur_layer in model.named_sublayers(): + if not self._is_target_layer(cur_layer): continue - # TODO(jc): optimize this module - last_idx = 0 - idx = 0 - obj = model - while idx < len(name): - if (name[idx] == '.'): - if hasattr(obj, name[last_idx:idx]): - obj = getattr(obj, name[last_idx:idx]) - last_idx = idx + 1 - idx += 1 - target = name[last_idx:idx] - - quant_layer = quant_nn.__dict__["QuantizedOutputLayer"]( - layer, self._moving_rate) - setattr(obj, target, quant_layer) + parent_layer, sub_name = \ + utils.find_parent_layer_and_sub_name(model, cur_name) + + if isinstance(cur_layer, tuple(utils.fake_quant_output_layers)): + cur_quant_layer = quant_nn.FakeQuantMAOutputScaleLayer( + cur_layer, self._moving_rate) + else: + cur_quant_layer = quant_nn.MAOutputScaleLayer(cur_layer, + self._moving_rate) + + setattr(parent_layer, sub_name, cur_quant_layer) def save_quantized_model(self, layer, path, input_spec=None, **config): """ @@ -468,9 +453,18 @@ class ImperativeQuantizeOutputs(object): """ Whether the layer needs to calculate output scales. 
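# A rough standalone sketch (illustrative, not part of the patch) of what the
# utils.find_parent_layer_and_sub_name helper used above has to do: walk a dotted sublayer
# name such as "block0.conv1" down to its parent object so setattr can swap the sublayer for
# its quantized wrapper. The helper's real implementation lives in the imperative utils
# module; the classes below are hypothetical stand-ins.
def find_parent_and_sub_name(model, full_name):
    parts = full_name.split(".")
    parent = model
    for part in parts[:-1]:
        parent = getattr(parent, part)
    return parent, parts[-1]

class _Conv(object):
    pass

class _Block(object):
    def __init__(self):
        self.conv1 = _Conv()

class _Model(object):
    def __init__(self):
        self.block0 = _Block()

model = _Model()
parent, sub_name = find_parent_and_sub_name(model, "block0.conv1")
setattr(parent, sub_name, "quantized conv placeholder")
print(model.block0.conv1)  # quantized conv placeholder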
""" - return isinstance(layer, utils.quant_output_layers) \ - or ('quantized' in layer.full_name() and \ - 'quantized_noweight' not in layer.full_name()) + flag = False + if isinstance(layer, dygraph.Layer): + # exclude fake_quant ops in quant_nn file + if utils.is_leaf_layer(layer) and \ + not isinstance(layer, tuple(utils.fake_quant_leaf_layers)): + flag = True + # consider QuantizedConv2D and QuantizedLinear ops + if isinstance(layer, tuple(utils.fake_quant_wrap_layers)): + flag = True + if isinstance(layer, paddle.nn.quant.FloatFunctionalLayer): + flag = True + return flag def _save_output_scale(self, program, scope): """ @@ -514,4 +508,4 @@ class ImperativeQuantizeOutputs(object): previous_ops = [utils.find_previous_op(block, arg_name) \ for arg_name in in_op.input_arg_names] return any(op is not None and op.type not in \ - utils.fake_quantize_dequantize_types for op in previous_ops) + utils.fake_quantize_dequantize_op_types for op in previous_ops) diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py b/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py index f6fef0689d43afd832aa8a5360fc7823575d8223..fd1f7f423ff8f42b8022c686a6a380119079beb2 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py @@ -22,17 +22,28 @@ from paddle.fluid.framework import in_dygraph_mode from paddle.fluid.initializer import Constant from paddle.fluid.data_feeder import check_variable_and_dtype from paddle.nn import functional as F +import logging +from paddle.fluid.log_helper import get_logger __all__ = [ - 'FakeQuantMovingAverage', 'FakeQuantAbsMax', - 'FakeChannelWiseQuantDequantAbsMax', 'QuantizedConv2D', 'QuantizedLinear', - 'QuantizedNoweightLayer', 'MovingAverageAbsMaxScale' + 'FakeQuantMovingAverageAbsMax', + 'FakeQuantAbsMax', + 'FakeQuantChannelWiseAbsMax', + 'QuantizedConv2D', + 'QuantizedLinear', + 'QuantizedNoweightLayer', + 'MovingAverageAbsMaxScale', + 'MAOutputScaleLayer', + 'FakeQuantMAOutputScaleLayer', ] +_logger = get_logger( + __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s') -class FakeQuantMovingAverage(layers.Layer): + +class FakeQuantMovingAverageAbsMax(layers.Layer): r""" - FakeQuantMovingAverage layer does the moving_average_abs_max quant and then dequant. + FakeQuantMovingAverageAbsMax layer does the moving_average_abs_max quant and then dequant. 
    Its computational formula is described as below:
    :math:`scale = (moving\_rate*accum+max(abs(x)))/(moving\_rate*state+1)`
@@ -45,7 +56,7 @@ class FakeQuantMovingAverage(layers.Layer):
                  moving_rate=0.9,
                  quant_bits=8,
                  dtype='float32'):
-        super(FakeQuantMovingAverage, self).__init__()
+        super(FakeQuantMovingAverageAbsMax, self).__init__()
         self._moving_rate = moving_rate
         self._quant_bits = quant_bits
@@ -98,7 +109,7 @@ class FakeQuantMovingAverage(layers.Layer):
             return out
         check_variable_and_dtype(input, 'input', ['float32'],
-                                 "FakeQuantMovingAverage")
+                                 "FakeQuantMovingAverageAbsMax")
         attrs = {
             'moving_rate': self._moving_rate,
             'bit_length': self._quant_bits,
@@ -210,7 +221,7 @@ class FakeQuantAbsMax(layers.Layer):
         return quant_out

-class FakeChannelWiseQuantDequantAbsMax(layers.Layer):
+class FakeQuantChannelWiseAbsMax(layers.Layer):
     def __init__(self,
                  name=None,
                  channel_num=None,
@@ -219,7 +230,7 @@ class FakeChannelWiseQuantDequantAbsMax(layers.Layer):
                  dtype='float32',
                  quant_on_weight=False):
         assert quant_on_weight == True, "Channel_wise only can be used on weight quantization."
-        super(FakeChannelWiseQuantDequantAbsMax, self).__init__()
+        super(FakeQuantChannelWiseAbsMax, self).__init__()
         self._quant_bits = quant_bits
         self._quant_axis = quant_axis
         self._dtype = dtype
@@ -265,7 +276,7 @@ class FakeChannelWiseQuantDequantAbsMax(layers.Layer):
             return out
         check_variable_and_dtype(input, 'input', ['float32'],
-                                 "FakeChannelWiseQuantDequantAbsMax")
+                                 "FakeQuantChannelWiseAbsMax")
         attrs = {'bit_length': self._quant_bits, 'quant_axis': self._quant_axis}
         inputs = {"X": [input]}
         quant_out = self._helper.create_variable(
@@ -313,8 +324,8 @@ def _get_fake_quant_type(quant_type, **kwargs):
             "when you use channel_wise_abs_max strategy.")
     fake_quant_map = {
         'abs_max': FakeQuantAbsMax,
-        'moving_average_abs_max': FakeQuantMovingAverage,
-        'channel_wise_abs_max': FakeChannelWiseQuantDequantAbsMax
+        'moving_average_abs_max': FakeQuantMovingAverageAbsMax,
+        'channel_wise_abs_max': FakeQuantChannelWiseAbsMax
     }
     return fake_quant_map[quant_type](**call_args)
@@ -498,12 +509,7 @@ class QuantizedNoweightLayer(layers.Layer):
             quant_on_weight=False)

     def forward(self, input):
-        quant_input = self._fake_quant_input(input)
-        # TODO (jc): support ops that have several inputs
-        if isinstance(input, list):
-            assert len(input) == 1, \
-                "The QuantizedNoweightLayer should only have one input."
-        return self._layer.forward(quant_input)
+        return self._layer.forward(self._fake_quant_input(input))

 class MovingAverageAbsMaxScale(layers.Layer):
@@ -590,19 +596,56 @@ class MovingAverageAbsMaxScale(layers.Layer):
         return quant_out

-class QuantizedOutputLayer(layers.Layer):
-    def __init__(self, layer=None, moving_rate=0.9, dtype='float32'):
+class MAOutputScaleLayer(layers.Layer):
+    """
+    Calculate the output scale (moving average abs max) for the wrapped layer,
+    by appending a MovingAverageAbsMaxScale sublayer after it.
+    """
+
+    def __init__(self, layer=None, moving_rate=0.9, name=None, dtype='float32'):
         r"""
-        Add MovingAverageMaxScale layer to the behind of the input layer.
+        Construct a MAOutputScaleLayer, wrapping the given layer.
         """
-        super(QuantizedOutputLayer, self).__init__()
+        super(MAOutputScaleLayer, self).__init__()
         self._layer = layer
-        self._moving_average_abs_max_scale = \
-            MovingAverageAbsMaxScale(layer.full_name(), moving_rate, dtype)
+        if name is None:
+            name = layer.full_name()
+        self._ma_output_scale = \
+            MovingAverageAbsMaxScale(name, moving_rate, dtype)
+
+    def forward(self, *inputs, **kwargs):
+        out = self._layer(*inputs, **kwargs)
+        # TODO (jc): support layers with multiple outputs
+        if (isinstance(out, list) or isinstance(out, tuple)) and len(out) > 1:
+            return out
+        else:
+            return self._ma_output_scale(out)

-    def forward(self, input):
-        if isinstance(input, list):
-            assert len(input) == 1, \
-                "The QuantizedOutputLayer should only have one input."
-        out = self._layer(input)
-        return self._moving_average_abs_max_scale(out)
+
+class FakeQuantMAOutputScaleLayer(layers.Layer):
+    def __init__(self,
+                 layer,
+                 weight_bits=8,
+                 activation_bits=8,
+                 moving_rate=0.9,
+                 name=None,
+                 *args,
+                 **kwargs):
+
+        super(FakeQuantMAOutputScaleLayer, self).__init__()
+        self._layer = layer
+        self._fake_quant_output = _get_fake_quant_type(
+            'moving_average_abs_max',
+            name=layer.full_name() if name is None else name,
+            moving_rate=moving_rate,
+            quant_bits=activation_bits,
+            dtype=self._dtype,
+            quant_on_weight=False)
+
+    def forward(self, *inputs, **kwargs):
+        out = self._layer(*inputs, **kwargs)
+        # TODO (jc): support layers with multiple outputs
+        if (isinstance(out, list) or isinstance(out, tuple)) and len(out) > 1:
+            return out
+        else:
+            return self._fake_quant_output(out)
diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py
index 491f8a7e25cbcd451bed0f5daef20b225f1c6a1f..94639b9cc68f94f20ac21e5cf54613ae6fc8d49d 100644
--- a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py
+++ b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py
@@ -13,9 +13,11 @@
 # limitations under the License.

 import paddle
+from paddle.fluid import dygraph
 import numpy as np

+from .
import quant_nn

-quant_input_layers_map = {
+layer_name_map = {
     'Conv2D': paddle.nn.Conv2D,
     'Linear': paddle.nn.Linear,
     'AdaptiveAvgPool2D': paddle.nn.AdaptiveAvgPool2D,
@@ -37,30 +39,38 @@ quant_input_layers_map = {
     'LayerNorm': paddle.nn.LayerNorm,
 }

-fake_quantize_dequantize_types = [
-    "fake_quantize_dequantize_abs_max",
-    "fake_channel_wise_quantize_dequantize_abs_max",
-    "fake_quantize_dequantize_moving_average_abs_max"
+# Apply fake quant for the inputs of these layers
+# TODO (jc): support paddle.nn.Conv2DTranspose
+fake_quant_input_layers = [paddle.nn.Conv2D, paddle.nn.Linear]
+
+# Apply fake quant for the outputs of these layers
+# TODO(jc): fix the problem of adding duplicate fake_quant ops
+# paddle.nn.AdaptiveAvgPool2D, paddle.nn.AvgPool2D, paddle.nn.ReLU, paddle.nn.LeakyReLU
+fake_quant_output_layers = [
+    paddle.nn.quant.add, paddle.nn.quant.subtract, paddle.nn.quant.multiply,
+    paddle.nn.quant.divide
+]
+
+fake_quant_leaf_layers = [
+    quant_nn.FakeQuantAbsMax,
+    quant_nn.FakeQuantChannelWiseAbsMax,
+    quant_nn.FakeQuantMovingAverageAbsMax,
+    quant_nn.MovingAverageAbsMaxScale,
 ]

-quant_output_layers = (
-    paddle.nn.Conv2D, paddle.nn.Conv2DTranspose, paddle.nn.Linear,
-    paddle.nn.AdaptiveAvgPool2D, paddle.nn.AdaptiveMaxPool2D,
-    paddle.nn.AvgPool2D, paddle.nn.MaxPool2D, paddle.nn.BatchNorm,
-    paddle.nn.BatchNorm2D, paddle.nn.LayerNorm, paddle.nn.SyncBatchNorm,
-    paddle.nn.ELU, paddle.nn.GELU, paddle.nn.Hardshrink, paddle.nn.Hardsigmoid,
-    paddle.nn.Hardswish, paddle.nn.Hardtanh, paddle.nn.LeakyReLU,
-    paddle.nn.LogSigmoid, paddle.nn.LogSoftmax, paddle.nn.Maxout,
-    paddle.nn.PReLU, paddle.nn.ReLU, paddle.nn.ReLU6, paddle.nn.SELU,
-    paddle.nn.Sigmoid, paddle.nn.Softmax, paddle.nn.Softplus,
-    paddle.nn.Softshrink, paddle.nn.Softsign, paddle.nn.Swish, paddle.nn.Tanh,
-    paddle.nn.Tanhshrink, paddle.nn.ThresholdedReLU, paddle.nn.Upsample)
+fake_quant_wrap_layers = [quant_nn.QuantizedConv2D, quant_nn.QuantizedLinear]

 weight_op_types = [
     "conv2d", "depthwise_conv2d", "matmul", "conv2d_transpose",
     "depthwise_conv2d_transpose"
 ]

+fake_quantize_dequantize_op_types = [
+    "fake_quantize_dequantize_abs_max",
+    "fake_channel_wise_quantize_dequantize_abs_max",
+    "fake_quantize_dequantize_moving_average_abs_max"
+]
+

 def load_variable_data(scope, var_name):
     '''
@@ -90,3 +100,36 @@ def find_next_ops(block, var_name):
         if var_name in op.input_arg_names:
             res_ops.append(op)
     return res_ops
+
+
+def find_parent_layer_and_sub_name(model, name):
+    """
+    Given the model and the name of a layer, find the parent layer and
+    the sub_name of the layer.
+    For example, if name is 'block_1.convbn_1.conv_1', the parent layer is
+    'block_1.convbn_1' and the sub_name is 'conv_1'.
+    """
+    assert isinstance(model, dygraph.Layer), \
+        "The model must be the instance of paddle.nn.Layer."
+    assert len(name) > 0, "The input (name) should not be empty."
+
+    last_idx = 0
+    idx = 0
+    parent_layer = model
+    while idx < len(name):
+        if name[idx] == '.':
+            sub_name = name[last_idx:idx]
+            if hasattr(parent_layer, sub_name):
+                parent_layer = getattr(parent_layer, sub_name)
+            last_idx = idx + 1
+        idx += 1
+    sub_name = name[last_idx:idx]
+    return parent_layer, sub_name
+
+
+def is_leaf_layer(layer):
+    """
+    Whether the layer is a leaf layer, i.e. it has no sublayers.
+ """ + return isinstance(layer, dygraph.Layer) \ + and len(layer.sublayers()) == 0 diff --git a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt index 249de87090ed8a50fa3e00cd4087b42636c70fa0..20c60dc58b78dc6134254018a3e5d7c886cff524 100644 --- a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt +++ b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt @@ -270,12 +270,6 @@ list(REMOVE_ITEM TEST_OPS #TODO(wanghaoshuang): Fix this unitest failed on GCC8. LIST(REMOVE_ITEM TEST_OPS test_auto_pruning) LIST(REMOVE_ITEM TEST_OPS test_filter_pruning) - -# only tests on singal GPU environment -LIST(REMOVE_ITEM TEST_OPS test_imperative_qat_addquantdequant) - -py_test_modules(test_imperative_qat_addquantdequant MODULES test_imperative_qat_addquantdequant ENVS - CUDA_VISIBLE_DEVICES=0) # fix if(WIN32) @@ -313,7 +307,6 @@ set_tests_properties(test_quantization_pass PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_qat_channelwise PROPERTIES TIMEOUT 120) set_tests_properties(test_user_defined_quantization PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_qat PROPERTIES TIMEOUT 120) -set_tests_properties(test_imperative_qat_addquantdequant PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_out_scale PROPERTIES TIMEOUT 120) if(LINUX AND WITH_MKLDNN) set_tests_properties(test_quant2_int8_mobilenetv1_mkldnn PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/contrib/slim/tests/imperative_test_utils.py b/python/paddle/fluid/contrib/slim/tests/imperative_test_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..cc26f6a88f2e0f32435b631d36c58bd7ae20744b --- /dev/null +++ b/python/paddle/fluid/contrib/slim/tests/imperative_test_utils.py @@ -0,0 +1,224 @@ +# copyright (c) 2021 paddlepaddle authors. all rights reserved. +# +# licensed under the apache license, version 2.0 (the "license"); +# you may not use this file except in compliance with the license. +# you may obtain a copy of the license at +# +# http://www.apache.org/licenses/license-2.0 +# +# unless required by applicable law or agreed to in writing, software +# distributed under the license is distributed on an "as is" basis, +# without warranties or conditions of any kind, either express or implied. +# see the license for the specific language governing permissions and +# limitations under the license. 
+import numpy as np +import logging + +import paddle +import paddle.fluid as fluid +from paddle.fluid import core +from paddle.fluid.dygraph.container import Sequential +from paddle.nn import ReLU, ReLU6, LeakyReLU, Sigmoid, Softmax, PReLU +from paddle.nn import Linear, Conv2D, Softmax, BatchNorm2D, MaxPool2D + +from paddle.fluid.log_helper import get_logger + +_logger = get_logger( + __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s') + + +def fix_model_dict(model): + fixed_state = {} + for name, param in model.named_parameters(): + p_shape = param.numpy().shape + p_value = param.numpy() + if name.endswith("bias"): + value = np.zeros_like(p_value).astype('float32') + else: + value = np.random.normal( + loc=0.0, scale=0.01, + size=np.product(p_shape)).reshape(p_shape).astype('float32') + fixed_state[name] = value + model.set_dict(fixed_state) + return model + + +def train_lenet(lenet, reader, optimizer): + loss_list = [] + lenet.train() + + for batch_id, data in enumerate(reader()): + x_data = np.array([x[0].reshape(1, 28, 28) + for x in data]).astype('float32') + y_data = np.array([x[1] for x in data]).astype('int64').reshape(-1, 1) + + img = paddle.to_tensor(x_data) + label = paddle.to_tensor(y_data) + + out = lenet(img) + loss = fluid.layers.cross_entropy(out, label) + avg_loss = fluid.layers.mean(loss) + avg_loss.backward() + + optimizer.minimize(avg_loss) + lenet.clear_gradients() + + if batch_id % 100 == 0: + loss_list.append(avg_loss.numpy()[0]) + _logger.info('{}: {}'.format('loss', avg_loss.numpy())) + + return loss_list + + +class ImperativeLenet(fluid.dygraph.Layer): + def __init__(self, num_classes=10): + super(ImperativeLenet, self).__init__() + conv2d_w1_attr = fluid.ParamAttr(name="conv2d_w_1") + conv2d_w2_attr = fluid.ParamAttr(name="conv2d_w_2") + fc_w1_attr = fluid.ParamAttr(name="fc_w_1") + fc_w2_attr = fluid.ParamAttr(name="fc_w_2") + fc_w3_attr = fluid.ParamAttr(name="fc_w_3") + conv2d_b2_attr = fluid.ParamAttr(name="conv2d_b_2") + fc_b1_attr = fluid.ParamAttr(name="fc_b_1") + fc_b2_attr = fluid.ParamAttr(name="fc_b_2") + fc_b3_attr = fluid.ParamAttr(name="fc_b_3") + self.features = Sequential( + Conv2D( + in_channels=1, + out_channels=6, + kernel_size=3, + stride=1, + padding=1, + weight_attr=conv2d_w1_attr, + bias_attr=False), + BatchNorm2D(6), + ReLU(), + MaxPool2D( + kernel_size=2, stride=2), + Conv2D( + in_channels=6, + out_channels=16, + kernel_size=5, + stride=1, + padding=0, + weight_attr=conv2d_w2_attr, + bias_attr=conv2d_b2_attr), + BatchNorm2D(16), + PReLU(), + MaxPool2D( + kernel_size=2, stride=2)) + + self.fc = Sequential( + Linear( + in_features=400, + out_features=120, + weight_attr=fc_w1_attr, + bias_attr=fc_b1_attr), + LeakyReLU(), + Linear( + in_features=120, + out_features=84, + weight_attr=fc_w2_attr, + bias_attr=fc_b2_attr), + Sigmoid(), + Linear( + in_features=84, + out_features=num_classes, + weight_attr=fc_w3_attr, + bias_attr=fc_b3_attr), + Softmax()) + self.add = paddle.nn.quant.add() + + def forward(self, inputs): + x = self.features(inputs) + + x = fluid.layers.flatten(x, 1) + x = self.add(x, paddle.to_tensor(0.0)) # For CI + x = self.fc(x) + return x + + +class ImperativeLenetWithSkipQuant(fluid.dygraph.Layer): + def __init__(self, num_classes=10): + super(ImperativeLenetWithSkipQuant, self).__init__() + + conv2d_w1_attr = fluid.ParamAttr(name="conv2d_w_1") + conv2d_w2_attr = fluid.ParamAttr(name="conv2d_w_2") + fc_w1_attr = fluid.ParamAttr(name="fc_w_1") + fc_w2_attr = fluid.ParamAttr(name="fc_w_2") + fc_w3_attr = 
fluid.ParamAttr(name="fc_w_3") + conv2d_b1_attr = fluid.ParamAttr(name="conv2d_b_1") + conv2d_b2_attr = fluid.ParamAttr(name="conv2d_b_2") + fc_b1_attr = fluid.ParamAttr(name="fc_b_1") + fc_b2_attr = fluid.ParamAttr(name="fc_b_2") + fc_b3_attr = fluid.ParamAttr(name="fc_b_3") + self.conv2d_0 = Conv2D( + in_channels=1, + out_channels=6, + kernel_size=3, + stride=1, + padding=1, + weight_attr=conv2d_w1_attr, + bias_attr=conv2d_b1_attr) + self.conv2d_0.skip_quant = True + + self.batch_norm_0 = BatchNorm2D(6) + self.relu_0 = ReLU() + self.pool2d_0 = MaxPool2D(kernel_size=2, stride=2) + self.conv2d_1 = Conv2D( + in_channels=6, + out_channels=16, + kernel_size=5, + stride=1, + padding=0, + weight_attr=conv2d_w2_attr, + bias_attr=conv2d_b2_attr) + self.conv2d_1.skip_quant = False + + self.batch_norm_1 = BatchNorm2D(16) + self.relu6_0 = ReLU6() + self.pool2d_1 = MaxPool2D(kernel_size=2, stride=2) + self.linear_0 = Linear( + in_features=400, + out_features=120, + weight_attr=fc_w1_attr, + bias_attr=fc_b1_attr) + self.linear_0.skip_quant = True + + self.leaky_relu_0 = LeakyReLU() + self.linear_1 = Linear( + in_features=120, + out_features=84, + weight_attr=fc_w2_attr, + bias_attr=fc_b2_attr) + self.linear_1.skip_quant = False + + self.sigmoid_0 = Sigmoid() + self.linear_2 = Linear( + in_features=84, + out_features=num_classes, + weight_attr=fc_w3_attr, + bias_attr=fc_b3_attr) + self.linear_2.skip_quant = False + self.softmax_0 = Softmax() + + def forward(self, inputs): + x = self.conv2d_0(inputs) + x = self.batch_norm_0(x) + x = self.relu_0(x) + x = self.pool2d_0(x) + x = self.conv2d_1(x) + x = self.batch_norm_1(x) + x = self.relu6_0(x) + x = self.pool2d_1(x) + + x = fluid.layers.flatten(x, 1) + + x = self.linear_0(x) + x = self.leaky_relu_0(x) + x = self.linear_1(x) + x = self.sigmoid_0(x) + x = self.linear_2(x) + x = self.softmax_0(x) + + return x diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py index 8d6ce76ef0fa5f3d1b1e9400c705ffc625fcf9bb..6cc58a38f227a5181971110b369c47fa706bddae 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py @@ -28,7 +28,6 @@ from paddle.fluid import core from paddle.fluid.optimizer import AdamOptimizer from paddle.fluid.framework import IrGraph from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware -from paddle.fluid.contrib.slim.quantization import OutScaleForTrainingPass, OutScaleForInferencePass, QuantizationTransformPass from paddle.fluid.dygraph.container import Sequential from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX from paddle.nn.layer import ReLU, LeakyReLU, Sigmoid, Softmax, PReLU @@ -36,6 +35,8 @@ from paddle.nn import Linear, Conv2D, Softmax, BatchNorm2D, MaxPool2D from paddle.fluid.log_helper import get_logger from paddle.fluid.dygraph import nn +from imperative_test_utils import fix_model_dict, train_lenet, ImperativeLenet + paddle.enable_static() os.environ["CPU_NUM"] = "1" @@ -54,59 +55,6 @@ def get_vaild_warning_num(warning, w): return num -def StaticLenet(data, num_classes=10, classifier_activation='softmax'): - conv2d_w1_attr = fluid.ParamAttr(name="conv2d_w_1") - conv2d_w2_attr = fluid.ParamAttr(name="conv2d_w_2") - fc_w1_attr = fluid.ParamAttr(name="fc_w_1") - fc_w2_attr = fluid.ParamAttr(name="fc_w_2") - fc_w3_attr = fluid.ParamAttr(name="fc_w_3") - conv2d_b2_attr = fluid.ParamAttr(name="conv2d_b_2") - 
fc_b1_attr = fluid.ParamAttr(name="fc_b_1") - fc_b2_attr = fluid.ParamAttr(name="fc_b_2") - fc_b3_attr = fluid.ParamAttr(name="fc_b_3") - conv1 = fluid.layers.conv2d( - data, - num_filters=6, - filter_size=3, - stride=1, - padding=1, - param_attr=conv2d_w1_attr, - bias_attr=False) - batch_norm1 = layers.batch_norm(conv1) - relu1 = layers.relu(batch_norm1) - pool1 = fluid.layers.pool2d( - relu1, pool_size=2, pool_type='max', pool_stride=2) - conv2 = fluid.layers.conv2d( - pool1, - num_filters=16, - filter_size=5, - stride=1, - padding=0, - param_attr=conv2d_w2_attr, - bias_attr=conv2d_b2_attr) - batch_norm2 = layers.batch_norm(conv2) - prelu1 = layers.prelu(batch_norm2, mode='all') - pool2 = fluid.layers.pool2d( - prelu1, pool_size=2, pool_type='max', pool_stride=2) - - fc1 = fluid.layers.fc(input=pool2, - size=120, - param_attr=fc_w1_attr, - bias_attr=fc_b1_attr) - leaky_relu1 = layers.leaky_relu(fc1, alpha=0.01) - fc2 = fluid.layers.fc(input=leaky_relu1, - size=84, - param_attr=fc_w2_attr, - bias_attr=fc_b2_attr) - sigmoid1 = layers.sigmoid(fc2) - fc3 = fluid.layers.fc(input=sigmoid1, - size=num_classes, - param_attr=fc_w3_attr, - bias_attr=fc_b3_attr) - softmax1 = layers.softmax(fc3, use_cudnn=True) - return softmax1 - - class ImperativeLenet(fluid.dygraph.Layer): def __init__(self, num_classes=10): super(ImperativeLenet, self).__init__() @@ -175,38 +123,11 @@ class ImperativeLenet(fluid.dygraph.Layer): class TestImperativeOutSclae(unittest.TestCase): def test_out_scale_acc(self): - def _build_static_lenet(main, startup, is_test=False, seed=1000): - with fluid.unique_name.guard(): - with fluid.program_guard(main, startup): - main.random_seed = seed - startup.random_seed = seed - img = fluid.layers.data( - name='image', shape=[1, 28, 28], dtype='float32') - label = fluid.layers.data( - name='label', shape=[1], dtype='int64') - prediction = StaticLenet(img) - if not is_test: - loss = fluid.layers.cross_entropy( - input=prediction, label=label) - avg_loss = fluid.layers.mean(loss) - else: - avg_loss = prediction - return img, label, avg_loss - - reader = paddle.batch( - paddle.dataset.mnist.test(), batch_size=32, drop_last=True) - weight_quantize_type = 'abs_max' - activation_quantize_type = 'moving_average_abs_max' - param_init_map = {} seed = 1000 lr = 0.001 - dynamic_out_scale_list = [] - static_out_scale_list = [] - # imperative train - _logger.info( - "--------------------------dynamic graph qat--------------------------" - ) + weight_quantize_type = 'abs_max' + activation_quantize_type = 'moving_average_abs_max' imperative_out_scale = ImperativeQuantAware( weight_quantize_type=weight_quantize_type, activation_quantize_type=activation_quantize_type) @@ -215,207 +136,46 @@ class TestImperativeOutSclae(unittest.TestCase): np.random.seed(seed) fluid.default_main_program().random_seed = seed fluid.default_startup_program().random_seed = seed + lenet = ImperativeLenet() - fixed_state = {} - for name, param in lenet.named_parameters(): - p_shape = param.numpy().shape - p_value = param.numpy() - if name.endswith("bias"): - value = np.zeros_like(p_value).astype('float32') - else: - value = np.random.normal( - loc=0.0, scale=0.01, size=np.product(p_shape)).reshape( - p_shape).astype('float32') - fixed_state[name] = value - param_init_map[param.name] = value - lenet.set_dict(fixed_state) + lenet = fix_model_dict(lenet) imperative_out_scale.quantize(lenet) + + reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=32, drop_last=True) adam = AdamOptimizer( learning_rate=lr, 
parameter_list=lenet.parameters()) - dynamic_loss_rec = [] - lenet.train() - for batch_id, data in enumerate(reader()): - x_data = np.array([x[0].reshape(1, 28, 28) - for x in data]).astype('float32') - y_data = np.array( - [x[1] for x in data]).astype('int64').reshape(-1, 1) - - img = fluid.dygraph.to_variable(x_data) - label = fluid.dygraph.to_variable(y_data) - - out = lenet(img) - loss = fluid.layers.cross_entropy(out, label) - avg_loss = fluid.layers.mean(loss) - avg_loss.backward() - adam.minimize(avg_loss) - lenet.clear_gradients() - dynamic_loss_rec.append(avg_loss.numpy()[0]) - if batch_id % 100 == 0: - _logger.info('{}: {}'.format('loss', avg_loss.numpy())) - + loss_list = train_lenet(lenet, reader, adam) lenet.eval() param_save_path = "test_save_quantized_model/lenet.pdparams" save_dict = lenet.state_dict() paddle.save(save_dict, param_save_path) - path = "./dynamic_outscale_infer_model/lenet" - dynamic_save_dir = "./dynamic_outscale_infer_model" - + save_path = "./dynamic_outscale_infer_model/lenet" imperative_out_scale.save_quantized_model( layer=lenet, - path=path, + path=save_path, input_spec=[ paddle.static.InputSpec( shape=[None, 1, 28, 28], dtype='float32') ]) - _logger.info( - "--------------------------static graph qat--------------------------" - ) - static_loss_rec = [] - if core.is_compiled_with_cuda(): - place = core.CUDAPlace(0) - else: - place = core.CPUPlace() - exe = fluid.Executor(place) - - main = fluid.Program() - infer = fluid.Program() - startup = fluid.Program() - static_img, static_label, static_loss = _build_static_lenet( - main, startup, False, seed) - infer_img, _, infer_pre = _build_static_lenet(infer, startup, True, - seed) - with fluid.unique_name.guard(): - with fluid.program_guard(main, startup): - opt = AdamOptimizer(learning_rate=lr) - opt.minimize(static_loss) - - scope = core.Scope() - with fluid.scope_guard(scope): - exe.run(startup) - for param in main.all_parameters(): - if "batch_norm" in param.name: - param_name = param.name.replace("norm", "norm2d") - elif 'prelu' in param.name: - param_name = param.name.replace("prelu", 'p_re_lu') - else: - param_name = param.name - param_tensor = scope.var(param.name).get_tensor() - param_tensor.set(param_init_map[param_name], place) - main_graph = IrGraph(core.Graph(main.desc), for_test=False) - infer_graph = IrGraph(core.Graph(infer.desc), for_test=True) - transform_pass = QuantizationTransformPass( - scope=scope, - place=place, - activation_quantize_type=activation_quantize_type, - weight_quantize_type=weight_quantize_type, - quantizable_op_type=['conv2d', 'depthwise_conv2d', 'mul']) - transform_pass.apply(main_graph) - transform_pass.apply(infer_graph) - outscale_pass = OutScaleForTrainingPass(scope=scope, place=place) - outscale_pass.apply(main_graph) - build_strategy = fluid.BuildStrategy() - build_strategy.fuse_all_reduce_ops = False - binary = fluid.CompiledProgram(main_graph.graph).with_data_parallel( - loss_name=static_loss.name, build_strategy=build_strategy) - - feeder = fluid.DataFeeder( - feed_list=[static_img, static_label], place=place) - with fluid.scope_guard(scope): - for batch_id, data in enumerate(reader()): - loss_v, = exe.run(binary, - feed=feeder.feed(data), - fetch_list=[static_loss]) - static_loss_rec.append(loss_v[0]) - if batch_id % 100 == 0: - _logger.info('{}: {}'.format('loss', loss_v)) - scale_inference_pass = OutScaleForInferencePass(scope=scope) - scale_inference_pass.apply(infer_graph) - - save_program = infer_graph.to_program() - static_save_dir = 
"./static_outscale_infer_model" - with fluid.scope_guard(scope): - fluid.io.save_inference_model( - dirname=static_save_dir, - feeded_var_names=[infer_img.name], - target_vars=[infer_pre], - executor=exe, - main_program=save_program, - model_filename="lenet" + INFER_MODEL_SUFFIX, - params_filename="lenet" + INFER_PARAMS_SUFFIX) - - rtol = 1e-05 - atol = 1e-08 - for i, (loss_d, - loss_s) in enumerate(zip(dynamic_loss_rec, static_loss_rec)): - diff = np.abs(loss_d - loss_s) - if diff > (atol + rtol * np.abs(loss_s)): - _logger.info( - "diff({}) at {}, dynamic loss = {}, static loss = {}". - format(diff, i, loss_d, loss_s)) - break - self.assertTrue( - np.allclose( - np.array(dynamic_loss_rec), - np.array(static_loss_rec), - rtol=rtol, - atol=atol, - equal_nan=True), - msg='Failed to do the imperative qat.') - - # load dynamic model - [dynamic_inference_program, feed_target_names, fetch_targets] = ( - fluid.io.load_inference_model( - dirname=dynamic_save_dir, - executor=exe, - model_filename="lenet" + INFER_MODEL_SUFFIX, - params_filename="lenet" + INFER_PARAMS_SUFFIX)) - # load static model - [static_inference_program, feed_target_names, fetch_targets] = ( - fluid.io.load_inference_model( - dirname=static_save_dir, - executor=exe, - model_filename="lenet" + INFER_MODEL_SUFFIX, - params_filename="lenet" + INFER_PARAMS_SUFFIX)) - - dynamic_ops = dynamic_inference_program.global_block().ops - static_ops = static_inference_program.global_block().ops - - for op in dynamic_ops[:]: - if op.type == "flatten2" or 'fake' in op.type: - dynamic_ops.remove(op) - - for op in static_ops[:]: - if 'fake' in op.type: - static_ops.remove(op) - - op_count = 0 - for i in range(len(dynamic_ops)): - if dynamic_ops[i].has_attr("out_threshold"): - op_count += 1 - self.assertTrue(dynamic_ops[i].type == static_ops[i].type) - if dynamic_ops[i].attr("out_threshold") != static_ops[i].attr( - "out_threshold"): - _logger.info(dynamic_ops[i].attr("out_threshold")) - _logger.info(static_ops[i].attr("out_threshold")) - self.assertTrue(dynamic_ops[i].attr("out_threshold") == - static_ops[i].attr("out_threshold")) - - _logger.info("op_cout: {}".format(op_count)) - self.assertTrue(op_count == 14) + for i in range(len(loss_list) - 1): + self.assertTrue( + loss_list[i] > loss_list[i + 1], + msg='Failed to do the imperative qat.') class TestSaveQuanztizedModelFromCheckPoint(unittest.TestCase): def test_save_quantized_model(self): - weight_quantize_type = 'abs_max' - activation_quantize_type = 'moving_average_abs_max' + lr = 0.001 + load_param_path = "test_save_quantized_model/lenet.pdparams" - path = "./dynamic_outscale_infer_model_from_checkpoint/lenet" - dynamic_model_save_dir = "./dynamic_outscale_infer_model_from_checkpoint" - static_model_save_dir = "./static_outscale_infer_model" + save_path = "./dynamic_outscale_infer_model_from_checkpoint/lenet" + weight_quantize_type = 'abs_max' + activation_quantize_type = 'moving_average_abs_max' imperative_out_scale = ImperativeQuantAware( weight_quantize_type=weight_quantize_type, activation_quantize_type=activation_quantize_type) @@ -426,56 +186,25 @@ class TestSaveQuanztizedModelFromCheckPoint(unittest.TestCase): imperative_out_scale.quantize(lenet) lenet.set_dict(load_dict) + reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=32, drop_last=True) + adam = AdamOptimizer( + learning_rate=lr, parameter_list=lenet.parameters()) + loss_list = train_lenet(lenet, reader, adam) + lenet.eval() + imperative_out_scale.save_quantized_model( layer=lenet, - path=path, + 
path=save_path, input_spec=[ paddle.static.InputSpec( shape=[None, 1, 28, 28], dtype='float32') ]) - if core.is_compiled_with_cuda(): - place = core.CUDAPlace(0) - else: - place = core.CPUPlace() - exe = fluid.Executor(place) - - # load dynamic model - [dynamic_inference_program, feed_target_names, fetch_targets] = ( - fluid.io.load_inference_model( - dirname=dynamic_model_save_dir, - executor=exe, - model_filename="lenet" + INFER_MODEL_SUFFIX, - params_filename="lenet" + INFER_PARAMS_SUFFIX)) - # load static model - [static_inference_program, feed_target_names, fetch_targets] = ( - fluid.io.load_inference_model( - dirname=static_model_save_dir, - executor=exe, - model_filename="lenet" + INFER_MODEL_SUFFIX, - params_filename="lenet" + INFER_PARAMS_SUFFIX)) - - dynamic_ops = dynamic_inference_program.global_block().ops - static_ops = static_inference_program.global_block().ops - - for op in dynamic_ops[:]: - if op.type == "flatten2" or 'fake' in op.type: - dynamic_ops.remove(op) - - for op in static_ops[:]: - if 'fake' in op.type: - static_ops.remove(op) - - op_count = 0 - for i in range(len(dynamic_ops)): - if dynamic_ops[i].has_attr("out_threshold"): - op_count += 1 - self.assertTrue(dynamic_ops[i].type == static_ops[i].type) - self.assertTrue(dynamic_ops[i].attr("out_threshold") == - static_ops[i].attr("out_threshold")) - - _logger.info("op_cout: {}".format(op_count)) - self.assertTrue(op_count == 14) + for i in range(len(loss_list) - 1): + self.assertTrue( + loss_list[i] > loss_list[i + 1], + msg='Failed to do the imperative qat.') if __name__ == '__main__': diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py index 99a23525409f3746982db402132db5d04f936bd4..bf411e5b38efaed4c99b628d19f1021df5e0facf 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py @@ -21,20 +21,20 @@ import shutil import time import unittest import logging + import paddle import paddle.fluid as fluid from paddle.fluid import core from paddle.fluid.optimizer import AdamOptimizer -from paddle.fluid.framework import IrGraph from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware -from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass from paddle.fluid.dygraph.container import Sequential from paddle.nn import Linear, Conv2D, Softmax -from paddle.fluid.dygraph.nn import Pool2D from paddle.fluid.log_helper import get_logger from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX from paddle.fluid.contrib.slim.quantization.imperative.quant_nn import QuantizedConv2D +from imperative_test_utils import fix_model_dict, ImperativeLenet + paddle.enable_static() os.environ["CPU_NUM"] = "1" @@ -45,115 +45,6 @@ _logger = get_logger( __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s') -def StaticLenet(data, num_classes=10): - conv2d_w1_attr = fluid.ParamAttr(name="conv2d_w_1") - conv2d_w2_attr = fluid.ParamAttr(name="conv2d_w_2") - fc_w1_attr = fluid.ParamAttr(name="fc_w_1") - fc_w2_attr = fluid.ParamAttr(name="fc_w_2") - fc_w3_attr = fluid.ParamAttr(name="fc_w_3") - conv2d_b1_attr = fluid.ParamAttr(name="conv2d_b_1") - conv2d_b2_attr = fluid.ParamAttr(name="conv2d_b_2") - fc_b1_attr = fluid.ParamAttr(name="fc_b_1") - fc_b2_attr = fluid.ParamAttr(name="fc_b_2") - fc_b3_attr = fluid.ParamAttr(name="fc_b_3") - conv1 = fluid.layers.conv2d( - data, - num_filters=6, - filter_size=3, - 
stride=1, - padding=1, - param_attr=conv2d_w1_attr, - bias_attr=conv2d_b1_attr) - pool1 = fluid.layers.pool2d( - conv1, pool_size=2, pool_type='max', pool_stride=2) - conv2 = fluid.layers.conv2d( - pool1, - num_filters=16, - filter_size=5, - stride=1, - padding=0, - param_attr=conv2d_w2_attr, - bias_attr=conv2d_b2_attr) - pool2 = fluid.layers.pool2d( - conv2, pool_size=2, pool_type='max', pool_stride=2) - - fc1 = fluid.layers.fc(input=pool2, - size=120, - param_attr=fc_w1_attr, - bias_attr=fc_b1_attr) - fc2 = fluid.layers.fc(input=fc1, - size=84, - param_attr=fc_w2_attr, - bias_attr=fc_b2_attr) - fc3 = fluid.layers.fc(input=fc2, - size=num_classes, - param_attr=fc_w3_attr, - bias_attr=fc_b3_attr) - fc4 = fluid.layers.softmax(fc3, use_cudnn=True) - - return fc4 - - -class ImperativeLenet(fluid.dygraph.Layer): - def __init__(self, num_classes=10): - super(ImperativeLenet, self).__init__() - conv2d_w1_attr = fluid.ParamAttr(name="conv2d_w_1") - conv2d_w2_attr = fluid.ParamAttr(name="conv2d_w_2") - fc_w1_attr = fluid.ParamAttr(name="fc_w_1") - fc_w2_attr = fluid.ParamAttr(name="fc_w_2") - fc_w3_attr = fluid.ParamAttr(name="fc_w_3") - conv2d_b1_attr = fluid.ParamAttr(name="conv2d_b_1") - conv2d_b2_attr = fluid.ParamAttr(name="conv2d_b_2") - fc_b1_attr = fluid.ParamAttr(name="fc_b_1") - fc_b2_attr = fluid.ParamAttr(name="fc_b_2") - fc_b3_attr = fluid.ParamAttr(name="fc_b_3") - self.features = Sequential( - Conv2D( - in_channels=1, - out_channels=6, - kernel_size=3, - stride=1, - padding=1, - weight_attr=conv2d_w1_attr, - bias_attr=conv2d_b1_attr), - Pool2D( - pool_size=2, pool_type='max', pool_stride=2), - Conv2D( - in_channels=6, - out_channels=16, - kernel_size=5, - stride=1, - padding=0, - weight_attr=conv2d_w2_attr, - bias_attr=conv2d_b2_attr), - Pool2D( - pool_size=2, pool_type='max', pool_stride=2)) - - self.fc = Sequential( - Linear( - in_features=400, - out_features=120, - weight_attr=fc_w1_attr, - bias_attr=fc_b1_attr), - Linear( - in_features=120, - out_features=84, - weight_attr=fc_w2_attr, - bias_attr=fc_b2_attr), - Linear( - in_features=84, - out_features=num_classes, - weight_attr=fc_w3_attr, - bias_attr=fc_b3_attr), - Softmax()) - - def forward(self, inputs): - x = self.features(inputs) - x = fluid.layers.flatten(x, 1) - x = self.fc(x) - return x - - class TestImperativeQat(unittest.TestCase): """ QAT = quantization-aware training @@ -164,19 +55,26 @@ class TestImperativeQat(unittest.TestCase): timestamp = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime()) cls.root_path = os.path.join(os.getcwd(), "imperative_qat_" + timestamp) cls.save_path = os.path.join(cls.root_path, "lenet") - cls.dynamic_root_path = os.path.join(os.getcwd(), - "dynamic_mnist_" + timestamp) - cls.dynamic_save_path = os.path.join(cls.dynamic_root_path, "model") @classmethod def tearDownClass(cls): - shutil.rmtree(cls.root_path) - shutil.rmtree(cls.dynamic_root_path) + try: + shutil.rmtree(cls.root_path) + except Exception as e: + print("Failed to delete {} due to {}".format(cls.root_path, str(e))) + + def set_vars(self): + self.weight_quantize_type = None + self.activation_quantize_type = None + print('weight_quantize_type', self.weight_quantize_type) + + def run_qat_save(self): + self.set_vars() - def test_qat_save(self): imperative_qat = ImperativeQuantAware( - weight_quantize_type='abs_max', - activation_quantize_type='moving_average_abs_max') + weight_quantize_type=self.weight_quantize_type, + activation_quantize_type=self.activation_quantize_type) + with fluid.dygraph.guard(): # For CI coverage conv1 = 
Conv2D( @@ -190,10 +88,17 @@ class TestImperativeQat(unittest.TestCase): data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32') quant_conv1(fluid.dygraph.to_variable(data)) + seed = 1 + np.random.seed(seed) + fluid.default_main_program().random_seed = seed + fluid.default_startup_program().random_seed = seed + lenet = ImperativeLenet() + lenet = fix_model_dict(lenet) imperative_qat.quantize(lenet) adam = AdamOptimizer( learning_rate=0.001, parameter_list=lenet.parameters()) + train_reader = paddle.batch( paddle.dataset.mnist.train(), batch_size=32, drop_last=True) test_reader = paddle.batch( @@ -226,6 +131,7 @@ class TestImperativeQat(unittest.TestCase): break lenet.eval() + eval_acc_top1_list = [] for batch_id, data in enumerate(test_reader()): x_data = np.array([x[0].reshape(1, 28, 28) for x in data]).astype('float32') @@ -242,14 +148,19 @@ class TestImperativeQat(unittest.TestCase): input=out, label=label, k=5) if batch_id % 100 == 0: + eval_acc_top1_list.append(float(acc_top1.numpy())) _logger.info( "Test | At epoch {} step {}: acc1 = {:}, acc5 = {:}". format(epoch, batch_id, acc_top1.numpy(), acc_top5.numpy())) - # save weights - model_dict = lenet.state_dict() - fluid.save_dygraph(model_dict, "save_temp") + # check eval acc + eval_acc_top1 = sum(eval_acc_top1_list) / len( + eval_acc_top1_list) + print('eval_acc_top1', eval_acc_top1) + self.assertTrue( + eval_acc_top1 > 0.9, + msg="The test acc {%f} is less than 0.9." % eval_acc_top1) # test the correctness of `paddle.jit.save` data = next(test_reader()) @@ -260,13 +171,14 @@ class TestImperativeQat(unittest.TestCase): before_save = lenet(test_img) # save inference quantized model - paddle.jit.save( + imperative_qat.save_quantized_model( layer=lenet, - path=TestImperativeQat.save_path, + path=self.save_path, input_spec=[ paddle.static.InputSpec( shape=[None, 1, 28, 28], dtype='float32') ]) + print('Quantized model saved in {%s}' % self.save_path) if core.is_compiled_with_cuda(): place = core.CUDAPlace(0) @@ -275,183 +187,27 @@ class TestImperativeQat(unittest.TestCase): exe = fluid.Executor(place) [inference_program, feed_target_names, fetch_targets] = fluid.io.load_inference_model( - dirname=TestImperativeQat.root_path, + dirname=self.root_path, executor=exe, model_filename="lenet" + INFER_MODEL_SUFFIX, params_filename="lenet" + INFER_PARAMS_SUFFIX) after_save, = exe.run(inference_program, feed={feed_target_names[0]: test_data}, fetch_list=fetch_targets) - + # check self.assertTrue( np.allclose(after_save, before_save.numpy()), msg='Failed to save the inference quantized model.') - def test_qat_acc(self): - def _build_static_lenet(main, startup, is_test=False, seed=1000): - with fluid.unique_name.guard(): - with fluid.program_guard(main, startup): - main.random_seed = seed - startup.random_seed = seed - img = fluid.layers.data( - name='image', shape=[1, 28, 28], dtype='float32') - label = fluid.layers.data( - name='label', shape=[1], dtype='int64') - prediction = StaticLenet(img) - if not is_test: - loss = fluid.layers.cross_entropy( - input=prediction, label=label) - avg_loss = fluid.layers.mean(loss) - else: - avg_loss = prediction - return img, label, avg_loss - - reader = paddle.batch( - paddle.dataset.mnist.test(), batch_size=32, drop_last=True) - weight_quantize_type = 'abs_max' - activation_quant_type = 'moving_average_abs_max' - param_init_map = {} - seed = 1000 - lr = 0.01 - - # imperative train - _logger.info( - "--------------------------dynamic graph qat--------------------------" - ) - imperative_qat = 
ImperativeQuantAware( - weight_quantize_type=weight_quantize_type, - activation_quantize_type=activation_quant_type) - with fluid.dygraph.guard(): - np.random.seed(seed) - fluid.default_main_program().random_seed = seed - fluid.default_startup_program().random_seed = seed - lenet = ImperativeLenet() - fixed_state = {} - for name, param in lenet.named_parameters(): - p_shape = param.numpy().shape - p_value = param.numpy() - if name.endswith("bias"): - value = np.zeros_like(p_value).astype('float32') - else: - value = np.random.normal( - loc=0.0, scale=0.01, size=np.product(p_shape)).reshape( - p_shape).astype('float32') - fixed_state[name] = value - param_init_map[param.name] = value - lenet.set_dict(fixed_state) +class TestImperativeQatAbsMax(TestImperativeQat): + def set_vars(self): + self.weight_quantize_type = 'abs_max' + self.activation_quantize_type = 'moving_average_abs_max' + print('weight_quantize_type', self.weight_quantize_type) - imperative_qat.quantize(lenet) - adam = AdamOptimizer( - learning_rate=lr, parameter_list=lenet.parameters()) - dynamic_loss_rec = [] - lenet.train() - for batch_id, data in enumerate(reader()): - x_data = np.array([x[0].reshape(1, 28, 28) - for x in data]).astype('float32') - y_data = np.array( - [x[1] for x in data]).astype('int64').reshape(-1, 1) - - img = fluid.dygraph.to_variable(x_data) - label = fluid.dygraph.to_variable(y_data) - - out = lenet(img) - loss = fluid.layers.cross_entropy(out, label) - avg_loss = fluid.layers.mean(loss) - avg_loss.backward() - adam.minimize(avg_loss) - lenet.clear_gradients() - dynamic_loss_rec.append(avg_loss.numpy()[0]) - if batch_id % 100 == 0: - _logger.info('{}: {}'.format('loss', avg_loss.numpy())) - - paddle.jit.save( - layer=lenet, - path=TestImperativeQat.dynamic_save_path, - input_spec=[ - paddle.static.InputSpec( - shape=[None, 1, 28, 28], dtype='float32') - ]) - - # static graph train - _logger.info( - "--------------------------static graph qat--------------------------" - ) - static_loss_rec = [] - if core.is_compiled_with_cuda(): - place = core.CUDAPlace(0) - else: - place = core.CPUPlace() - exe = fluid.Executor(place) - - main = fluid.Program() - infer = fluid.Program() - startup = fluid.Program() - static_img, static_label, static_loss = _build_static_lenet( - main, startup, False, seed) - infer_img, _, infer_pre = _build_static_lenet(infer, startup, True, - seed) - with fluid.unique_name.guard(): - with fluid.program_guard(main, startup): - opt = AdamOptimizer(learning_rate=lr) - opt.minimize(static_loss) - - scope = core.Scope() - with fluid.scope_guard(scope): - exe.run(startup) - for param in main.all_parameters(): - param_tensor = scope.var(param.name).get_tensor() - param_tensor.set(param_init_map[param.name], place) - - main_graph = IrGraph(core.Graph(main.desc), for_test=False) - infer_graph = IrGraph(core.Graph(infer.desc), for_test=True) - transform_pass = QuantizationTransformPass( - scope=scope, - place=place, - activation_quantize_type=activation_quant_type, - weight_quantize_type=weight_quantize_type, - quantizable_op_type=['conv2d', 'depthwise_conv2d', 'mul']) - transform_pass.apply(main_graph) - transform_pass.apply(infer_graph) - build_strategy = fluid.BuildStrategy() - build_strategy.fuse_all_reduce_ops = False - binary = fluid.CompiledProgram(main_graph.graph).with_data_parallel( - loss_name=static_loss.name, build_strategy=build_strategy) - - feeder = fluid.DataFeeder( - feed_list=[static_img, static_label], place=place) - with fluid.scope_guard(scope): - for batch_id, data in 
enumerate(reader()): - loss_v, = exe.run(binary, - feed=feeder.feed(data), - fetch_list=[static_loss]) - static_loss_rec.append(loss_v[0]) - if batch_id % 100 == 0: - _logger.info('{}: {}'.format('loss', loss_v)) - - save_program = infer_graph.to_program() - with fluid.scope_guard(scope): - fluid.io.save_inference_model("./static_mnist", [infer_img.name], - [infer_pre], exe, save_program) - rtol = 1e-05 - atol = 1e-08 - for i, (loss_d, - loss_s) in enumerate(zip(dynamic_loss_rec, static_loss_rec)): - diff = np.abs(loss_d - loss_s) - if diff > (atol + rtol * np.abs(loss_s)): - _logger.info( - "diff({}) at {}, dynamic loss = {}, static loss = {}". - format(diff, i, loss_d, loss_s)) - break - - self.assertTrue( - np.allclose( - np.array(dynamic_loss_rec), - np.array(static_loss_rec), - rtol=rtol, - atol=atol, - equal_nan=True), - msg='Failed to do the imperative qat.') + def test_qat(self): + self.run_qat_save() if __name__ == '__main__': diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_addquantdequant.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_addquantdequant.py deleted file mode 100644 index f5b3e89ef415c113add9b04f65d3f27cd16244a1..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_addquantdequant.py +++ /dev/null @@ -1,494 +0,0 @@ -# copyright (c) 2018 paddlepaddle authors. all rights reserved. -# -# licensed under the apache license, version 2.0 (the "license"); -# you may not use this file except in compliance with the license. -# you may obtain a copy of the license at -# -# http://www.apache.org/licenses/license-2.0 -# -# unless required by applicable law or agreed to in writing, software -# distributed under the license is distributed on an "as is" basis, -# without warranties or conditions of any kind, either express or implied. -# see the license for the specific language governing permissions and -# limitations under the license. 
- -from __future__ import print_function - -import os -import numpy as np -import random -import shutil -import time -import unittest -import logging -import paddle -import six -import paddle.fluid as fluid -from paddle.nn import functional -from paddle.nn import Linear, Conv2D, Softmax, BatchNorm -from paddle.fluid.layers import nn -from paddle.fluid import core -from paddle.fluid.layer_helper import LayerHelper -from paddle.fluid.optimizer import AdamOptimizer -from paddle.fluid.framework import IrGraph -from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware, QuantizationTransformPass, AddQuantDequantPass -from paddle.fluid.dygraph.container import Sequential -from paddle.fluid.dygraph.nn import Pool2D -from paddle.nn.layer.activation import ReLU, LeakyReLU, ReLU6, Tanh, Swish -from paddle.fluid.log_helper import get_logger -from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX - -paddle.enable_static() - -os.environ["CPU_NUM"] = "1" -if core.is_compiled_with_cuda(): - fluid.set_flags({"FLAGS_cudnn_deterministic": True}) - -_logger = get_logger( - __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s') - - -def StaticLenet(data, num_classes=10): - conv2d_w1_attr = fluid.ParamAttr(name="conv2d_w_1") - conv2d_w2_attr = fluid.ParamAttr(name="conv2d_w_2") - conv2d_w3_attr = fluid.ParamAttr(name="conv2d_w_3") - fc_w1_attr = fluid.ParamAttr(name="fc_w_1") - fc_w2_attr = fluid.ParamAttr(name="fc_w_2") - fc_w3_attr = fluid.ParamAttr(name="fc_w_3") - conv2d_b1_attr = fluid.ParamAttr(name="conv2d_b_1") - conv2d_b2_attr = fluid.ParamAttr(name="conv2d_b_2") - conv2d_b3_attr = fluid.ParamAttr(name="conv2d_b_3") - fc_b1_attr = fluid.ParamAttr(name="fc_b_1") - fc_b2_attr = fluid.ParamAttr(name="fc_b_2") - fc_b3_attr = fluid.ParamAttr(name="fc_b_3") - - conv1 = fluid.layers.conv2d( - data, - num_filters=6, - filter_size=3, - stride=1, - padding=1, - param_attr=conv2d_w1_attr, - bias_attr=conv2d_b1_attr) - conv1 = fluid.layers.leaky_relu(conv1, alpha=0.02) - pool1 = fluid.layers.pool2d( - conv1, pool_size=2, pool_type='max', pool_stride=2) - conv2 = fluid.layers.conv2d( - pool1, - num_filters=16, - filter_size=5, - stride=1, - padding=0, - param_attr=conv2d_w2_attr, - bias_attr=conv2d_b2_attr) - pool2 = fluid.layers.pool2d( - conv2, pool_size=2, pool_type='max', pool_stride=2) - pool2 = fluid.layers.relu(pool2) - pool2 = fluid.layers.swish(pool2) - conv3 = fluid.layers.conv2d( - pool2, - num_filters=16, - filter_size=1, - stride=1, - padding=0, - param_attr=conv2d_w3_attr, - bias_attr=conv2d_b3_attr) - conv3 = fluid.layers.relu6(conv3) - conv3 = paddle.tensor.math.tanh(conv3) - fc1 = fluid.layers.fc(input=conv3, - size=120, - param_attr=fc_w1_attr, - bias_attr=fc_b1_attr) - fc2 = fluid.layers.fc(input=fc1, - size=84, - param_attr=fc_w2_attr, - bias_attr=fc_b2_attr) - fc3 = fluid.layers.fc(input=fc2, - size=num_classes, - param_attr=fc_w3_attr, - bias_attr=fc_b3_attr) - fc3 = fluid.layers.softmax(fc3, use_cudnn=True) - - return fc3 - - -class ImperativeLenet(fluid.dygraph.Layer): - def __init__(self, num_classes=10): - super(ImperativeLenet, self).__init__() - conv2d_w1_attr = fluid.ParamAttr(name="conv2d_w_1") - conv2d_w2_attr = fluid.ParamAttr(name="conv2d_w_2") - conv2d_w3_attr = fluid.ParamAttr(name="conv2d_w_3") - fc_w1_attr = fluid.ParamAttr(name="fc_w_1") - fc_w2_attr = fluid.ParamAttr(name="fc_w_2") - fc_w3_attr = fluid.ParamAttr(name="fc_w_3") - conv2d_b1_attr = fluid.ParamAttr(name="conv2d_b_1") - conv2d_b2_attr = 
fluid.ParamAttr(name="conv2d_b_2") - conv2d_b3_attr = fluid.ParamAttr(name="conv2d_b_3") - fc_b1_attr = fluid.ParamAttr(name="fc_b_1") - fc_b2_attr = fluid.ParamAttr(name="fc_b_2") - fc_b3_attr = fluid.ParamAttr(name="fc_b_3") - self.features = Sequential( - Conv2D( - in_channels=1, - out_channels=6, - kernel_size=3, - stride=1, - padding=1, - weight_attr=conv2d_w1_attr, - bias_attr=conv2d_b1_attr), - LeakyReLU(negative_slope=0.02), - Pool2D( - pool_size=2, pool_type='max', pool_stride=2), - Conv2D( - in_channels=6, - out_channels=16, - kernel_size=5, - stride=1, - padding=0, - weight_attr=conv2d_w2_attr, - bias_attr=conv2d_b2_attr), - Pool2D( - pool_size=2, pool_type='max', pool_stride=2), - ReLU(), - Swish(), - Conv2D( - in_channels=16, - out_channels=16, - kernel_size=1, - stride=1, - padding=0, - weight_attr=conv2d_w3_attr, - bias_attr=conv2d_b3_attr), - ReLU6(), - Tanh()) - self.fc = Sequential( - Linear( - in_features=400, - out_features=120, - weight_attr=fc_w1_attr, - bias_attr=fc_b1_attr), - Linear( - in_features=120, - out_features=84, - weight_attr=fc_w2_attr, - bias_attr=fc_b2_attr), - Linear( - in_features=84, - out_features=num_classes, - weight_attr=fc_w3_attr, - bias_attr=fc_b3_attr), - Softmax()) - - def forward(self, inputs): - x = self.features(inputs) - x = fluid.layers.flatten(x, 1) - x = self.fc(x) - return x - - -class TestImperativeAddQuantDequant(unittest.TestCase): - @classmethod - def setUpClass(cls): - timestamp = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime()) - cls.root_path = os.path.join(os.getcwd(), - "imperative_qat_aqd_" + timestamp) - cls.save_path = os.path.join(cls.root_path, "lenet") - cls.dynamic_root_path = os.path.join(os.getcwd(), - "dynamic_mnist_aqd_" + timestamp) - cls.dynamic_save_path = os.path.join(cls.dynamic_root_path, "model") - - @classmethod - def tearDownClass(cls): - shutil.rmtree(cls.root_path) - shutil.rmtree(cls.dynamic_root_path) - - def test_qat_save(self): - - imperative_qat = ImperativeQuantAware( - weight_quantize_type='abs_max', - activation_quantize_type='moving_average_abs_max', - quantizable_layer_type=[ - 'Conv2D', 'Linear', 'ReLU', 'LeakyReLU', 'ReLU6', 'Tanh', - 'Swish' - ]) - - with fluid.dygraph.guard(): - lenet = ImperativeLenet() - imperative_qat.quantize(lenet) - adam = AdamOptimizer( - learning_rate=0.001, parameter_list=lenet.parameters()) - train_reader = paddle.batch( - paddle.dataset.mnist.train(), batch_size=32, drop_last=True) - test_reader = paddle.batch( - paddle.dataset.mnist.test(), batch_size=32) - - epoch_num = 1 - for epoch in range(epoch_num): - lenet.train() - for batch_id, data in enumerate(train_reader()): - x_data = np.array([x[0].reshape(1, 28, 28) - for x in data]).astype('float32') - y_data = np.array( - [x[1] for x in data]).astype('int64').reshape(-1, 1) - - img = fluid.dygraph.to_variable(x_data) - label = fluid.dygraph.to_variable(y_data) - out = lenet(img) - acc = fluid.layers.accuracy(out, label) - loss = fluid.layers.cross_entropy(out, label) - avg_loss = fluid.layers.mean(loss) - avg_loss.backward() - adam.minimize(avg_loss) - lenet.clear_gradients() - if batch_id % 100 == 0: - _logger.info( - "Train | At epoch {} step {}: loss = {:}, acc= {:}". 
- format(epoch, batch_id, - avg_loss.numpy(), acc.numpy())) - if batch_id == 500: # For shortening CI time - break - - lenet.eval() - for batch_id, data in enumerate(test_reader()): - x_data = np.array([x[0].reshape(1, 28, 28) - for x in data]).astype('float32') - y_data = np.array( - [x[1] for x in data]).astype('int64').reshape(-1, 1) - - img = fluid.dygraph.to_variable(x_data) - label = fluid.dygraph.to_variable(y_data) - - out = lenet(img) - acc_top1 = fluid.layers.accuracy( - input=out, label=label, k=1) - acc_top5 = fluid.layers.accuracy( - input=out, label=label, k=5) - - if batch_id % 100 == 0: - _logger.info( - "Test | At epoch {} step {}: acc1 = {:}, acc5 = {:}". - format(epoch, batch_id, - acc_top1.numpy(), acc_top5.numpy())) - - # save weights - model_dict = lenet.state_dict() - fluid.save_dygraph(model_dict, "save_temp") - - # test the correctness of `paddle.jit.save` - data = next(test_reader()) - test_data = np.array([x[0].reshape(1, 28, 28) - for x in data]).astype('float32') - test_img = fluid.dygraph.to_variable(test_data) - lenet.eval() - before_save = lenet(test_img) - - # save inference quantized model - paddle.jit.save( - layer=lenet, - path=TestImperativeAddQuantDequant.save_path, - input_spec=[ - paddle.static.InputSpec( - shape=[None, 1, 28, 28], dtype='float32') - ]) - if core.is_compiled_with_cuda(): - place = core.CUDAPlace(0) - else: - place = core.CPUPlace() - exe = fluid.Executor(place) - [inference_program, feed_target_names, - fetch_targets] = fluid.io.load_inference_model( - dirname=TestImperativeAddQuantDequant.root_path, - executor=exe, - model_filename="lenet" + INFER_MODEL_SUFFIX, - params_filename="lenet" + INFER_PARAMS_SUFFIX) - after_save, = exe.run(inference_program, - feed={feed_target_names[0]: test_data}, - fetch_list=fetch_targets) - - self.assertTrue( - np.allclose(after_save, before_save.numpy()), - msg='Failed to save the inference quantized model.') - - def test_qat_acc(self): - def _build_static_lenet(main, startup, is_test=False, seed=1000): - with fluid.unique_name.guard(): - with fluid.program_guard(main, startup): - main.random_seed = seed - startup.random_seed = seed - img = fluid.layers.data( - name='image', shape=[1, 28, 28], dtype='float32') - label = fluid.layers.data( - name='label', shape=[1], dtype='int64') - prediction = StaticLenet(img) - if not is_test: - loss = fluid.layers.cross_entropy( - input=prediction, label=label) - avg_loss = fluid.layers.mean(loss) - else: - avg_loss = prediction - return img, label, avg_loss - - reader = paddle.batch( - paddle.dataset.mnist.test(), batch_size=32, drop_last=True) - weight_quantize_type = 'abs_max' - activation_quant_type = 'moving_average_abs_max' - param_init_map = {} - seed = 1000 - lr = 0.001 - - # imperative train - _logger.info( - "--------------------------dynamic graph qat--------------------------" - ) - imperative_qat = ImperativeQuantAware( - weight_quantize_type=weight_quantize_type, - activation_quantize_type=activation_quant_type, - quantizable_layer_type=[ - 'Conv2D', 'Linear', 'ReLU', 'LeakyReLU', 'ReLU6', 'Tanh', - 'Swish' - ]) - - with fluid.dygraph.guard(): - np.random.seed(seed) - fluid.default_main_program().random_seed = seed - fluid.default_startup_program().random_seed = seed - lenet = ImperativeLenet() - fixed_state = {} - for name, param in lenet.named_parameters(): - p_shape = param.numpy().shape - p_value = param.numpy() - if name.endswith("bias"): - value = np.zeros_like(p_value).astype('float32') - else: - value = np.random.normal( - loc=0.0, 
scale=0.01, size=np.product(p_shape)).reshape( - p_shape).astype('float32') - fixed_state[name] = value - param_init_map[param.name] = value - lenet.set_dict(fixed_state) - - imperative_qat.quantize(lenet) - adam = AdamOptimizer( - learning_rate=lr, parameter_list=lenet.parameters()) - dynamic_loss_rec = [] - lenet.train() - for batch_id, data in enumerate(reader()): - x_data = np.array([x[0].reshape(1, 28, 28) - for x in data]).astype('float32') - y_data = np.array( - [x[1] for x in data]).astype('int64').reshape(-1, 1) - - img = fluid.dygraph.to_variable(x_data) - label = fluid.dygraph.to_variable(y_data) - - out = lenet(img) - loss = fluid.layers.cross_entropy(out, label) - avg_loss = fluid.layers.mean(loss) - avg_loss.backward() - adam.minimize(avg_loss) - lenet.clear_gradients() - dynamic_loss_rec.append(avg_loss.numpy()[0]) - if batch_id % 100 == 0: - _logger.info('{}: {}'.format('loss', avg_loss.numpy())) - if batch_id > 500: - break - lenet.eval() - paddle.jit.save( - layer=lenet, - path=TestImperativeAddQuantDequant.dynamic_save_path, - input_spec=[ - paddle.static.InputSpec( - shape=[None, 1, 28, 28], dtype='float32') - ]) - - # static graph train - _logger.info( - "--------------------------static graph qat--------------------------" - ) - static_loss_rec = [] - if core.is_compiled_with_cuda(): - place = core.CUDAPlace(0) - else: - place = core.CPUPlace() - exe = fluid.Executor(place) - - main = fluid.Program() - infer = fluid.Program() - startup = fluid.Program() - static_img, static_label, static_loss = _build_static_lenet( - main, startup, False, seed) - infer_img, _, infer_pre = _build_static_lenet(infer, startup, True, - seed) - with fluid.unique_name.guard(): - with fluid.program_guard(main, startup): - opt = AdamOptimizer(learning_rate=lr) - opt.minimize(static_loss) - - scope = core.Scope() - with fluid.scope_guard(scope): - exe.run(startup) - for param in main.all_parameters(): - param_tensor = scope.var(param.name).get_tensor() - param_tensor.set(param_init_map[param.name], place) - - main_graph = IrGraph(core.Graph(main.desc), for_test=False) - infer_graph = IrGraph(core.Graph(infer.desc), for_test=True) - transform_pass = QuantizationTransformPass( - scope=scope, - place=place, - activation_quantize_type=activation_quant_type, - weight_quantize_type=weight_quantize_type, - quantizable_op_type=['conv2d', 'depthwise_conv2d', 'mul']) - add_quant_dequant_pass = AddQuantDequantPass( - scope=scope, - place=place, - quantizable_op_type=[ - 'relu', 'leaky_relu', 'relu6', 'tanh', 'swish' - ]) - transform_pass.apply(main_graph) - transform_pass.apply(infer_graph) - add_quant_dequant_pass.apply(main_graph) - add_quant_dequant_pass.apply(infer_graph) - build_strategy = fluid.BuildStrategy() - build_strategy.fuse_all_reduce_ops = False - binary = fluid.CompiledProgram(main_graph.graph).with_data_parallel( - loss_name=static_loss.name, build_strategy=build_strategy) - - feeder = fluid.DataFeeder( - feed_list=[static_img, static_label], place=place) - with fluid.scope_guard(scope): - for batch_id, data in enumerate(reader()): - loss_v, = exe.run(binary, - feed=feeder.feed(data), - fetch_list=[static_loss]) - static_loss_rec.append(loss_v[0]) - if batch_id % 100 == 0: - _logger.info('{}: {}'.format('loss', loss_v)) - - save_program = infer_graph.to_program() - with fluid.scope_guard(scope): - fluid.io.save_inference_model("./static_mnist", [infer_img.name], - [infer_pre], exe, save_program) - rtol = 1e-08 - atol = 1e-10 - for i, (loss_d, - loss_s) in enumerate(zip(dynamic_loss_rec, 
static_loss_rec)): - diff = np.abs(loss_d - loss_s) - if diff > (atol + rtol * np.abs(loss_s)): - _logger.info( - "diff({}) at {}, dynamic loss = {}, static loss = {}". - format(diff, i, loss_d, loss_s)) - break - - self.assertTrue( - np.allclose( - np.array(dynamic_loss_rec), - np.array(static_loss_rec), - rtol=rtol, - atol=atol, - equal_nan=True), - msg='Failed to do the imperative qat.') - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py index f888edfcc977ae1a919787fa7a56a89812aeb324..3d2cad388d172edb76a439e6b920bc4a03c26754 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py @@ -19,18 +19,13 @@ import numpy as np import random import unittest import logging + import paddle import paddle.fluid as fluid from paddle.fluid import core -from paddle.fluid.optimizer import AdamOptimizer -from paddle.fluid.framework import IrGraph -from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware -from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass -from paddle.fluid.dygraph.container import Sequential -from paddle.nn import Linear, Conv2D, Softmax -from paddle.fluid.dygraph.nn import Pool2D from paddle.fluid.log_helper import get_logger -from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX + +from test_imperative_qat import TestImperativeQat paddle.enable_static() @@ -42,388 +37,14 @@ _logger = get_logger( __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s') -def StaticLenet(data, num_classes=10): - conv2d_w1_attr = fluid.ParamAttr(name="conv2d_w_1") - conv2d_w2_attr = fluid.ParamAttr(name="conv2d_w_2") - fc_w1_attr = fluid.ParamAttr(name="fc_w_1") - fc_w2_attr = fluid.ParamAttr(name="fc_w_2") - fc_w3_attr = fluid.ParamAttr(name="fc_w_3") - conv2d_b1_attr = fluid.ParamAttr(name="conv2d_b_1") - conv2d_b2_attr = fluid.ParamAttr(name="conv2d_b_2") - fc_b1_attr = fluid.ParamAttr(name="fc_b_1") - fc_b2_attr = fluid.ParamAttr(name="fc_b_2") - fc_b3_attr = fluid.ParamAttr(name="fc_b_3") - conv1 = fluid.layers.conv2d( - data, - num_filters=6, - filter_size=3, - stride=1, - padding=1, - param_attr=conv2d_w1_attr, - bias_attr=conv2d_b1_attr) - pool1 = fluid.layers.pool2d( - conv1, pool_size=2, pool_type='max', pool_stride=2) - conv2 = fluid.layers.conv2d( - pool1, - num_filters=16, - filter_size=5, - stride=1, - padding=0, - param_attr=conv2d_w2_attr, - bias_attr=conv2d_b2_attr) - pool2 = fluid.layers.pool2d( - conv2, pool_size=2, pool_type='max', pool_stride=2) - - fc1 = fluid.layers.fc(input=pool2, - size=120, - param_attr=fc_w1_attr, - bias_attr=fc_b1_attr) - fc2 = fluid.layers.fc(input=fc1, - size=84, - param_attr=fc_w2_attr, - bias_attr=fc_b2_attr) - fc3 = fluid.layers.fc(input=fc2, - size=num_classes, - param_attr=fc_w3_attr, - bias_attr=fc_b3_attr) - fc3 = fluid.layers.softmax(fc3, use_cudnn=True) - - return fc3 - - -class ImperativeLenet(fluid.dygraph.Layer): - def __init__(self, num_classes=10): - super(ImperativeLenet, self).__init__() - conv2d_w1_attr = fluid.ParamAttr(name="conv2d_w_1") - conv2d_w2_attr = fluid.ParamAttr(name="conv2d_w_2") - fc_w1_attr = fluid.ParamAttr(name="fc_w_1") - fc_w2_attr = fluid.ParamAttr(name="fc_w_2") - fc_w3_attr = fluid.ParamAttr(name="fc_w_3") - conv2d_b1_attr = fluid.ParamAttr(name="conv2d_b_1") - conv2d_b2_attr = 
fluid.ParamAttr(name="conv2d_b_2") - fc_b1_attr = fluid.ParamAttr(name="fc_b_1") - fc_b2_attr = fluid.ParamAttr(name="fc_b_2") - fc_b3_attr = fluid.ParamAttr(name="fc_b_3") - self.features = Sequential( - Conv2D( - in_channels=1, - out_channels=6, - kernel_size=3, - stride=1, - padding=1, - weight_attr=conv2d_w1_attr, - bias_attr=conv2d_b1_attr), - Pool2D( - pool_size=2, pool_type='max', pool_stride=2), - Conv2D( - in_channels=6, - out_channels=16, - kernel_size=5, - stride=1, - padding=0, - weight_attr=conv2d_w2_attr, - bias_attr=conv2d_b2_attr), - Pool2D( - pool_size=2, pool_type='max', pool_stride=2)) - - self.fc = Sequential( - Linear( - in_features=400, - out_features=120, - weight_attr=fc_w1_attr, - bias_attr=fc_b1_attr), - Linear( - in_features=120, - out_features=84, - weight_attr=fc_w2_attr, - bias_attr=fc_b2_attr), - Linear( - in_features=84, - out_features=num_classes, - weight_attr=fc_w3_attr, - bias_attr=fc_b3_attr), - Softmax()) - - def forward(self, inputs): - x = self.features(inputs) - x = fluid.layers.flatten(x, 1) - x = self.fc(x) - return x - - -class TestImperativeQatChannelWise(unittest.TestCase): - """ - QAT = quantization-aware training - """ - - def test_qat_save(self): - imperative_qat = ImperativeQuantAware( - weight_quantize_type='channel_wise_abs_max', - activation_quantize_type='moving_average_abs_max') - - with fluid.dygraph.guard(): - lenet = ImperativeLenet() - imperative_qat.quantize(lenet) - adam = AdamOptimizer( - learning_rate=0.001, parameter_list=lenet.parameters()) - train_reader = paddle.batch( - paddle.dataset.mnist.train(), batch_size=32, drop_last=True) - test_reader = paddle.batch( - paddle.dataset.mnist.test(), batch_size=32) - - epoch_num = 1 - for epoch in range(epoch_num): - lenet.train() - for batch_id, data in enumerate(train_reader()): - x_data = np.array([x[0].reshape(1, 28, 28) - for x in data]).astype('float32') - y_data = np.array( - [x[1] for x in data]).astype('int64').reshape(-1, 1) - - img = fluid.dygraph.to_variable(x_data) - label = fluid.dygraph.to_variable(y_data) - out = lenet(img) - acc = fluid.layers.accuracy(out, label) - loss = fluid.layers.cross_entropy(out, label) - avg_loss = fluid.layers.mean(loss) - avg_loss.backward() - adam.minimize(avg_loss) - lenet.clear_gradients() - if batch_id % 100 == 0: - _logger.info( - "Train | At epoch {} step {}: loss = {:}, acc= {:}". - format(epoch, batch_id, - avg_loss.numpy(), acc.numpy())) - - lenet.eval() - for batch_id, data in enumerate(test_reader()): - x_data = np.array([x[0].reshape(1, 28, 28) - for x in data]).astype('float32') - y_data = np.array( - [x[1] for x in data]).astype('int64').reshape(-1, 1) - - img = fluid.dygraph.to_variable(x_data) - label = fluid.dygraph.to_variable(y_data) - - out = lenet(img) - acc_top1 = fluid.layers.accuracy( - input=out, label=label, k=1) - acc_top5 = fluid.layers.accuracy( - input=out, label=label, k=5) - - if batch_id % 100 == 0: - _logger.info( - "Test | At epoch {} step {}: acc1 = {:}, acc5 = {:}". 
- format(epoch, batch_id, - acc_top1.numpy(), acc_top5.numpy())) - - # save weights - model_dict = lenet.state_dict() - fluid.save_dygraph(model_dict, "save_temp") - - # test the correctness of `paddle.jit.save` - data = next(test_reader()) - test_data = np.array([x[0].reshape(1, 28, 28) - for x in data]).astype('float32') - test_img = fluid.dygraph.to_variable(test_data) - lenet.eval() - before_save = lenet(test_img) - - # save inference quantized model - path = "./qat_infer_model/mnist" - save_dir = "./qat_infer_model" - paddle.jit.save( - layer=lenet, - path=path, - input_spec=[ - paddle.static.InputSpec( - shape=[None, 1, 28, 28], dtype='float32') - ]) - - if core.is_compiled_with_cuda(): - place = core.CUDAPlace(0) - else: - place = core.CPUPlace() - exe = fluid.Executor(place) - [inference_program, feed_target_names, - fetch_targets] = fluid.io.load_inference_model( - dirname=save_dir, - executor=exe, - model_filename="mnist" + INFER_MODEL_SUFFIX, - params_filename="mnist" + INFER_PARAMS_SUFFIX) - after_save, = exe.run(inference_program, - feed={feed_target_names[0]: test_data}, - fetch_list=fetch_targets) - - self.assertTrue( - np.allclose(after_save, before_save.numpy()), - msg='Failed to save the inference quantized model.') - - def test_qat_acc(self): - def _build_static_lenet(main, startup, is_test=False, seed=1000): - with fluid.unique_name.guard(): - with fluid.program_guard(main, startup): - main.random_seed = seed - startup.random_seed = seed - img = fluid.layers.data( - name='image', shape=[1, 28, 28], dtype='float32') - label = fluid.layers.data( - name='label', shape=[1], dtype='int64') - prediction = StaticLenet(img) - if not is_test: - loss = fluid.layers.cross_entropy( - input=prediction, label=label) - avg_loss = fluid.layers.mean(loss) - else: - avg_loss = prediction - return img, label, avg_loss - - reader = paddle.batch( - paddle.dataset.mnist.test(), batch_size=32, drop_last=True) - weight_quantize_type = 'channel_wise_abs_max' - activation_quant_type = 'moving_average_abs_max' - param_init_map = {} - seed = 1000 - lr = 0.001 - - # imperative train - _logger.info( - "--------------------------dynamic graph qat--------------------------" - ) - imperative_qat = ImperativeQuantAware( - weight_quantize_type=weight_quantize_type, - activation_quantize_type=activation_quant_type) - - with fluid.dygraph.guard(): - np.random.seed(seed) - fluid.default_main_program().random_seed = seed - fluid.default_startup_program().random_seed = seed - lenet = ImperativeLenet() - fixed_state = {} - for name, param in lenet.named_parameters(): - p_shape = param.numpy().shape - p_value = param.numpy() - if name.endswith("bias"): - value = np.zeros_like(p_value).astype('float32') - else: - value = np.random.normal( - loc=0.0, scale=0.01, size=np.product(p_shape)).reshape( - p_shape).astype('float32') - fixed_state[name] = value - param_init_map[param.name] = value - lenet.set_dict(fixed_state) - - imperative_qat.quantize(lenet) - adam = AdamOptimizer( - learning_rate=lr, parameter_list=lenet.parameters()) - dynamic_loss_rec = [] - lenet.train() - for batch_id, data in enumerate(reader()): - x_data = np.array([x[0].reshape(1, 28, 28) - for x in data]).astype('float32') - y_data = np.array( - [x[1] for x in data]).astype('int64').reshape(-1, 1) - - img = fluid.dygraph.to_variable(x_data) - label = fluid.dygraph.to_variable(y_data) - - out = lenet(img) - loss = fluid.layers.cross_entropy(out, label) - avg_loss = fluid.layers.mean(loss) - avg_loss.backward() - adam.minimize(avg_loss) - 
lenet.clear_gradients() - dynamic_loss_rec.append(avg_loss.numpy()[0]) - if batch_id % 100 == 0: - _logger.info('{}: {}'.format('loss', avg_loss.numpy())) - - paddle.jit.save( - layer=lenet, - path="./dynamic_mnist/model", - input_spec=[ - paddle.static.InputSpec( - shape=[None, 1, 28, 28], dtype='float32') - ]) - - # static graph train - _logger.info( - "--------------------------static graph qat--------------------------" - ) - static_loss_rec = [] - if core.is_compiled_with_cuda(): - place = core.CUDAPlace(0) - else: - place = core.CPUPlace() - exe = fluid.Executor(place) - - main = fluid.Program() - infer = fluid.Program() - startup = fluid.Program() - static_img, static_label, static_loss = _build_static_lenet( - main, startup, False, seed) - infer_img, _, infer_pre = _build_static_lenet(infer, startup, True, - seed) - with fluid.unique_name.guard(): - with fluid.program_guard(main, startup): - opt = AdamOptimizer(learning_rate=lr) - opt.minimize(static_loss) - - scope = core.Scope() - with fluid.scope_guard(scope): - exe.run(startup) - for param in main.all_parameters(): - param_tensor = scope.var(param.name).get_tensor() - param_tensor.set(param_init_map[param.name], place) - - main_graph = IrGraph(core.Graph(main.desc), for_test=False) - infer_graph = IrGraph(core.Graph(infer.desc), for_test=True) - transform_pass = QuantizationTransformPass( - scope=scope, - place=place, - activation_quantize_type=activation_quant_type, - weight_quantize_type=weight_quantize_type, - quantizable_op_type=['conv2d', 'depthwise_conv2d', 'mul']) - transform_pass.apply(main_graph) - transform_pass.apply(infer_graph) - build_strategy = fluid.BuildStrategy() - build_strategy.fuse_all_reduce_ops = False - binary = fluid.CompiledProgram(main_graph.graph).with_data_parallel( - loss_name=static_loss.name, build_strategy=build_strategy) - - feeder = fluid.DataFeeder( - feed_list=[static_img, static_label], place=place) - with fluid.scope_guard(scope): - for batch_id, data in enumerate(reader()): - loss_v, = exe.run(binary, - feed=feeder.feed(data), - fetch_list=[static_loss]) - static_loss_rec.append(loss_v[0]) - if batch_id % 100 == 0: - _logger.info('{}: {}'.format('loss', loss_v)) - - save_program = infer_graph.to_program() - with fluid.scope_guard(scope): - fluid.io.save_inference_model("./static_mnist", [infer_img.name], - [infer_pre], exe, save_program) - rtol = 1e-05 - atol = 1e-08 - for i, (loss_d, - loss_s) in enumerate(zip(dynamic_loss_rec, static_loss_rec)): - diff = np.abs(loss_d - loss_s) - if diff > (atol + rtol * np.abs(loss_s)): - _logger.info( - "diff({}) at {}, dynamic loss = {}, static loss = {}". 
- format(diff, i, loss_d, loss_s)) - break +class TestImperativeQatChannelWise(TestImperativeQat): + def set_vars(self): + self.weight_quantize_type = 'channel_wise_abs_max' + self.activation_quantize_type = 'moving_average_abs_max' + print('weight_quantize_type', self.weight_quantize_type) - self.assertTrue( - np.allclose( - np.array(dynamic_loss_rec), - np.array(static_loss_rec), - rtol=rtol, - atol=atol, - equal_nan=True), - msg='Failed to do the imperative qat.') + def test_qat(self): + self.run_qat_save() if __name__ == '__main__': diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py index bda02769cea861908d90fa7ec44f64a696593987..bb24f941c625e5d0d659c9dc9b9b70b6f199023a 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py @@ -31,6 +31,8 @@ from paddle.nn import Linear, Conv2D, Softmax, BatchNorm from paddle.fluid.dygraph.nn import Pool2D from paddle.fluid.log_helper import get_logger +from imperative_test_utils import fix_model_dict, train_lenet, ImperativeLenetWithSkipQuant + os.environ["CPU_NUM"] = "1" if core.is_compiled_with_cuda(): fluid.set_flags({"FLAGS_cudnn_deterministic": True}) @@ -39,144 +41,33 @@ _logger = get_logger( __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s') -class ImperativeLenet(fluid.dygraph.Layer): - def __init__(self, num_classes=10): - super(ImperativeLenet, self).__init__() - conv2d_w1_attr = fluid.ParamAttr(name="conv2d_w_1") - conv2d_w2_attr = fluid.ParamAttr(name="conv2d_w_2") - fc_w1_attr = fluid.ParamAttr(name="fc_w_1") - fc_w2_attr = fluid.ParamAttr(name="fc_w_2") - fc_w3_attr = fluid.ParamAttr(name="fc_w_3") - conv2d_b1_attr = fluid.ParamAttr(name="conv2d_b_1") - conv2d_b2_attr = fluid.ParamAttr(name="conv2d_b_2") - fc_b1_attr = fluid.ParamAttr(name="fc_b_1") - fc_b2_attr = fluid.ParamAttr(name="fc_b_2") - fc_b3_attr = fluid.ParamAttr(name="fc_b_3") - self.conv2d_0 = Conv2D( - in_channels=1, - out_channels=6, - kernel_size=3, - stride=1, - padding=1, - weight_attr=conv2d_w1_attr, - bias_attr=conv2d_b1_attr) - self.conv2d_0.skip_quant = True - - self.batch_norm_0 = BatchNorm(6) - self.relu_0 = ReLU() - self.pool2d_0 = Pool2D(pool_size=2, pool_type='max', pool_stride=2) - self.conv2d_1 = Conv2D( - in_channels=6, - out_channels=16, - kernel_size=5, - stride=1, - padding=0, - weight_attr=conv2d_w2_attr, - bias_attr=conv2d_b2_attr) - self.conv2d_1.skip_quant = False - - self.batch_norm_1 = BatchNorm(16) - self.relu6_0 = ReLU6() - self.pool2d_1 = Pool2D(pool_size=2, pool_type='max', pool_stride=2) - self.linear_0 = Linear( - in_features=400, - out_features=120, - weight_attr=fc_w1_attr, - bias_attr=fc_b1_attr) - self.linear_0.skip_quant = True - - self.leaky_relu_0 = LeakyReLU() - self.linear_1 = Linear( - in_features=120, - out_features=84, - weight_attr=fc_w2_attr, - bias_attr=fc_b2_attr) - self.linear_1.skip_quant = False - - self.sigmoid_0 = Sigmoid() - self.linear_2 = Linear( - in_features=84, - out_features=num_classes, - weight_attr=fc_w3_attr, - bias_attr=fc_b3_attr) - self.linear_2.skip_quant = False - self.softmax_0 = Softmax() - - def forward(self, inputs): - x = self.conv2d_0(inputs) - x = self.batch_norm_0(x) - x = self.relu_0(x) - x = self.pool2d_0(x) - x = self.conv2d_1(x) - x = self.batch_norm_1(x) - x = self.relu6_0(x) - x = self.pool2d_1(x) - - x = fluid.layers.flatten(x, 1) - - x = self.linear_0(x) - x = self.leaky_relu_0(x) - 
x = self.linear_1(x) - x = self.sigmoid_0(x) - x = self.linear_2(x) - x = self.softmax_0(x) - - return x - - class TestImperativeOutSclae(unittest.TestCase): def test_out_scale_acc(self): seed = 1000 lr = 0.1 - imperative_out_scale = ImperativeQuantAware() + qat = ImperativeQuantAware() np.random.seed(seed) reader = paddle.batch( paddle.dataset.mnist.test(), batch_size=512, drop_last=True) - lenet = ImperativeLenet() - fixed_state = {} - for name, param in lenet.named_parameters(): - p_shape = param.numpy().shape - p_value = param.numpy() - if name.endswith("bias"): - value = np.zeros_like(p_value).astype('float32') - else: - value = np.random.normal( - loc=0.0, scale=0.01, - size=np.product(p_shape)).reshape(p_shape).astype('float32') - fixed_state[name] = value - lenet.set_dict(fixed_state) - imperative_out_scale.quantize(lenet) + + lenet = ImperativeLenetWithSkipQuant() + lenet = fix_model_dict(lenet) + qat.quantize(lenet) + adam = AdamOptimizer( learning_rate=lr, parameter_list=lenet.parameters()) dynamic_loss_rec = [] lenet.train() - for batch_id, data in enumerate(reader()): - x_data = np.array([x[0].reshape(1, 28, 28) - for x in data]).astype('float32') - y_data = np.array( - [x[1] for x in data]).astype('int64').reshape(-1, 1) - - img = fluid.dygraph.to_variable(x_data) - label = fluid.dygraph.to_variable(y_data) - - out = lenet(img) - loss = fluid.layers.cross_entropy(out, label) - avg_loss = fluid.layers.mean(loss) - avg_loss.backward() - adam.minimize(avg_loss) - lenet.clear_gradients() - dynamic_loss_rec.append(avg_loss.numpy()[0]) - if batch_id % 100 == 0: - _logger.info('{}: {}'.format('loss', avg_loss.numpy())) + loss_list = train_lenet(lenet, reader, adam) lenet.eval() path = "./save_dynamic_quant_infer_model/lenet" save_dir = "./save_dynamic_quant_infer_model" - imperative_out_scale.save_quantized_model( + qat.save_quantized_model( layer=lenet, path=path, input_spec=[ diff --git a/python/paddle/fluid/contrib/sparsity/__init__.py b/python/paddle/fluid/contrib/sparsity/__init__.py index f78ea1b1c38b85b04ab0e09757ec4d6eea5eaf4d..b36a79b8ca865e4b982ef0315f023525045fc069 100644 --- a/python/paddle/fluid/contrib/sparsity/__init__.py +++ b/python/paddle/fluid/contrib/sparsity/__init__.py @@ -15,7 +15,22 @@ from __future__ import print_function -from . import utils -from .utils import * +from .utils import calculate_density +from .utils import check_mask_1d +from .utils import get_mask_1d +from .utils import check_mask_2d +from .utils import get_mask_2d_greedy +from .utils import get_mask_2d_best +from .utils import create_mask +from .utils import check_sparsity +from .utils import MaskAlgo +from .utils import CheckMethod +from .asp import decorate, prune_model +from .asp import set_excluded_layers, reset_excluded_layers -__all__ = utils.__all__ +__all__ = [ + 'calculate_density', 'check_mask_1d', 'get_mask_1d', 'check_mask_2d', + 'get_mask_2d_greedy', 'get_mask_2d_best', 'create_mask', 'check_sparsity', + 'MaskAlgo', 'CheckMethod', 'decorate', 'prune_model', 'set_excluded_layers', + 'reset_excluded_layers' +] diff --git a/python/paddle/fluid/contrib/sparsity/asp.py b/python/paddle/fluid/contrib/sparsity/asp.py new file mode 100644 index 0000000000000000000000000000000000000000..fbabc73f37bce5ca42c292572ee4082e1706bea2 --- /dev/null +++ b/python/paddle/fluid/contrib/sparsity/asp.py @@ -0,0 +1,497 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 NVIDIA Corporation. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Functions for Auto SParsity (ASP) training and inference. +""" + +import copy +import numpy as np +import paddle +from paddle.fluid import framework, global_scope, program_guard, layers +from paddle.fluid.initializer import ConstantInitializer +from paddle.fluid.contrib import sparsity +from paddle.fluid import core + +__all__ = [ + 'decorate', 'prune_model', 'set_excluded_layers', 'reset_excluded_layers' +] + + +def set_excluded_layers(main_program, param_names): + r""" + Set parameter name of layers which would not be pruned as sparse weights. + + Args: + main_program (Program, optional): Program with model definition and its parameters. + param_names (list): A list contains names of parameters. + """ + ASPHelper.set_excluded_layers( + main_program=main_program, param_names=param_names) + + +def reset_excluded_layers(main_program=None): + r""" + Reset exculded layers setting corresponding to :attr:`main_program`. If :attr:`main_program` + is None, then all configurations of excluded_layers would be cleaned. + + Args: + main_program (Program, optional): Program with model definition and its parameters. + """ + ASPHelper.reset_excluded_layers(main_program=main_program) + + +def decorate(optimizer): + r""" + Wrap the given optimizer as a OptimizerWithSparsityGuarantee, + which would insert necessary ops for ASP workflows when calling minimize() + + Args: + optimizer (Optimizer): A Optimizer used for training. + Returns: + OptimizerWithSparsityGuarantee: A wrapper for ASP to decorate `minimize` function of the given optimizer. + Examples: + .. code-block:: python + + import paddle.fluid as fluid + from paddle.fluid.contrib import sparsity + + main_program = fluid.Program() + startup_program = fluid.Program() + + with fluid.program_guard(main_program, startup_program): + input_data = fluid.layers.data(name='data', shape=[None, 128]) + label = fluid.layers.data(name='label', shape=[None, 10]) + hidden = fluid.layers.fc(input=input_data, num_flatten_dims=-1, size=32, act=None) + prob = fluid.layers.fc(input=hidden, num_flatten_dims=-1, size=10, act=None) + loss = fluid.layers.mean(fluid.layers.square_error_cost(prob, label)) + + optimizer = fluid.optimizer.SGD(learning_rate=0.1) + + optimizer = sparsity.decorate(optimizer) + optimizer.minimize(loss, startup_program) + + # When apply distributed training with Fleet + import paddle.distributed.fleet as fleet + + optimizer = fluid.optimizer.SGD(learning_rate=0.1) + optimizer = sparsity.decorate(optimizer) # Need to be called before `fleet.distributed_optimizer` + optimizer = fleet.distributed_optimizer(optimizer) + optimizer.minimize(loss, startup_program) + """ + return ASPHelper.decorate(optimizer) + + +def prune_model(place, + main_program=None, + n=2, + m=4, + func_name=sparsity.MaskAlgo.MASK_1D, + with_mask=True): + r""" + Pruning parameters of supported layers in :attr:`main_program` via + specified mask generation function given by :attr:`func_name`. 
This + function supports both training and inference controlled by :attr:`with_mask`. + If :attr:`with_mask` is True, it would also prune parameter related ASP mask Variables, + else only prunes parameters. + + *Note*: If parameters are supported and in FP16, please set :attr:`n`=2, :attr:`m`=4, + if they in FP32, then :attr:`n`=1, :attr:`m`=2` to further enable Sparse Tensor Core acceleration. + + *Note*: If calling this function with :attr:`with_mask`, it should call `OptimizerWithSparsityGuarantee.minimize` + and initialization (`exe.run(startup_program`)) before (For successfully obtain mask Variable). + Typically set `with_mask` as true for training (have called `OptimizerWithSparsityGuarantee.minimize`) and false for + inference only. To obtain OptimizerWithSparsityGuarantee, please see `sparsity.decoreate()`. + + Args: + place (fluid.CPUPlace()|fluid.CUDAPlace(N)): Device place for pruned parameter and mask Variables, and N means the GPU's id. It should be the same as created instance of Executor. + main_program (Program, optional): Program with model definition and its parameters. Default is `paddle.static.default_main_program() + n (int): n of `n:m` sparse pattern. + m (int): m of `n:m` sparse pattern. + func_name (MaskAlgo, optional): The function name to generate spase mask. Default is `MaskAlgo.MASK_1D`. All options please refer to `MaskAlgo`. + with_mask (bool, optional): To prune mask Variables related to parameters or not. Ture is purning also, False is not. Defalut is True. + Returns: + dictionary: A dictionary with key: `parameter name` (string) and value: its corresponding mask Variable. + Examples: + .. code-block:: python + + import paddle.fluid as fluid + from paddle.fluid.contrib import sparsity + + main_program = fluid.Program() + startup_program = fluid.Program() + + place = fluid.CUDAPlace(0) + + with fluid.program_guard(main_program, startup_program): + input_data = fluid.layers.data(name='data', shape=[None, 128]) + label = fluid.layers.data(name='label', shape=[None, 10]) + hidden = fluid.layers.fc(input=input_data, num_flatten_dims=-1, size=32, act=None) + prob = fluid.layers.fc(input=hidden, num_flatten_dims=-1, size=10, act=None) + loss = fluid.layers.mean(fluid.layers.square_error_cost(prob, label)) + + optimizer = decorate(fluid.optimizer.SGD(learning_rate=0.1)) + optimizer.minimize(optimizer, loss, main_program, startup_program) + + exe = fluid.Executor(place) + exe.run(startup_program) + + # Must call `exe.run(startup_program)` first before calling `sparsity.prune_model` + sparsity.prune_model(place, main_program, func_name=sparsity.MaskAlgo.MASK_2D_BEST) + """ + return ASPHelper.prune_model( + place=place, + main_program=main_program, + n=n, + m=m, + func_name=func_name, + with_mask=with_mask) + + +class ProgramASPInfo(object): + r""" + ProgramASPInfo is a container to keep ASP relevant information of Pragrom. It contains three inner-variables: + 1. __mask_vars (Dictionary): Key is parameter's name and vaule is its corresponding sparse mask Variable object, which is created by `ASPHelper.create_mask_variables`. + 2. __masks (Dictionary): Key is parameter's name and vaule is its corressponding sparse mask Numpy array, which is created by `ASPHelper.prune_model`. + 3. __excluded_layers (List): It stores name of layers which should not involve into ASP workflow. 
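# Illustrative sketch (not from the patch's tests): what one of the stored masks
# looks like and how pruning applies it, using the utilities exported from
# paddle.fluid.contrib.sparsity in this change. Assumes numpy and the patched
# paddle are importable; the weight values are made up.
import numpy as np
from paddle.fluid.contrib import sparsity

w = np.random.rand(8, 16).astype('float32')              # a dense "fc" weight
# Same transpose trick as ASPHelper.prune_model: prune along the m dimension of W.
mask = sparsity.create_mask(w.T, func_name=sparsity.MaskAlgo.MASK_1D, n=2, m=4).T
w_pruned = np.multiply(w, mask)                          # 2 of every 4 weights -> 0

print(sparsity.calculate_density(w_pruned))              # 0.5
print(sparsity.check_sparsity(w_pruned.T, n=2, m=4,
                              func_name=sparsity.CheckMethod.CHECK_1D))  # True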
+ """ + + def __init__(self): + self.__mask_vars = {} + self.__masks = {} + self.__excluded_layers = [] + + def update_mask_vars(self, param_name, var): + self.__mask_vars[param_name] = var + + def update_masks(self, param_name, var): + self.__masks[param_name] = var + + def update_excluded_layers(self, param_names): + self.__excluded_layers.extend(copy.deepcopy(param_names)) + + def reset_excluded_layers(self): + self.__excluded_layers = [] + + @property + def mask_vars(self): + return self.__mask_vars + + @property + def masks(self): + return self.__masks + + @property + def excluded_layers(self): + return self.__excluded_layers + + +class ASPHelper(object): + r""" + ASPHelper is a collection of Auto SParsity (ASP) functions to enable + + 1. training models with weights in 2:4 sparse pattern on FP16 or 1:2 sparse pattern on FP32 from scratch. + 2. pruning well-trained models into 2:4 sparse pattern on FP16 or 1:2 sparse pattern on FP32 for fine-tuning. + """ + + MASK_APPENDDED_NAME = '_asp_mask' + SUPPORTED_LAYERS = {'fc': 'w_0', 'linear': 'w_0', 'conv2d': 'w_0'} + + __asp_info = {} + + @classmethod + def set_excluded_layers(cls, main_program, param_names): + r""" + This is the implementation of `sparsity.set_excluded_layers`, for details please see explanation in `sparsity.set_excluded_layers`. + """ + asp_info = cls._get_program_asp_info(main_program) + asp_info.update_excluded_layers(param_names) + + @classmethod + def reset_excluded_layers(cls, main_program=None): + r""" + This is the implementation of `sparsity.reset_excluded_layers`, for details please see explanation in `sparsity.reset_excluded_layers`. + """ + if main_program is None: + for asp_info in cls.__asp_info: + asp_info.reset_excluded_layers() + else: + cls._get_program_asp_info(main_program).reset_excluded_layers() + + @staticmethod + def decorate(optimizer): + r""" + This is the implementation of `sparsity.decorate`, for details please see explanation in `sparsity.decorate`. + """ + return OptimizerWithSparsityGuarantee(optimizer) + + @classmethod + def prune_model(cls, + place, + main_program=None, + n=2, + m=4, + func_name=sparsity.MaskAlgo.MASK_1D, + with_mask=True): + r""" + This is the implementation of `sparsity.prune_model`, for details please see explanation in `sparsity.prune_model`. + """ + checked_func_name = sparsity.CheckMethod.get_checking_method(func_name) + + if main_program is None: + main_program = paddle.static.default_main_program() + + asp_info = cls._get_program_asp_info(main_program) + for param in main_program.global_block().all_parameters(): + if ASPHelper._is_supported_layer(main_program, param.name): + weight_tensor = global_scope().find_var(param.name).get_tensor() + weight_nparray = np.array(weight_tensor) + + # The double transpose ops here make sure pruning direction consistent with cuSparseLt. + # SPMMA in cuSparseLt: D = (AxB) + C, where matrix A (mxk) is sparse matrix. + # cuSparseLt would prune matrix A along k dimension. + # In sparse training, layer weight matriices is viewed sparse matrix A, so + # the math fomula should be 'Act(WX + b)'. However, default fomula in PaddlePaddle + # is 'Act(XW + b)'. For enabling SPMMA, weights and inputs should be transposed + # for computing, Act( (W^T X^T)^T + b). Therefore, we have to prune alog k dimension + # of W^T, which is m dimension of W. Moreove, all mask generating functions in + # sparsity/utils is row-major pruning. That is the reason we have to transpose weight + # matrices beforce invoking create_mask. 
Then we transpose the result maks to make + # sure its shape to be the same as the input weight. + weight_sparse_mask = sparsity.create_mask( + weight_nparray.T, func_name=func_name, n=n, m=m).T + weight_pruned_nparray = np.multiply(weight_nparray, + weight_sparse_mask) + weight_tensor.set(weight_pruned_nparray, place) + assert sparsity.check_sparsity(weight_pruned_nparray.T, n=n, m=m, func_name=checked_func_name), \ + 'Pruning {} weight matrix failure!!!'.format(param.name) + if with_mask: + weight_mask_param = global_scope().find_var( + ASPHelper._get_mask_name(param.name)) + assert weight_mask_param is not None, \ + 'Cannot find {} variable, please call ASPHelper.minimize' \ + ' and initialization (exe.run(startup_program)) first!'.format(ASPHelper._get_mask_name(param.name)) + weight_mask_tensor = weight_mask_param.get_tensor() + weight_mask_tensor.set(weight_sparse_mask, place) + asp_info.update_masks(param.name, weight_sparse_mask) + return asp_info.masks.copy() + + @staticmethod + def _get_mask_name(param_name): + r""" + Return mask name by given parameter name :attr:`param_name`. + + Args: + param_name (string): The name of parameter. + Returns: + string: The mask name of :attr:`param_name`. + """ + return param_name + ASPHelper.MASK_APPENDDED_NAME + + @staticmethod + def _get_not_ASP_relevant_vars(main_program): + r""" + Get all parameters's Variables in :attr:`main_program` but excluded ASP mask Variables. + + Args: + main_program (Program): Program with model definition and its parameters. + Returns: + list: A list of parameter Variables in :attr:`main_program` (excluded ASP mask Variables). + """ + var_list = [] + for param in main_program.global_block().all_parameters(): + if ASPHelper.MASK_APPENDDED_NAME not in param.name: + var_list.append(param) + return var_list + + @classmethod + def _get_program_asp_info(cls, main_program): + if not main_program in cls.__asp_info: + cls.__asp_info[main_program] = ProgramASPInfo() + return cls.__asp_info[main_program] + + @classmethod + def _is_supported_layer(cls, main_program, param_name): + r""" + Verify if given :attr:`param_name` is supported by ASP. + + Args: + param_name (string): The name of parameter. + Returns: + bool: True if it is supported, else False. + Examples: + .. code-block:: python + + import paddle.fluid as fluid + from paddle.fluid.contrib.sparsity.asp import ASPHelper + + main_program = fluid.Program() + startup_program = fluid.Program() + + with fluid.program_guard(main_program, startup_program): + input_data = fluid.layers.data(name='data', shape=[None, 128]) + fc = fluid.layers.fc(input=input_data, num_flatten_dims=-1, size=32, act=None) + + for param in main_program.global_block().all_parameters(): + ASPHelper._is_supported_layer(main_program, param.name) + # fc_0.w_0 -> True + # fc_0.b_0 -> False + """ + if ASPHelper.MASK_APPENDDED_NAME in param_name: + return False + + for layer in cls._get_program_asp_info(main_program).excluded_layers: + if layer in param_name: + return False + + for name in ASPHelper.SUPPORTED_LAYERS: + if name in param_name and \ + ASPHelper.SUPPORTED_LAYERS[name] in param_name: + return True + return False + + @classmethod + def _minimize(cls, + optimizer, + loss, + main_program=None, + startup_program=None, + parameter_list=None, + no_grad_set=None): + r""" + This function is a decorator of `minimize` function in `Optimizer`. + There are three steps: + + 1. Call :attr:`optimizer`.minimize(:attr:`loss`) + 2. Create sparse mask Tensors according to supported layers in :attr:`main_program`. 
+ 3. Insert masking ops in the end of parameters update. + + *Note*: Please use `ASP.decorate` instead when applying distributed training with `Fleet`. + (Due to there is a invisiable graphs optimization in `Fleet.minimize()` which make training graph + cannot be modified anymore.) + + Args: + optimizer (Optimizer): A Optimizer used for training. + loss (Variable): A Variable containing the value to minimize. + main_program (Program, optional): Program with model definition and its parameters. Default is `loss.block.program`. + startup_program (Program, optional): Program for initializing parameters in `parameter_list`. Default is `paddle.static.default_startup_program()`. + parameter_list (Iterable, optional): Iterable of `Variable` or `Variable.name` to update to minimize `loss`. The default value is None, at this time all parameters will be updated. + no_grad_set (set, optional): Set of `Variable or `Variable.name` that don't need to be updated. The default value is None. + Returns: + list: operators from :attr:`optimizer`.minimize(:attr:`loss`). + list: pairs of parameters and their gradients. + """ + if main_program is None: + main_program = loss.block.program + + if startup_program is None: + startup_program = paddle.static.default_startup_program() + + optimizer_ops, params_and_grads = optimizer.minimize( + loss, startup_program, parameter_list, no_grad_set=no_grad_set) + cls._create_mask_variables(main_program, startup_program, + params_and_grads) + cls._insert_sparse_mask_ops(main_program, params_and_grads) + return optimizer_ops, params_and_grads + + @classmethod + def _create_mask_variables(cls, main_program, startup_program, + params_and_grads): + r""" + Create sparse mask Tensors according to supported layers in :attr:`main_program`. + This function is called in second step of `ASPHelper._minimize` + + Args: + main_program (Program): Program with model definition and its parameters. + startup_program (Program): Program for initializing parameters. + params_and_grads (list): Variable pairs of parameters and their gradients. + """ + asp_info = cls._get_program_asp_info(main_program) + with program_guard(main_program, startup_program): + for param_and_grad in params_and_grads: + if ASPHelper._is_supported_layer(main_program, + param_and_grad[0].name): + mask_param = layers.create_parameter( + name=param_and_grad[0].name + + ASPHelper.MASK_APPENDDED_NAME, + shape=param_and_grad[0].shape, + dtype=param_and_grad[0].dtype, + default_initializer=ConstantInitializer(value=1.0)) + mask_param.stop_gradient = True + mask_param.trainable = False + asp_info.update_mask_vars(param_and_grad[0].name, + mask_param) + + @classmethod + def _insert_sparse_mask_ops(cls, main_program, param_grads): + r""" + Insert masking ops in the end of parameters update. + This function is called in third step of `ASPHelper._minimize` + + Args: + main_program (Program): Program with model definition and its parameters. + params_and_grads (list): Variable pairs of parameters and their gradients. 
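# Minimal numpy sketch (values hypothetical) of what the appended elementwise_mul
# achieves: after every optimizer update the parameter is multiplied by its frozen
# mask, so positions pruned by `prune_model` stay exactly zero during training.
import numpy as np

w = np.random.rand(4, 4)
mask = np.tile([1.0, 0.0, 1.0, 0.0], (4, 1))   # a fixed 2:4-style mask
grad = np.random.rand(4, 4)
lr = 0.1

w = w - lr * grad      # ordinary update; pruned entries may become non-zero again
w = w * mask           # the inserted op: Out = X * Y, with Y = the ASP mask
assert np.all(w[:, 1] == 0) and np.all(w[:, 3] == 0)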
+ """ + block = main_program.global_block() + asp_info = cls._get_program_asp_info(main_program) + for param_grad in param_grads: + if param_grad[0].name in asp_info.mask_vars: + block.append_op( + type='elementwise_mul', + inputs={ + "X": param_grad[0], + 'Y': asp_info.mask_vars[param_grad[0].name] + }, + outputs={'Out': param_grad[0]}, + attrs={'axis': -1, + 'use_mkldnn': False}) + + +class OptimizerWithSparsityGuarantee(object): + r""" + OptimizerWithSparsityGuarantee is a wrapper to decorate `minimize` function of given optimizer by `_minimize` of ASPHelper. + The decorated `minimize` function would do three things (exactly same as `ASPHelper._minimize`): + 1. Call `minimize` function of given optimizer. + 2. Call `ASPHelper._create_mask_variables` to create mask Variables. + 3. Call `ASPHelper._insert_sparse_mask_ops` to insert weight masking ops in the end of `loss`'s Program. + """ + + def __init__(self, optimizer): + self._optimizer = optimizer + self._learning_rate = optimizer._learning_rate + self._learning_rate_map = optimizer._learning_rate_map + + def minimize(self, + loss, + startup_program=None, + parameter_list=None, + no_grad_set=None): + r""" + This function is to call `ASPHelper.minimize()` and return its return + + Args: + loss (Variable): A Variable containing the value to minimize. + startup_program (Program, optional): Program for initializing parameters in `parameter_list`. Default is `paddle.static.default_startup_program()`. + parameter_list (Iterable, optional): Iterable of `Variable` or `Variable.name` to update to minimize `loss`. The default value is None, at this time all parameters will be updated. + no_grad_set (set, optional): Set of `Variable or `Variable.name` that don't need to be updated. The default value is None. + Returns: + list: operators from :attr:`optimizer`.minimize(:attr:`loss`). + list: pairs of parameters and their gradients. + """ + return ASPHelper._minimize( + self._optimizer, + loss, + startup_program=startup_program, + parameter_list=parameter_list, + no_grad_set=no_grad_set) diff --git a/python/paddle/fluid/contrib/sparsity/utils.py b/python/paddle/fluid/contrib/sparsity/utils.py index f1108c327407ff65596156668edd864715291894..bb030cbac1beaf814987e5cf6a21075ff21d58ee 100644 --- a/python/paddle/fluid/contrib/sparsity/utils.py +++ b/python/paddle/fluid/contrib/sparsity/utils.py @@ -27,7 +27,7 @@ from itertools import permutations import threading __all__ = [ - 'density', 'check_mask_1d', 'get_mask_1d', 'check_mask_2d', + 'calculate_density', 'check_mask_1d', 'get_mask_1d', 'check_mask_2d', 'get_mask_2d_greedy', 'get_mask_2d_best', 'create_mask', 'check_sparsity', 'MaskAlgo', 'CheckMethod' ] @@ -75,7 +75,7 @@ class CheckMethod(Enum): CheckMethod.get_checking_method(MaskAlgo.MASK_2D_BEST) # CheckMethod.CHECK_2D """ - assert type(mask_algo) == MaskAlgo, \ + assert isinstance(mask_algo, MaskAlgo), \ "mask_algo should be MaskAlgo type" if mask_algo == MaskAlgo.MASK_1D: return CheckMethod.CHECK_1D @@ -83,7 +83,7 @@ class CheckMethod(Enum): return CheckMethod.CHECK_2D -def density(x): +def calculate_density(x): r""" Return the density of the input tensor. @@ -99,15 +99,15 @@ def density(x): x = np.array([[0, 1, 3, 0], [1, 1, 0, 1]]) - sparsity.density(x) # 0.625 + sparsity.calculate_density(x) # 0.625 """ x_flattened = x.flatten() return float(np.nonzero(x_flattened)[0].size) / x_flattened.size -def reshape_1d(mat, m): +def _reshape_1d(mat, m): r""" - Reshape the input matrix to shape (-1, m). + Reshape the input 2D matrix to shape (-1, m). 
If the second dimension of :attr:`mat` is not a multiples of :attr:`m`, then this function would pad the remainder with 0 before reshaping. @@ -116,11 +116,13 @@ def reshape_1d(mat, m): remainder = mat.shape[1] % m Args: - mat (nparray): The input matrix. + mat (nparray): The input 2D matrix. m (int): The second dimension of reshaped matrix. Returns: tuple: A pair of the reshaped and padded matrix and the shape of padded matrix (non-reshaping). """ + assert len(mat.shape) == 2, "The input mat should be a 2D matrix!" + remainder = mat.shape[1] % m if mat.shape[1] % m > 0: mat_padded = np.zeros((mat.shape[0], mat.shape[1] + (m - remainder))) @@ -165,9 +167,9 @@ def check_mask_1d(mat, n, m): sparsity.check_mask_1d(x, 2, 4) # True """ if len(mat.shape) <= 1: - mat_flattern, shape = reshape_1d(mat.reshape(1, mat.shape[0]), m) + mat_flattern, shape = _reshape_1d(mat.reshape(1, mat.shape[0]), m) else: - mat_flattern, shape = reshape_1d(mat, m) + mat_flattern, shape = _reshape_1d(mat, m) for sub_mat in mat_flattern: if np.nonzero(sub_mat)[0].size > (m - n): @@ -202,7 +204,7 @@ def get_mask_1d(mat, n, m): # [0, 1, 0, 1]]) sparsity.check_mask_1d(mask, 2, 4) # True """ - mat_flattern, shape = reshape_1d(mat, m) + mat_flattern, shape = _reshape_1d(mat, m) mask_flattern = np.ones_like(mat_flattern) mask = np.ones_like(mat) @@ -215,9 +217,9 @@ def get_mask_1d(mat, n, m): return mask -def reshape_2d(mat, m): +def _reshape_2d(mat, m): r""" - Reshape the input matrix to shape (-1, :math:`m \times m`). + Reshape the input 2D matrix to shape (-1, :math:`m \times m`). In each dimension of :attr:`mat`, if it is not a multiples of :attr:`m`, then this function would pad the remainder with 0 before reshaping. @@ -227,11 +229,13 @@ def reshape_2d(mat, m): remainder_1 = mat.shape[1] % m Args: - mat (nparray): The input matrix. + mat (nparray): The input 2D matrix. m (int): The square root of second dimension of reshaped matrix. Returns: tuple: A pair of the reshaped and padded matrix and the shape of padded matrix (non-reshaping). """ + assert len(mat.shape) == 2, "The input mat should be a 2D matrix!" + remainder_0 = mat.shape[0] % m remainder_1 = mat.shape[1] % m @@ -297,7 +301,7 @@ def check_mask_2d(mat, n, m): [1, 1, 0, 1]]) sparsity.check_mask_2d(x, 2, 4) # True """ - mat_padded, shape = reshape_2d(mat, m) + mat_padded, shape = _reshape_2d(mat, m) for sub_mat in mat_padded: sub_mask = np.absolute(np.squeeze(sub_mat.reshape(m, m))) > 0 if (np.sum(np.sum(sub_mask, axis=1) > (m-n)) != 0) and \ @@ -338,7 +342,7 @@ def get_mask_2d_greedy(mat, n, m): # [0. 1. 1. 0.]]) sparsity.check_mask_2d(mask, 2, 4) # True """ - mat_padded, shape = reshape_2d(mat, m) + mat_padded, shape = _reshape_2d(mat, m) mask_padded = np.zeros_like(mat_padded).reshape(-1, m, m) for idx in range(len(mat_padded)): @@ -372,11 +376,11 @@ def get_mask_2d_greedy(mat, n, m): return mask[:mat.shape[0], :mat.shape[1]] -valid_2d_patterns_lock = threading.Lock() -valid_2d_patterns = {} +_valid_2d_patterns_lock = threading.Lock() +_valid_2d_patterns = {} -def compute_valid_2d_patterns(n, m): +def _compute_valid_2d_patterns(n, m): r""" Compute all vaild 2D `n:m` sparse patterns. @@ -389,12 +393,12 @@ def compute_valid_2d_patterns(n, m): Returns: dictionary: A dictionary with key: *m_n* (string) and value: all vaild 2D `n:m` sparse patterns. 
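# Illustrative sketch (numpy + itertools, not the implementation itself) of the
# pattern counting behind this helper: a 1D 2:4 pattern keeps 2 of 4 positions,
# giving C(4, 2) = 6 distinct row patterns, and a valid 2D 4x4 pattern stacks
# such rows so that every column also keeps exactly 2 non-zeros.
import numpy as np
from itertools import permutations

row_patterns = set(permutations([1, 1, 0, 0]))
print(len(row_patterns))               # 6

pattern = np.array([[1, 1, 0, 0],
                    [0, 0, 1, 1],
                    [1, 1, 0, 0],
                    [0, 0, 1, 1]])     # one valid 2D 2:4 pattern
print(pattern.sum(axis=0), pattern.sum(axis=1))   # [2 2 2 2] [2 2 2 2]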
""" - global valid_2d_patterns_lock - global valid_2d_patterns + global _valid_2d_patterns_lock + global _valid_2d_patterns valid_key = '{}_{}'.format(m, n) - if valid_key in valid_2d_patterns: - return valid_2d_patterns[valid_key] + if valid_key in _valid_2d_patterns: + return _valid_2d_patterns[valid_key] else: patterns = np.zeros(m) patterns[:n] = 1 @@ -407,9 +411,9 @@ def compute_valid_2d_patterns(n, m): valid_patterns = np.empty((valid.shape[0], m, m)) valid_patterns[:] = patterns[valid[:]] - valid_2d_patterns_lock.acquire() - valid_2d_patterns[valid_key] = valid_patterns - valid_2d_patterns_lock.release() + _valid_2d_patterns_lock.acquire() + _valid_2d_patterns[valid_key] = valid_patterns + _valid_2d_patterns_lock.release() return valid_patterns @@ -446,9 +450,9 @@ def get_mask_2d_best(mat, n, m): print("L1 norm of `greedy` sparse matrix", np.multiply(mat, mask_greedy).sum()) # 56 print("L1 norm of `best` sparse matrix", np.multiply(mat, mask_best).sum()) # 61 """ - patterns = compute_valid_2d_patterns(n, m) + patterns = _compute_valid_2d_patterns(n, m) - mat_flattern, shape = reshape_2d(mat, m) + mat_flattern, shape = _reshape_2d(mat, m) mask_flattern = np.ones_like(mat_flattern).reshape(-1, m, m) pmax = np.argmax( np.matmul(mat_flattern, patterns.reshape(patterns.shape[0], m * m).T), @@ -504,30 +508,25 @@ def create_mask(tensor, func_name=MaskAlgo.MASK_1D, n=2, m=4): dtype = tensor.dtype t = tensor.astype(float) - assert type(func_name) == MaskAlgo, \ + assert isinstance(func_name, MaskAlgo), \ "func_name argumet of create_mask is only accepted as type MaskAlgo. " \ "But got {}".format(type(func_name)) func = getattr(sys.modules[__name__], func_name.value, None) if len(shape) == 1: t = t.reshape(1, shape[0]) - mask = func(t, n=n, m=m) - return mask.reshape(shape).astype(dtype) elif len(shape) == 2: t = t.reshape(shape[0], shape[1]) - mask = func(t, n=n, m=m) - return mask.reshape(shape).astype(dtype) elif len(shape) == 3: t = t.reshape(shape[0] * shape[1], shape[2]) - mask = func(t, n=n, m=m) - return mask.reshape(shape).astype(dtype) # 4d-tensor conv (out, in, h, w) -> (out, in*h*w) in GemmConvKernel Op elif len(shape) == 4: t = t.reshape(shape[0], shape[1] * shape[2] * shape[3]) - mask = func(t, n=n, m=m) - return mask.reshape(shape).astype(dtype) else: - assert True, "The dimension of input tensor is not supported in create_mask, " \ - "Only dimension < 4 is supported but got {}".format(len(shape)) + raise ValueError("The dimension of input tensor is not supported in create_mask, " \ + "Only dimension < 4 is supported but got {}".format(len(shape))) + + mask = func(t, n=n, m=m) + return mask.reshape(shape).astype(dtype) def check_sparsity(tensor, func_name=CheckMethod.CHECK_1D, n=2, m=4): @@ -569,19 +568,15 @@ def check_sparsity(tensor, func_name=CheckMethod.CHECK_1D, n=2, m=4): func = getattr(sys.modules[__name__], func_name.value, None) if len(shape) == 1: t = t.reshape(1, shape[0]) - return func(t, n=n, m=m) elif len(shape) == 2: t = t.reshape(shape[0], shape[1]) - return func(t, n=n, m=m) elif len(shape) == 3: t = t.reshape(shape[0] * shape[1], shape[2]) - return func(t, n=n, m=m) # 4d-tensor conv (out, in, h, w) -> (out, in*h*w) in GemmConvKernel Op elif len(shape) == 4: t = t.reshape(shape[0], shape[1] * shape[2] * shape[3]) - return func(t, n=n, m=m) else: - assert True, "The dimension of input tensor is not supported in check_sparsity, " \ - "Only dimension < 4 is supported but got {}".format(len(shape)) + raise ValueError("The dimension of input tensor is not supported 
in create_mask, " \ + "Only dimension < 4 is supported but got {}".format(len(shape))) - return False + return func(t, n=n, m=m) diff --git a/python/paddle/fluid/core.py b/python/paddle/fluid/core.py index 9e931ad40c57a511a18e67a669b8b06b34db57d8..7886b6b3f7ad7c40d199862ab2056c9567486a9a 100644 --- a/python/paddle/fluid/core.py +++ b/python/paddle/fluid/core.py @@ -269,14 +269,6 @@ if avx_supported(): from .core_avx import _dygraph_debug_level from .core_avx import _switch_tracer from .core_avx import _set_paddle_lib_path - from .core_avx import _save_static_dict - from .core_avx import _load_static_dict - from .core_avx import _save_dygraph_dict - from .core_avx import _load_dygraph_dict - from .core_avx import _save_lod_tensor - from .core_avx import _load_lod_tensor - from .core_avx import _save_selected_rows - from .core_avx import _load_selected_rows from .core_avx import _create_loaded_parameter from .core_avx import _cuda_synchronize from .core_avx import _promote_types_if_complex_exists @@ -328,14 +320,6 @@ if load_noavx: from .core_noavx import _dygraph_debug_level from .core_noavx import _switch_tracer from .core_noavx import _set_paddle_lib_path - from .core_noavx import _save_static_dict - from .core_noavx import _load_static_dict - from .core_noavx import _save_dygraph_dict - from .core_noavx import _load_dygraph_dict - from .core_noavx import _save_lod_tensor - from .core_noavx import _load_lod_tensor - from .core_noavx import _save_selected_rows - from .core_noavx import _load_selected_rows from .core_noavx import _create_loaded_parameter from .core_noavx import _cuda_synchronize from .core_noavx import _promote_types_if_complex_exists diff --git a/python/paddle/fluid/dataloader/collate.py b/python/paddle/fluid/dataloader/collate.py index 8e90b308b393ed04b295eb80ab6272c12f807391..eaaf4cc2d9f62b491b6ee8f9db2c93f0db45a673 100644 --- a/python/paddle/fluid/dataloader/collate.py +++ b/python/paddle/fluid/dataloader/collate.py @@ -78,7 +78,6 @@ def default_collate_fn(batch): raise TypeError("batch data con only contains: tensor, numpy.ndarray, " "dict, list, number, but got {}".format(type(sample))) - return outputs def default_convert_fn(batch): diff --git a/python/paddle/fluid/dataloader/worker.py b/python/paddle/fluid/dataloader/worker.py index 26bd1f06e12e84a9da24fad76091de4e71d3add4..409f55efebc8a67647f82d06368d788b8fa3b7af 100644 --- a/python/paddle/fluid/dataloader/worker.py +++ b/python/paddle/fluid/dataloader/worker.py @@ -168,6 +168,89 @@ class _WorkerException(object): raise self.exc_type(msg) +# The function `_generate_states` is adapted from `numpy.random.SeedSequence` +# from https://github.com/numpy/numpy/blob/main/numpy/random/bit_generator.pyx +# Here is the copyright: + +# SeedSequence is derived from Melissa E. O'Neill's C++11 `std::seed_seq` +# implementation, as it has a lot of nice properties that we want. +# https://gist.github.com/imneme/540829265469e673d045 +# http://www.pcg-random.org/posts/developing-a-seed_seq-alternative.html + +# The MIT License (MIT) + +# Copyright (c) 2015 Melissa E. 
O'Neill +# Copyright (c) 2019 NumPy Developers +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +INIT_A = 0x43b0d7e5 +MULT_A = 0x931e8875 +INIT_B = 0x8b51f9dd +MULT_B = 0x58f38ded +MIX_MULT_L = 0xca01f9dd +MIX_MULT_R = 0x4973f715 +XSHIFT = np.dtype(np.uint32).itemsize * 8 // 2 +MASK32 = 0xFFFFFFFF + + +def _generate_states(base_seed=0, worker_id=0): + # init hash constant + hash_const_A = INIT_A + hash_const_B = INIT_B + + def hash(value): + nonlocal hash_const_A + value = (value ^ hash_const_A) & MASK32 + hash_const_A = (hash_const_A * MULT_A) & MASK32 + value = (value * hash_const_A) & MASK32 + value = (value ^ (value >> XSHIFT)) & MASK32 + return value + + def mix(x, y): + result_x = (MIX_MULT_L * x) & MASK32 + result_y = (MIX_MULT_R * y) & MASK32 + result = (result_x - result_y) & MASK32 + result = (result ^ (result >> XSHIFT)) & MASK32 + return result + + # init entropys with based_seed and worker_id and calculate pool + entropys = [worker_id, base_seed & MASK32, base_seed >> 32, 0] + pool = [hash(entropy) for entropy in entropys] + + # mix all bits together + for i in range(len(pool)): + for j in range(len(pool)): + if i != j: + pool[j] = mix(pool[j], hash(pool[i])) + + states = [] + for p in pool: + state = (p ^ hash_const_B) & MASK32 + hash_const_B = (hash_const_B * MULT_B) & MASK32 + state = (state * hash_const_B) & MASK32 + state = (state ^ (state >> XSHIFT)) & MASK32 + states.append(state) + + return states + + def _worker_loop(dataset, dataset_kind, indices_queue, out_queue, done_event, auto_collate_batch, collate_fn, init_fn, worker_id, num_workers, use_shared_memory): @@ -181,6 +264,15 @@ def _worker_loop(dataset, dataset_kind, indices_queue, out_queue, done_event, # set signal handler core._set_process_signal_handler() + # set different numpy seed for each worker + try: + import numpy as np + import time + except ImportError: + pass + else: + np.random.seed(_generate_states(int(time.time()), worker_id)) + global _worker_info _worker_info = WorkerInfo( id=worker_id, num_workers=num_workers, dataset=dataset) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py index 6eea883226b36b7c1804214fc5e4c9a306c53d01..7910e7a38558ce8768853d5c975fc1d332d2fa67 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py @@ -16,6 +16,7 @@ from __future__ import print_function import numpy as np import six 
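# Illustrative sketch of the per-worker seeding above: the same base seed is
# hashed together with each worker id into a distinct 4-word state, so numpy's
# RNG differs across DataLoader workers. Assumes the patched module import path.
import numpy as np
from paddle.fluid.dataloader.worker import _generate_states

base_seed = 1234567890
for worker_id in range(3):
    print(worker_id, _generate_states(base_seed, worker_id))   # three different states

np.random.seed(_generate_states(base_seed, 0))   # numpy accepts a list of uint32 words
print(np.random.rand(2))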
+import paddle from paddle.fluid import framework, backward, core from paddle.fluid.dygraph import layers from paddle.fluid.dygraph.base import switch_to_static_graph @@ -220,23 +221,15 @@ class PartialProgramLayer(layers.Layer): def forward(self, inputs): in_vars, out_vars, tmp_scope_vec = self._prepare(inputs) - framework._dygraph_tracer().trace_op( - type='run_program', - inputs={ - 'X': valid_vars(in_vars), - 'Params': valid_vars(self._params) - }, - outputs={ - 'Out': valid_vars(out_vars), - 'OutScope': tmp_scope_vec, - 'DOut': valid_vars(self._double_grads) - }, - attrs={ - 'global_block': self.program.desc.block(0), - 'start_op_index': 0, - 'end_op_index': self._infer_program.desc.block(0).op_size(), - 'is_test': not self.training - }) + + attrs = ('global_block', self.program.desc.block(0), 'start_op_index', + 0, 'end_op_index', self._infer_program.desc.block(0).op_size(), + 'is_test', not self.training) + core.ops.run_program( + valid_vars(in_vars), + valid_vars(self._params), + valid_vars(out_vars), tmp_scope_vec, + valid_vars(self._double_grads), *attrs) restored_nest_out = self._restore_out(out_vars) return self._remove_no_value(restored_nest_out) @@ -263,8 +256,19 @@ class PartialProgramLayer(layers.Layer): place=framework._current_expected_place(), zero_copy=True) elif isinstance(value, core.VarBase): - var = value - var.name = self._inputs[i].desc.name() + value.name = self._inputs[i].desc.name() + if value.stop_gradient: + # NOTE(Aurelius84): If var is on CPUPlace, it will be transformed multi times + # into CUDAPlace when it's as input of multi Ops. so we move it in advance + # to avoid this problem. + var = paddle.to_tensor( + value, + dtype=value.dtype, + place=framework._current_expected_place(), + stop_gradient=True) + var.name = value.name + else: + var = value else: continue input_vars.append(var) diff --git a/python/paddle/fluid/dygraph/layer_hooks.py b/python/paddle/fluid/dygraph/layer_hooks.py new file mode 100644 index 0000000000000000000000000000000000000000..e9c6867cb7c8ba720637cc98bafb2e016fadbaa3 --- /dev/null +++ b/python/paddle/fluid/dygraph/layer_hooks.py @@ -0,0 +1,74 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import six +import warnings + +from paddle.fluid.framework import default_main_program, in_dygraph_mode + + +class LayerOpsRecoder: + """ + Record generated operators information in nn.Layer. + """ + + def __init__(self, start=-1, end=-1, ops=None, is_valid=False, hooks=None): + self.start = start + self.end = end + self.ops = ops + self.is_valid = is_valid + self.hooks = hooks + + +def record_program_ops_pre_hook(layer, inputs): + """ + A pre-hook to mark op numbers before enter layer.forward. 
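# Illustrative sketch (hypothetical attribute name and value) of the mechanism
# these hooks implement: the pre-hook records how many ops the program already
# holds, and the post-hook stamps every op appended by this layer's forward with
# the layer's customized attributes. The hooks are registered by the developer
# API Layer._set_op_attrs added later in this change; assumes the patched paddle
# running in static graph mode.
import paddle

paddle.enable_static()
main_prog = paddle.static.default_main_program()

linear = paddle.nn.Linear(4, 2)
linear._set_op_attrs({"custom_tag": "demo"})       # hypothetical attribute

x = paddle.static.data(name="x", shape=[None, 4], dtype="float32")
start = len(main_prog.current_block().ops)
y = linear(x)                                      # forward appends matmul/add ops
for op in main_prog.current_block().ops[start:]:
    print(op.type, op.attr("custom_tag"))          # every new op carries "demo"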
+ """ + if not in_dygraph_mode(): + if layer._op_recorder.start < 0: + layer._op_recorder.start = len(default_main_program().current_block( + ).ops) + layer._op_recorder.is_valid = True + else: + layer._op_recorder.is_valid = False + warnings.warn( + "{} has recorded the op information before. Please check whether you call this layer twice.". + format(layer._full_name)) + + return None + + +def set_op_customized_attrs_post_hook(layer, inputs, outputs): + """ + A post-hook to append customized attributes into all operators generated in current layer. + """ + if not in_dygraph_mode() and layer._op_recorder.is_valid: + + start = layer._op_recorder.start + end = len(default_main_program().current_block().ops) + assert (start >= 0 and end >= start) + ops = default_main_program().current_block().ops[start:end] + + layer._op_recorder.end = end + layer._op_recorder.ops = ops + + for op in ops: + for attr_name, val in six.iteritems(layer._customized_attrs): + op._set_attr(attr_name, val) + + # remove pre-hook and post-hook + for hook_helper in layer._op_recorder.hooks: + hook_helper.remove() + + return None diff --git a/python/paddle/fluid/dygraph/layer_object_helper.py b/python/paddle/fluid/dygraph/layer_object_helper.py index a904f80639752a7538289a1ce7c2abf378ccc634..5bf5eda19a5d0c2e3aab1515ddb8855ba2db5017 100644 --- a/python/paddle/fluid/dygraph/layer_object_helper.py +++ b/python/paddle/fluid/dygraph/layer_object_helper.py @@ -16,7 +16,7 @@ from __future__ import print_function import copy import six -from ..framework import Parameter, in_dygraph_mode +from ..framework import Parameter, in_dygraph_mode, _global_flags from ..param_attr import ParamAttr from .. import core from six.moves import zip @@ -158,7 +158,7 @@ class LayerObjectHelper(LayerHelperBase): if (use_cudnn is not None) and use_cudnn: act['use_cudnn'] = use_cudnn - use_mkldnn = core.globals()["FLAGS_use_mkldnn"] + use_mkldnn = _global_flags()["FLAGS_use_mkldnn"] if (use_mkldnn is not None) and use_mkldnn: act['use_mkldnn'] = use_mkldnn act_type = act.pop('type') diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py index ecf6be1a0224af6b89033d6e279e3c2cfe3ef192..cb7666b353db793ce45ccd23b51cb993313f820b 100644 --- a/python/paddle/fluid/dygraph/layers.py +++ b/python/paddle/fluid/dygraph/layers.py @@ -30,6 +30,7 @@ from . import parallel_helper from .. import unique_name from paddle.fluid import core from .layer_object_helper import LayerObjectHelper +from .layer_hooks import record_program_ops_pre_hook, set_op_customized_attrs_post_hook, LayerOpsRecoder from .base import program_desc_tracing_guard, param_guard from paddle.fluid import framework from ..param_attr import ParamAttr @@ -113,6 +114,10 @@ class Layer(core.Layer): self._sub_layers = collections.OrderedDict() self._loaddict_holder = collections.OrderedDict() + # Record generated op_descs in this layer + self._op_recorder = LayerOpsRecoder(ops=[], hooks=[]) + self._customized_attrs = {} + self._forward_pre_hooks = collections.OrderedDict() self._forward_post_hooks = collections.OrderedDict() @@ -665,7 +670,7 @@ class Layer(core.Layer): Parameters: prefix(str, optional): Prefix to prepend to all parameter names. Default: ''. include_self(bool, optional): Whether include the Layer itself. Default: False. - layers_set(set, optioanl): The set to record duplicate sublayers. Default: None. + layers_set(set, optional): The set to record duplicate sublayers. Default: None. 
Yields: (string, Layer): Tuple of name and Layer @@ -1028,6 +1033,54 @@ class Layer(core.Layer): self._parameters[name] = parameter return parameter + def _set_op_attrs(self, attrs): + """ + Add customized attribute while append_op. In case of quantization, we want to save + some attributes into op_desc while exporting inference model by @to_static. + + Arguments: + attrs(dict): customized attributes that will be added into op_descs. + + NOTE: The interface is only exposed to developers. + """ + + def is_already_registered(is_pre_hook): + layers_hooks = self._forward_pre_hooks if is_pre_hook else self._forward_post_hooks + candidate_hook = record_program_ops_pre_hook if is_pre_hook else set_op_customized_attrs_post_hook + + already_registed = False + if layers_hooks: + last_key = next(reversed(layers_hooks)) + already_registed = (layers_hooks[last_key] == candidate_hook) + + return already_registed + + if not isinstance(attrs, dict): + raise TypeError("attrs should be type(dict), but received {}". + format(type(attrs).__name__)) + + # NOTE: Overwrite behavior for same key. + self._customized_attrs.update(attrs) + + if not is_already_registered(is_pre_hook=True): + pre_hook_helper = self.register_forward_pre_hook( + record_program_ops_pre_hook) + assert len(self._op_recorder.hooks) == 0 + self._op_recorder.hooks = [pre_hook_helper] + + # manually register post_hook to ensure it is inserted into the head. + if not is_already_registered(is_pre_hook=False): + post_hook_helper = self.register_forward_post_hook( + set_op_customized_attrs_post_hook) + if len(self._forward_post_hooks) > 1: + self._forward_post_hooks.move_to_end( + post_hook_helper._hook_id, last=False) + + assert len(self._op_recorder.hooks) == 1 + + # hooks that need to be removed once we finish executing them. + self._op_recorder.hooks.append(post_hook_helper) + def __getstate__(self): return self.__dict__ diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index ce728f1121dfdbc04dc123c3976539ec143fc9d6..9d6e637342a7b6626c3f3b958c91fdee1a9c4eac 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -21,7 +21,7 @@ from ..layers import utils from ..layers import nn as F from .. import dygraph_utils from . 
import layers -from ..framework import Variable, in_dygraph_mode, OpProtoHolder, Parameter, _dygraph_tracer, _varbase_creator, default_main_program +from ..framework import Variable, in_dygraph_mode, OpProtoHolder, Parameter, _dygraph_tracer, _varbase_creator, default_main_program, _global_flags from ..data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype from ..param_attr import ParamAttr from ..initializer import Normal, Constant, NumpyArrayInitializer @@ -188,7 +188,7 @@ class Conv2D(layers.Layer): if not isinstance(use_cudnn, bool): raise ValueError("use_cudnn should be True or False") self._use_cudnn = use_cudnn - self._use_mkldnn = core.globals()["FLAGS_use_mkldnn"] + self._use_mkldnn = _global_flags()["FLAGS_use_mkldnn"] self._filter_size = filter_size self._num_filters = num_filters self._param_attr = param_attr @@ -837,7 +837,7 @@ class Pool2D(layers.Layer): if not isinstance(use_cudnn, bool): raise ValueError("use_cudnn should be True or False") - self._use_mkldnn = core.globals()["FLAGS_use_mkldnn"] + self._use_mkldnn = _global_flags()["FLAGS_use_mkldnn"] if data_format not in ["NCHW", "NHWC"]: raise ValueError( @@ -966,7 +966,7 @@ class Linear(layers.Layer): self.bias = self.create_parameter( shape=[output_dim], attr=bias_attr, dtype=dtype, is_bias=True) - self._use_mkldnn = core.globals()["FLAGS_use_mkldnn"] + self._use_mkldnn = _global_flags()["FLAGS_use_mkldnn"] def forward(self, input): if in_dygraph_mode(): @@ -1268,7 +1268,7 @@ class BatchNorm(layers.Layer): self._param_attr = param_attr self._bias_attr = bias_attr self._act = act - self._use_mkldnn = core.globals()["FLAGS_use_mkldnn"] + self._use_mkldnn = _global_flags()["FLAGS_use_mkldnn"] assert bias_attr is not False, "bias_attr should not be False in batch_norm." 
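# Illustrative usage sketch for the _set_op_attrs / _op_recorder machinery added
# above; it mirrors the new dygraph_to_static/test_op_attr.py later in this patch.
# The attribute name "quant_bits" is an arbitrary example, not a predefined Paddle
# attribute.
import paddle
from paddle.static import InputSpec

net = paddle.nn.Linear(16, 8)
net._set_op_attrs({"quant_bits": 8})   # developer-only interface; registers the pre/post hooks

# Tracing with to_static runs the hooks, which copy the attribute into every
# op_desc this layer generates (matmul + elementwise_add for a Linear layer).
static_net = paddle.jit.to_static(net, input_spec=[InputSpec([10, 16])])
program = static_net.forward.concrete_program.main_program
for op in program.global_block().ops:
    if op.type in ('matmul', 'elementwise_add'):
        print(op.type, op.desc.attr('quant_bits'))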
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index bffeaf2c6c973ec7ff928eae6bac1c3fa2af5f50..695c91fea819f57a12ec760d3eeb4965da6c23de 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -72,6 +72,7 @@ _dygraph_tracer_ = None _global_expected_place_ = None _current_device = None global_prog_seed = 0 +_global_flags_ = core.globals() def require_version(min_version, max_version=None): @@ -286,6 +287,10 @@ def _dygraph_tracer(): return _dygraph_tracer_ +def _global_flags(): + return _global_flags_ + + def _current_expected_place(): global _global_expected_place_ if _global_expected_place_ is None: @@ -2142,7 +2147,7 @@ class Operator(object): """ assert isinstance( skip_op_callstack, bool - ), "skip_op_callstack parameter's type is error, expect bool, received %s".format( + ), "skip_op_callstack parameter's type is error, expect bool, received {}".format( type(skip_op_callstack)) outputs_str = "{" for i in range(0, len(self.output_names)): @@ -2550,7 +2555,7 @@ class Block(object): """ assert isinstance( skip_op_callstack, bool - ), "skip_op_callstack parameter's type is error, expect bool, received %s".format( + ), "skip_op_callstack parameter's type is error, expect bool, received {}".format( type(skip_op_callstack)) block_str = "{ // block " block_str += "{}\n".format(self.idx) @@ -4259,7 +4264,7 @@ class Program(object): """ assert isinstance( skip_op_callstack, bool - ), "skip_op_callstack parameter's type is error, expect bool, received %s".format( + ), "skip_op_callstack parameter's type is error, expect bool, received {}".format( type(skip_op_callstack)) program_str = "" for block in self.blocks: @@ -5833,8 +5838,8 @@ def set_flags(flags): if not isinstance(flags, dict): raise TypeError('flags in set_flags should be a dict') for key, value in flags.items(): - if core.globals().is_public(key): - core.globals()[key] = value + if _global_flags().is_public(key): + _global_flags()[key] = value else: raise ValueError( "Flag %s cannot set its value through this function." % (key)) @@ -5863,8 +5868,8 @@ def get_flags(flags): flags_value = {} if isinstance(flags, (list, tuple)): for key in flags: - if (core.globals().is_public(key)): - value = core.globals()[key] + if (_global_flags().is_public(key)): + value = _global_flags()[key] temp = {key: value} flags_value.update(temp) else: @@ -5872,8 +5877,8 @@ def get_flags(flags): 'Flag %s cannot get its value through this function.' 
% (key)) elif isinstance(flags, str): - if (core.globals().is_public(flags)): - value = core.globals()[flags] + if (_global_flags().is_public(flags)): + value = _global_flags()[flags] temp = {flags: value} flags_value.update(temp) else: diff --git a/python/paddle/fluid/incubate/fleet/utils/hdfs.py b/python/paddle/fluid/incubate/fleet/utils/hdfs.py index 94a371ae3fb5bb57836701a02f64f9ad01a49d7e..fe09692531ad3a80e06022cd02d84fe23f7bc6ae 100644 --- a/python/paddle/fluid/incubate/fleet/utils/hdfs.py +++ b/python/paddle/fluid/incubate/fleet/utils/hdfs.py @@ -268,8 +268,7 @@ class HDFSClient(FS): fs_src_path)) if self.is_exist(fs_dst_path): - raise FSFileExistsError("{} exists already".format( - fs_src_path, fs_dst_path, fs_dst_path)) + raise FSFileExistsError("{} exists already".format(fs_dst_path)) return self._try_mv(fs_src_path, fs_dst_path) diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index 5b2010f340958059a37e3564e6d5f228be7c5a7b..54ba5f22e53d6cfc21af87278ecba3849e715c91 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -152,6 +152,7 @@ class ConstantInitializer(Initializer): out_dtype = var.dtype out_var = var + # fill constant should set the "str_value" to preserve precision op = block.append_op( type="fill_constant", outputs={"Out": out_var}, @@ -159,6 +160,7 @@ class ConstantInitializer(Initializer): "shape": var.shape, "dtype": int(out_dtype), "value": float(self._value), + 'str_value': str(float(self._value)), 'force_cpu': self._force_cpu }, stop_gradient=True) diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 30a0b4053e6ffa26274e9eb99c311e4138cb815c..2d3578c6c104b0753d881164b5feb15659986a1d 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -23,6 +23,7 @@ import pickle import contextlib from functools import reduce import sys +from io import BytesIO import numpy as np import math @@ -71,6 +72,52 @@ _logger = get_logger( __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s') +class _open_buffer(object): + def __init__(self, buffer): + self.buffer = buffer + + def __enter__(self): + return self.buffer + + +class _buffer_reader(_open_buffer): + def __init__(self, buffer): + super(_buffer_reader, self).__init__(buffer) + self.initial_tell = self.buffer.tell() + + def __exit__(self, *args): + # `args[0]` is type of exception. When the `read` is abnormal, the file pointer returns to the initial position. + if args[0] is not None: + self.buffer.seek(self.initial_tell) + + +class _buffer_writer(_open_buffer): + def __exit__(self, *args): + self.buffer.flush() + + +def _is_file_path(path): + return isinstance(path, str) + + +def _open_file_buffer(path_or_buffer, mode): + + if _is_file_path(path_or_buffer): + return open(path_or_buffer, mode) + else: + if 'w' in mode: + return _buffer_writer(path_or_buffer) + elif 'r' in mode: + return _buffer_reader(path_or_buffer) + else: + raise ValueError("Expected 'r' or 'w' in mode but got {}".format( + mode)) + + +def _is_memory_buffer(buffer): + return isinstance(buffer, BytesIO) + + def is_parameter(var): """ Check whether the given variable is an instance of Parameter. 
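# Small sketch (not part of the patch) of what the _open_file_buffer helpers above
# enable: the pickle-based save/load path can now target an in-memory io.BytesIO
# object as well as a file path.  Assumes the private helpers are in scope.
import pickle
from io import BytesIO

buf = BytesIO()
with _open_file_buffer(buf, 'wb') as f:     # a _buffer_writer; flushes on exit
    pickle.dump({'fc_0.w_0': [0.1, 0.2]}, f, protocol=2)

buf.seek(0)
with _open_file_buffer(buf, 'rb') as f:     # a _buffer_reader; rewinds if reading fails
    restored = pickle.load(f)
assert restored == {'fc_0.w_0': [0.1, 0.2]}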
@@ -1776,14 +1823,16 @@ def _legacy_save(param_dict, model_path, protocol=2): param_dict = {name: get_tensor(param_dict[name]) for name in param_dict} # When value of dict is lager than 4GB ,there is a Bug on 'MAC python3' - if sys.platform == 'darwin' and sys.version_info.major == 3: + if _is_file_path( + model_path + ) and sys.platform == 'darwin' and sys.version_info.major == 3: pickle_bytes = pickle.dumps(param_dict, protocol=protocol) with open(model_path, 'wb') as f: max_bytes = 2**30 for i in range(0, len(pickle_bytes), max_bytes): f.write(pickle_bytes[i:i + max_bytes]) else: - with open(model_path, 'wb') as f: + with _open_file_buffer(model_path, 'wb') as f: pickle.dump(param_dict, f, protocol=protocol) diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py index db556913384785e1f11ba05dcc524ef1f1de92ab..2b677c11e9d96b7f412f3bdbb0322d4bcf98c472 100644 --- a/python/paddle/fluid/layer_helper.py +++ b/python/paddle/fluid/layer_helper.py @@ -17,7 +17,7 @@ from __future__ import print_function import copy import six -from .framework import Parameter, dtype_is_floating, in_dygraph_mode, OpProtoHolder +from .framework import Parameter, dtype_is_floating, in_dygraph_mode, OpProtoHolder, _global_flags from . import unique_name from paddle.fluid.initializer import Constant, Xavier from .param_attr import ParamAttr @@ -148,7 +148,7 @@ class LayerHelper(LayerHelperBase): if 'use_cudnn' in self.kwargs and self.kwargs.get('use_cudnn'): act['use_cudnn'] = self.kwargs.get('use_cudnn') use_mkldnn = self.kwargs.get( - 'use_mkldnn', core.globals().get("FLAGS_use_mkldnn", False)) + 'use_mkldnn', _global_flags().get("FLAGS_use_mkldnn", False)) if use_mkldnn: act['use_mkldnn'] = use_mkldnn act_type = act.pop('type') diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index cf4abc207bd7541676ee7ad3c1ad5f9c67a67619..604bcc0e277769e074b3c531fa364a62b8078e49 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -1139,7 +1139,9 @@ def yolo_box(x, downsample_ratio, clip_bbox=True, name=None, - scale_x_y=1.): + scale_x_y=1., + iou_aware=False, + iou_aware_factor=0.5): """ ${comment} @@ -1156,6 +1158,8 @@ def yolo_box(x, name (string): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` + iou_aware (bool): ${iou_aware_comment} + iou_aware_factor (float): ${iou_aware_factor_comment} Returns: Variable: A 3-D tensor with shape [N, M, 4], the coordinates of boxes, @@ -1204,6 +1208,8 @@ def yolo_box(x, "downsample_ratio": downsample_ratio, "clip_bbox": clip_bbox, "scale_x_y": scale_x_y, + "iou_aware": iou_aware, + "iou_aware_factor": iou_aware_factor } helper.append_op( diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index ee08cb8654ec135a4efde03704ee0911d0fe18e1..e02edb72ce1f71a2f877c9b210f51567cb0a2607 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -26,7 +26,7 @@ import six import paddle from ..layer_helper import LayerHelper from ..initializer import Normal, Constant, NumpyArrayInitializer -from ..framework import Variable, OpProtoHolder, in_dygraph_mode, dygraph_only, _dygraph_tracer, default_main_program, _varbase_creator, static_only +from ..framework import Variable, OpProtoHolder, in_dygraph_mode, dygraph_only, _dygraph_tracer, default_main_program, _varbase_creator, static_only, _global_flags from .. 
import dygraph_utils from ..param_attr import ParamAttr from .layer_function_generator import autodoc, templatedoc, _generate_doc_string_ @@ -9500,7 +9500,7 @@ def relu6(x, threshold=6.0, name=None): outputs={'Out': out}, attrs={ 'threshold': threshold, - 'use_mkldnn': core.globals()["FLAGS_use_mkldnn"] + 'use_mkldnn': _global_flags()["FLAGS_use_mkldnn"] }) return out @@ -11093,7 +11093,7 @@ def strided_slice(input, axes, starts, ends, strides): Then: result = [ [2], ] Args: - input (Variable): An N-D ``Tensor`` or ``LoDTensor`` . The data type is ``float32``, ``float64``, ``int32`` or ``int64``. + input (Variable): An N-D ``Tensor`` or ``LoDTensor`` . The data type is ``bool``, ``float32``, ``float64``, ``int32`` or ``int64``. axes (list|tuple): The data type is ``int32`` . Axes that `starts` and `ends` apply to. It's optional. If it is not provides, it will be treated as :math:`[0,1,...,len(starts)-1]`. starts (list|tuple|Variable): The data type is ``int32`` . If ``starts`` is a list or tuple, the elements of @@ -11144,7 +11144,7 @@ def strided_slice(input, axes, starts, ends, strides): helper = LayerHelper('strided_slice', **locals()) check_variable_and_dtype(input, 'input', - ['float32', 'float64', 'int32', 'int64'], + ['bool', 'float32', 'float64', 'int32', 'int64'], 'strided_slice') check_type(axes, 'axes', (list, tuple), 'strided_slice') check_type(starts, 'starts', (list, tuple, Variable), 'strided_slice') @@ -11569,7 +11569,7 @@ Examples: axis=axis, act=act, op_name='elementwise_add', - use_mkldnn=core.globals()["FLAGS_use_mkldnn"]) + use_mkldnn=_global_flags()["FLAGS_use_mkldnn"]) return _elementwise_op(LayerHelper('elementwise_add', **locals())) diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index 813f671e02070659da0ee83734b594b5163499a1..eee4bbbb1d54fe24696f2829d3a1cbd397125532 100755 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -53,6 +53,7 @@ __unary_func__ = [ 'round', 'reciprocal', 'square', + 'lgamma', ] __inplace_unary_func__ = [ @@ -396,6 +397,19 @@ Examples: """) +add_sample_code(globals()["lgamma"], r""" +Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.lgamma(x) + print(out) + # [1.31452441, 1.76149750, 2.25271273, 1.09579802] + +""") + add_sample_code(globals()["softplus"], r""" Examples: .. 
code-block:: python diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index a62217c628c302b7bb9e3f4aa62ee6fad17cb6bc..65cc745dbab8830be3c47a5a36fb7955d0bc4ea2 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -36,11 +36,32 @@ from paddle.utils import deprecated from .utils import check_shape __all__ = [ - 'create_tensor', 'create_parameter', 'create_global_var', 'cast', - 'tensor_array_to_tensor', 'concat', 'sums', 'assign', - 'fill_constant_batch_size_like', 'fill_constant', 'argmin', 'argmax', - 'argsort', 'ones', 'zeros', 'reverse', 'has_inf', 'has_nan', 'isfinite', - 'range', 'linspace', 'zeros_like', 'ones_like', 'diag', 'eye', 'triu' + 'create_tensor', + 'create_parameter', + 'create_global_var', + 'cast', + 'tensor_array_to_tensor', + 'concat', + 'sums', + 'assign', + 'fill_constant_batch_size_like', + 'fill_constant', + 'argmin', + 'argmax', + 'argsort', + 'ones', + 'zeros', + 'reverse', + 'has_inf', + 'has_nan', + 'isfinite', + 'range', + 'linspace', + 'zeros_like', + 'ones_like', + 'diag', + 'eye', + 'triu', ] diff --git a/python/paddle/fluid/net_drawer.py b/python/paddle/fluid/net_drawer.py index f991310384f769ce091197b16db953e7af94a3c3..fd8f6eaf364c419d44b62c860d0d506d4a481942 100644 --- a/python/paddle/fluid/net_drawer.py +++ b/python/paddle/fluid/net_drawer.py @@ -102,11 +102,11 @@ def parse_graph(program, graph, var_dict, **kwargs): def draw_graph(startup_program, main_program, **kwargs): if "graph_attr" in kwargs: - GRAPH_STYLE.update(kwargs[graph_attr]) + GRAPH_STYLE.update(kwargs["graph_attr"]) if "node_attr" in kwargs: - OP_STYLE.update(kwargs[node_attr]) + OP_STYLE.update(kwargs["node_attr"]) if "edge_attr" in kwargs: - VAR_STYLE.update(kwargs[edge_attr]) + VAR_STYLE.update(kwargs["edge_attr"]) graph_id = unique_id() filename = kwargs.get("filename") diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 60d25a77c58dc05c9b52e35c617f95bc4647d277..b1b6c95ea33604c1629f55444a8a1968023e3faa 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -14,6 +14,7 @@ from __future__ import print_function +import warnings import numpy as np import six import os @@ -21,6 +22,7 @@ import logging from collections import defaultdict import paddle +import paddle.fluid as fluid from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table from paddle.fluid.framework import Program, Variable, name_scope, default_main_program, default_startup_program, device_guard @@ -33,7 +35,6 @@ from .framework import program_guard from .initializer import Constant from .layer_helper import LayerHelper from .layers import ops -from .regularizer import append_regularization_ops from .dygraph import base as imperative_base from .dygraph import no_grad from .dygraph.learning_rate_scheduler import LearningRateDecay, _LearningRateEpochDecay @@ -257,11 +258,11 @@ class Optimizer(object): assert model_np.shape == load_para_np.shape, \ "Parameter shape not match, Dygraph Parameter [ {} ] need tensor with shape {} but load tensor with shape {}".format( - item.name, model_np.shape, load_para_np.shape) + param.name, model_np.shape, load_para_np.shape) assert model_np.dtype == load_para_np.dtype, \ "Parameter dtype not match, Dygraph Parameter [ {} ] need tensor with dtype {} but load tensor with dtype {}".format( - item.name, model_np.dtype, load_para_np.dtype) + param.name, model_np.dtype, load_para_np.dtype) tensor.set(load_para_np, 
framework._current_expected_place()) @@ -884,6 +885,93 @@ class Optimizer(object): act_no_grad_set, callbacks) return params_grads + def _create_regularization_of_grad(self, param, grad, regularization=None): + """ Create and add backward regularization Operators + + Function helper of append_regularization_ops. + """ + # If no gradient or no regularization is specified, then we don't need to do anything + if grad is None or ((not hasattr(param, 'regularizer') or + (hasattr(param, 'regularizer') and + param.regularizer is None)) and + regularization is None): + return grad + regularization_term = None + if hasattr(param, 'regularizer') and param.regularizer is not None: + # Add variable for regularization term in grad block + regularization_term = param.regularizer(param, grad, grad.block) + elif regularization is not None: + regularization_term = regularization(param, grad, grad.block) + + assert regularization_term is not None + + new_grad = grad + if grad.type == core.VarDesc.VarType.SELECTED_ROWS: + # FIXME(zcd): If the grad is SELECTED_ROWS, after regularization, + # the grad's type and name will be changed. But the gradient's name + # is used in ParallelExecutor Reduce mode, so I add a flag for + # the new_grad here. + new_grad = grad.block.create_var( + name=grad.name + core.kNewGradSuffix(), + dtype=param.dtype, + shape=param.shape, + lod_level=param.lod_level, + type=core.VarDesc.VarType.LOD_TENSOR) + + inputs = {"X": [grad, regularization_term]} + outputs = {"Out": [new_grad]} + if framework.in_dygraph_mode(): + new_grad = core.ops.sum([grad, regularization_term]) + else: + grad.block.append_op(type='sum', inputs=inputs, outputs=outputs) + + return new_grad + + def append_regularization_ops(self, + parameters_and_grads, + regularization=None): + r"""Create and add backward regularization Operators + + Creates and adds backward regularization operators in the BlockDesc. + This will add gradients of the regularizer function to the gradients + of the parameters and return these modified gradients. This is the + same as implementing weight decay in optimizers for regularization. + + Args: + parameters_and_grads: A list of (parameters, gradients) pairs + that need to be regularized. + regularization: A global regularizer. If the parameter is not + set. It will be applied with regularizer. + + Returns: + list[(Variable, Variable)]: list of (parameters, gradients) \ + pair with the regularized gradient + + Raises: + Exception: Unknown regularization type + """ + params_and_grads = [] + if framework.in_dygraph_mode(): + for param, grad in parameters_and_grads: + new_grad = self._create_regularization_of_grad(param, grad, + regularization) + params_and_grads.append((param, new_grad)) + else: + repeate_regularizer = False + with framework.name_scope('regularization'): + for param, grad in parameters_and_grads: + if not repeate_regularizer and param.regularizer is not None and regularization is not None: + repeate_regularizer = True + logging.info( + "If regularizer of a Parameter has been set by 'fluid.ParamAttr' or 'fluid.WeightNormParamAttr' already. " + "The Regularization[%s] in Optimizer will not take effect, and it will only be applied to other Parameters!" 
+ % regularization.__str__()) + with param.block.program._optimized_guard([param, grad]): + new_grad = self._create_regularization_of_grad( + param, grad, regularization) + params_and_grads.append((param, new_grad)) + return params_and_grads + def apply_gradients(self, params_grads): """ Second part of `minimize`, appending optimization operators for @@ -916,8 +1004,8 @@ class Optimizer(object): params_grads = append_gradient_clip_ops(params_grads) # Add regularization if any - params_grads = append_regularization_ops(params_grads, - self.regularization) + params_grads = self.append_regularization_ops(params_grads, + self.regularization) optimize_ops = self._create_optimization_pass(params_grads) return optimize_ops @@ -939,8 +1027,8 @@ class Optimizer(object): framework.default_startup_program()): if self._grad_clip is not None: params_grads = self._grad_clip(params_grads) - params_grads = append_regularization_ops(params_grads, - self.regularization) + params_grads = self.append_regularization_ops( + params_grads, self.regularization) optimize_ops = self._create_optimization_pass(params_grads) else: program = loss.block.program @@ -1383,7 +1471,7 @@ class DGCMomentumOptimizer(Optimizer): assert isinstance( num_trainers, int ), "The type of num_trainers should be 'int', but received %s" % type( - value) + num_trainers) assert num_trainers > 0, "The value of num_trainers should be greater than 0!" self._num_trainers = num_trainers @@ -1674,8 +1762,8 @@ class DGCMomentumOptimizer(Optimizer): not_dgc_params_grads = append_gradient_clip_ops( not_dgc_params_grads) - not_dgc_params_grads = append_regularization_ops(not_dgc_params_grads, - self.regularization) + not_dgc_params_grads = self.append_regularization_ops( + not_dgc_params_grads, self.regularization) params_grads = not_dgc_params_grads + dgc_params_grads params_grads = sorted(params_grads, key=lambda x: x[0].name) @@ -4084,6 +4172,7 @@ class PipelineOptimizer(object): 'out_dtype': out_var.dtype, self._op_role_key: self._op_role.Optimize }) + offset += 1 return offset def _create_vars(self, block, ori_block): @@ -4596,12 +4685,15 @@ class PipelineOptimizer(object): 'ring_id': ring_id }) extra_index_info['index'] += 1 + var_shape = list(var.shape) + var_shape[0] = self.micro_batch_size if var_shape[ + 0] < 0 else var_shape[0] block._insert_op_without_sync( index=index + extra_index_info['index'], type='recv_v2', outputs={'Out': [var]}, attrs={ - 'out_shape': var.shape, + 'out_shape': var_shape, 'dtype': var.dtype, self._op_device_key: cur_dev, self._op_role_key: op_role, diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py index 9f2b2127aa7043546e84c8cc0295349108f407f5..616daf5a6504134c93720b31c66c1d19fb36b4ab 100644 --- a/python/paddle/fluid/reader.py +++ b/python/paddle/fluid/reader.py @@ -1291,7 +1291,7 @@ class GeneratorLoader(DataLoaderBase): except Exception as ex: self._queue.kill() self._thread = None - logging.warn('Your reader has raised an exception!') + logging.warning('Your reader has raised an exception!') six.reraise(*sys.exc_info()) self._thread = threading.Thread( diff --git a/python/paddle/fluid/regularizer.py b/python/paddle/fluid/regularizer.py index 64ce283a63c5bf4e47c522c1581c48a1f11c85f1..64bbca6c57c54090281ff1d03659178c55e475cb 100644 --- a/python/paddle/fluid/regularizer.py +++ b/python/paddle/fluid/regularizer.py @@ -22,92 +22,6 @@ from . 
import core __all__ = ['L1Decay', 'L2Decay', 'L1DecayRegularizer', 'L2DecayRegularizer'] -def _create_regularization_of_grad(param, grad, regularization=None): - """ Create and add backward regularization Operators - - Function helper of append_regularization_ops. - """ - # If no gradient or no regularization is specified, then we don't need to do anything - if grad is None or ((not hasattr(param, 'regularizer') or ( - hasattr(param, 'regularizer') and param.regularizer is None)) and - regularization is None): - return grad - regularization_term = None - if hasattr(param, 'regularizer') and param.regularizer is not None: - # Add variable for regularization term in grad block - regularization_term = param.regularizer(param, grad, grad.block) - elif regularization is not None: - regularization_term = regularization(param, grad, grad.block) - - assert regularization_term is not None - - new_grad = grad - if grad.type == core.VarDesc.VarType.SELECTED_ROWS: - # FIXME(zcd): If the grad is SELECTED_ROWS, after regularization, - # the grad's type and name will be changed. But the gradient's name - # is used in ParallelExecutor Reduce mode, so I add a flag for - # the new_grad here. - new_grad = grad.block.create_var( - name=grad.name + core.kNewGradSuffix(), - dtype=param.dtype, - shape=param.shape, - lod_level=param.lod_level, - type=core.VarDesc.VarType.LOD_TENSOR) - - inputs = {"X": [grad, regularization_term]} - outputs = {"Out": [new_grad]} - if in_dygraph_mode(): - new_grad = core.ops.sum([grad, regularization_term]) - else: - grad.block.append_op(type='sum', inputs=inputs, outputs=outputs) - - return new_grad - - -def append_regularization_ops(parameters_and_grads, regularization=None): - r"""Create and add backward regularization Operators - - Creates and adds backward regularization operators in the BlockDesc. - This will add gradients of the regularizer function to the gradients - of the parameters and return these modified gradients. This is the - same as implementing weight decay in optimizers for regularization. - - Args: - parameters_and_grads: A list of (parameters, gradients) pairs - that need to be regularized. - regularization: A global regularizer. If the parameter is not - set. It will be applied with regularizer. - - Returns: - list[(Variable, Variable)]: list of (parameters, gradients) \ - pair with the regularized gradient - - Raises: - Exception: Unknown regularization type - """ - params_and_grads = [] - if in_dygraph_mode(): - for param, grad in parameters_and_grads: - new_grad = _create_regularization_of_grad(param, grad, - regularization) - params_and_grads.append((param, new_grad)) - else: - repeate_regularizer = False - with framework.name_scope('regularization'): - for param, grad in parameters_and_grads: - if not repeate_regularizer and param.regularizer is not None and regularization is not None: - repeate_regularizer = True - logging.info( - "If regularizer of a Parameter has been set by 'fluid.ParamAttr' or 'fluid.WeightNormParamAttr' already. " - "The Regularization[%s] in Optimizer will not take effect, and it will only be applied to other Parameters!" 
- % regularization.__str__()) - with param.block.program._optimized_guard([param, grad]): - new_grad = _create_regularization_of_grad(param, grad, - regularization) - params_and_grads.append((param, new_grad)) - return params_and_grads - - class WeightDecayRegularizer(object): """Base class for weight decay regularizers diff --git a/python/paddle/fluid/tests/test_beam_search_decoder.py b/python/paddle/fluid/tests/test_beam_search_decoder.py index 69f3ff46b3ac9c50f588a64182d02783cbc93aed..301bd0ff0039e03e44949ed9ebc2bede36ebf5ba 100644 --- a/python/paddle/fluid/tests/test_beam_search_decoder.py +++ b/python/paddle/fluid/tests/test_beam_search_decoder.py @@ -246,7 +246,7 @@ def inject_test_train(use_cuda): def inject_test_decode(use_cuda, decorator=None): - f_name = 'test_{0}_decode'.format('cuda' if use_cuda else 'cpu', 'sparse') + f_name = 'test_{0}_decode'.format('cuda' if use_cuda else 'cpu') def f(*args): with scope_prog_guard(): diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 85fbe001970ba7179691aa2853e53922f33944a1..03aaf7ed03e26dd1772cbf598c82daedcc716a07 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -104,6 +104,7 @@ if(((NOT WITH_ROCM) AND (NOT WITH_GPU)) OR WIN32) LIST(REMOVE_ITEM TEST_OPS test_collective_sendrecv_api) LIST(REMOVE_ITEM TEST_OPS test_collective_wait) LIST(REMOVE_ITEM TEST_OPS test_memcpy_op) + LIST(REMOVE_ITEM TEST_OPS test_raw_program_optimizer) endif() if(WIN32) @@ -660,6 +661,8 @@ if (WITH_MKLDNN) add_subdirectory(mkldnn) endif() +add_subdirectory(asp) + add_subdirectory(ir) if (WITH_TESTING) diff --git a/python/paddle/fluid/tests/unittests/asp/CMakeLists.txt b/python/paddle/fluid/tests/unittests/asp/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..f71e04c09aa38b8cf7b3a167b84d4dc0e6cc3ec7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/asp/CMakeLists.txt @@ -0,0 +1,6 @@ +file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") +string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") + +foreach(TEST_OP ${TEST_OPS}) + py_test_modules(${TEST_OP} MODULES ${TEST_OP}) +endforeach(TEST_OP) diff --git a/python/paddle/fluid/tests/unittests/asp/__init__.py b/python/paddle/fluid/tests/unittests/asp/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4c551792f989c0611d7077beb4e0995fc2f06abe --- /dev/null +++ b/python/paddle/fluid/tests/unittests/asp/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
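# Illustrative only: with an L2 regularizer, the _create_regularization_of_grad
# method moved into Optimizer above reduces to classic weight decay: the appended
# "sum" op simply adds coeff * param to the gradient.
import numpy as np

coeff = 1e-4
param = np.array([0.5, -1.0, 2.0])
grad = np.array([0.1, 0.2, -0.3])

regularization_term = coeff * param       # what an L2Decay regularizer contributes
new_grad = grad + regularization_term     # effect of the trailing sum op
print(new_grad)                           # [ 0.10005  0.1999  -0.2998]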
diff --git a/python/paddle/fluid/tests/unittests/asp/asp_pruning_base.py b/python/paddle/fluid/tests/unittests/asp/asp_pruning_base.py new file mode 100644 index 0000000000000000000000000000000000000000..370d73cc35a43ad02a715e9765cbf5a88a9be535 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/asp/asp_pruning_base.py @@ -0,0 +1,89 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import threading, time +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.contrib import sparsity +from paddle.fluid.contrib.sparsity.asp import ASPHelper +import numpy as np + +paddle.enable_static() + + +class TestASPHelperPruningBase(unittest.TestCase): + def setUp(self): + self.main_program = fluid.Program() + self.startup_program = fluid.Program() + + def build_model(): + img = fluid.data( + name='img', shape=[None, 3, 32, 32], dtype='float32') + label = fluid.data(name='label', shape=[None, 1], dtype='int64') + hidden = fluid.layers.conv2d( + input=img, num_filters=4, filter_size=3, padding=2, act="relu") + hidden = fluid.layers.fc(input=hidden, size=32, act='relu') + prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') + return img, label, prediction + + with fluid.program_guard(self.main_program, self.startup_program): + self.img, self.label, self.predict = build_model() + + def run_inference_pruning_test(self, get_mask_gen_func, + get_mask_check_func): + place = paddle.CPUPlace() + if core.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + exe = fluid.Executor(place) + + self.__pruning_and_checking(exe, place, get_mask_gen_func, + get_mask_check_func, False) + + def run_training_pruning_test(self, get_mask_gen_func, get_mask_check_func): + with fluid.program_guard(self.main_program, self.startup_program): + loss = fluid.layers.mean( + fluid.layers.cross_entropy( + input=self.predict, label=self.label)) + optimizer = sparsity.decorate( + fluid.optimizer.SGD(learning_rate=0.01)) + optimizer.minimize(loss, self.startup_program) + + place = paddle.CPUPlace() + if core.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + exe = fluid.Executor(place) + + self.__pruning_and_checking(exe, place, get_mask_gen_func, + get_mask_check_func, True) + + def __pruning_and_checking(self, exe, place, mask_func_name, + check_func_name, with_mask): + exe.run(self.startup_program) + sparsity.prune_model( + place, + self.main_program, + func_name=mask_func_name, + with_mask=with_mask) + for param in self.main_program.global_block().all_parameters(): + if ASPHelper._is_supported_layer(self.main_program, param.name): + mat = np.array(fluid.global_scope().find_var(param.name) + .get_tensor()) + self.assertTrue( + sparsity.check_sparsity( + mat.T, func_name=check_func_name, n=2, m=4)) diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_optimize.py 
b/python/paddle/fluid/tests/unittests/asp/test_asp_optimize.py new file mode 100644 index 0000000000000000000000000000000000000000..402861ad5d93120dd9328b25d2adab07504ff313 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/asp/test_asp_optimize.py @@ -0,0 +1,202 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import threading, time +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.contrib import sparsity +from paddle.fluid.contrib.sparsity.asp import ASPHelper +import numpy as np + +paddle.enable_static() + + +class TestASPHelper(unittest.TestCase): + def setUp(self): + self.main_program = fluid.Program() + self.startup_program = fluid.Program() + + def build_model(): + img = fluid.data( + name='img', shape=[None, 3, 32, 32], dtype='float32') + label = fluid.data(name='label', shape=[None, 1], dtype='int64') + hidden = fluid.layers.conv2d( + input=img, num_filters=4, filter_size=3, padding=2, act="relu") + hidden = fluid.layers.fc(input=hidden, size=32, act='relu') + prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') + return img, label, prediction + + with fluid.program_guard(self.main_program, self.startup_program): + self.img, self.label, predict = build_model() + self.loss = fluid.layers.mean( + fluid.layers.cross_entropy( + input=predict, label=self.label)) + self.optimizer = fluid.optimizer.SGD(learning_rate=0.01) + + def test_get_not_ASP_relevant_vars(self): + def check_params(params, params_from_asp): + if len(params_from_asp) != len(params): + return False + + for i, p in enumerate(params_from_asp): + if p.name != params[i].name: + return False + return True + + params = self.main_program.global_block().all_parameters() + params_from_asp = ASPHelper._get_not_ASP_relevant_vars( + self.main_program) + self.assertTrue(check_params(params, params_from_asp)) + + with fluid.program_guard(self.main_program, self.startup_program): + ASPHelper._minimize(self.optimizer, self.loss, self.main_program, + self.startup_program) + params_from_asp_after_opt = ASPHelper._get_not_ASP_relevant_vars( + self.main_program) + self.assertTrue(check_params(params, params_from_asp_after_opt)) + + def test_is_supported_layers(self): + program = paddle.static.default_main_program() + + names = [ + 'embedding_0.w_0', 'fack_layer_0.w_0', 'conv2d_0.w_0', + 'conv2d_0.b_0', 'conv2d_1.w_0', 'conv2d_1.b_0', 'fc_0.w_0', + 'fc_0.b_0', 'fc_1.w_0', 'fc_1.b_0', 'linear_2.w_0', 'linear_2.b_0' + ] + ref = [ + False, False, True, False, True, False, True, False, True, False, + True, False + ] + for i, name in enumerate(names): + self.assertTrue( + ref[i] == ASPHelper._is_supported_layer(program, name)) + + sparsity.set_excluded_layers(program, ['fc_1', 'conv2d_0']) + ref = [ + False, False, False, False, True, False, True, False, False, False, + True, False + ] + for i, 
name in enumerate(names): + self.assertTrue( + ref[i] == ASPHelper._is_supported_layer(program, name)) + + sparsity.reset_excluded_layers(program) + ref = [ + False, False, True, False, True, False, True, False, True, False, + True, False + ] + for i, name in enumerate(names): + self.assertTrue( + ref[i] == ASPHelper._is_supported_layer(program, name)) + + def test_decorate(self): + param_names = self.__get_param_names(self.main_program.global_block() + .all_parameters()) + with fluid.program_guard(self.main_program, self.startup_program): + self.optimizer = sparsity.decorate(self.optimizer) + self.optimizer.minimize(self.loss, self.startup_program) + param_names_after_minimize = self.__get_param_names( + self.main_program.global_block().all_parameters()) + + self.__check_mask_variables_and_ops(param_names, + param_names_after_minimize) + + def test_asp_training(self): + with fluid.program_guard(self.main_program, self.startup_program): + self.optimizer = sparsity.decorate(self.optimizer) + self.optimizer.minimize(self.loss, self.startup_program) + + place = paddle.CPUPlace() + if core.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + exe = fluid.Executor(place) + feeder = fluid.DataFeeder(feed_list=[self.img, self.label], place=place) + + exe.run(self.startup_program) + sparsity.prune_model(place, self.main_program) + + data = (np.random.randn(64, 3, 32, 32), np.random.randint( + 10, size=(64, 1))) + exe.run(self.main_program, feed=feeder.feed([data])) + + for param in self.main_program.global_block().all_parameters(): + if ASPHelper._is_supported_layer(self.main_program, param.name): + mat = np.array(fluid.global_scope().find_var(param.name) + .get_tensor()) + self.assertTrue(sparsity.check_sparsity(mat.T, n=2, m=4)) + + def test_asp_training_with_amp(self): + if core.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + with fluid.program_guard(self.main_program, self.startup_program): + self.optimizer = fluid.contrib.mixed_precision.decorator.decorate( + self.optimizer) + self.optimizer = sparsity.decorate(self.optimizer) + self.optimizer.minimize(self.loss, self.startup_program) + + exe = fluid.Executor(place) + feeder = fluid.DataFeeder( + feed_list=[self.img, self.label], place=place) + + exe.run(self.startup_program) + sparsity.prune_model(place, self.main_program) + + data = (np.random.randn(64, 3, 32, 32), np.random.randint( + 10, size=(64, 1))) + exe.run(self.main_program, feed=feeder.feed([data])) + + for param in self.main_program.global_block().all_parameters(): + if ASPHelper._is_supported_layer(self.main_program, param.name): + mat = np.array(fluid.global_scope().find_var(param.name) + .get_tensor()) + self.assertTrue(sparsity.check_sparsity(mat.T, n=2, m=4)) + + def __get_param_names(self, params): + param_names = [] + for p in params: + param_names.append(p.name) + return param_names + + def __check_mask_variables_and_ops(self, param_names, + param_names_after_minimize): + for n in param_names: + self.assertFalse(ASPHelper._is_supported_layer(self.main_program, n) and \ + ASPHelper._get_mask_name(n) not in param_names_after_minimize) + + mask_names = [] + for n in param_names: + if ASPHelper._is_supported_layer(self.main_program, n): + mask_names.append(ASPHelper._get_mask_name(n)) + + masking_ops = [] + for op in self.main_program.global_block().ops: + if op.type == 'elementwise_mul' and \ + op.input('Y')[0] in mask_names: + masking_ops.append(op.input('Y')[0]) + + self.assertTrue(len(masking_ops) == len(mask_names)) + for n in masking_ops: + self.assertTrue(n 
in mask_names) + + for n in mask_names: + self.assertTrue(n in masking_ops) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_1d.py b/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_1d.py new file mode 100644 index 0000000000000000000000000000000000000000..ee4b2c002f5afaf390b42e13e5cf7f34906cd90a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_1d.py @@ -0,0 +1,36 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import paddle +from paddle.fluid.contrib import sparsity +from paddle.fluid.tests.unittests.asp.asp_pruning_base import TestASPHelperPruningBase + +paddle.enable_static() + + +class TestASPHelperPruning1D(TestASPHelperPruningBase): + def test_1D_inference_pruning(self): + self.run_inference_pruning_test(sparsity.MaskAlgo.MASK_1D, + sparsity.CheckMethod.CHECK_1D) + + def test_1D_training_pruning(self): + self.run_training_pruning_test(sparsity.MaskAlgo.MASK_1D, + sparsity.CheckMethod.CHECK_1D) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_2d_best.py b/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_2d_best.py new file mode 100644 index 0000000000000000000000000000000000000000..1b8b1e4a06ae4c6954aba4f380361dfc7383eb9b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_2d_best.py @@ -0,0 +1,36 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
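# Rough numpy sketch (not ASPHelper's actual implementation) of the 2:4 property
# the tests above assert via sparsity.check_sparsity(mat.T, n=2, m=4): in every
# group of four consecutive weights at most two are non-zero (equivalently, at
# least two are zero).
import numpy as np

def rows_are_2_4_sparse(mat, n=2, m=4):
    # split each row into m-wide groups and count non-zeros per group
    mat = np.asarray(mat)
    if mat.shape[-1] % m != 0:
        return False
    groups = mat.reshape(-1, m)
    return bool(((groups != 0).sum(axis=1) <= n).all())

pruned = np.array([[0.3, 0.0, 0.0, -1.2],
                   [0.0, 0.7, 0.4, 0.0]])
dense = np.ones((2, 4))
print(rows_are_2_4_sparse(pruned))   # True
print(rows_are_2_4_sparse(dense))    # False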
+ +from __future__ import print_function + +import paddle +from paddle.fluid.contrib import sparsity +from paddle.fluid.tests.unittests.asp.asp_pruning_base import TestASPHelperPruningBase + +paddle.enable_static() + + +class TestASPHelperPruning2DBest(TestASPHelperPruningBase): + def test_2D_best_inference_pruning(self): + self.run_inference_pruning_test(sparsity.MaskAlgo.MASK_2D_BEST, + sparsity.CheckMethod.CHECK_2D) + + def test_2D_best_training_pruning(self): + self.run_training_pruning_test(sparsity.MaskAlgo.MASK_2D_BEST, + sparsity.CheckMethod.CHECK_2D) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_2d_greedy.py b/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_2d_greedy.py new file mode 100644 index 0000000000000000000000000000000000000000..4bdd310f0209a94f639b107e7279726e196e6a7d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_2d_greedy.py @@ -0,0 +1,36 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import paddle +from paddle.fluid.contrib import sparsity +from paddle.fluid.tests.unittests.asp.asp_pruning_base import TestASPHelperPruningBase + +paddle.enable_static() + + +class TestASPHelperPruning2DGreedy(TestASPHelperPruningBase): + def test_2D_greedy_inference_pruning(self): + self.run_inference_pruning_test(sparsity.MaskAlgo.MASK_2D_GREEDY, + sparsity.CheckMethod.CHECK_2D) + + def test_2D_greedy_training_pruning(self): + self.run_training_pruning_test(sparsity.MaskAlgo.MASK_2D_GREEDY, + sparsity.CheckMethod.CHECK_2D) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_asp_utils.py b/python/paddle/fluid/tests/unittests/asp/test_asp_utils.py similarity index 94% rename from python/paddle/fluid/tests/unittests/test_asp_utils.py rename to python/paddle/fluid/tests/unittests/asp/test_asp_utils.py index faffd477ae5661cee5e599b2044af4f42a96112f..387cb55e5c3cfd65c6e56433afb659dfe2f12bff 100644 --- a/python/paddle/fluid/tests/unittests/test_asp_utils.py +++ b/python/paddle/fluid/tests/unittests/asp/test_asp_utils.py @@ -39,9 +39,9 @@ class TestASPUtils(unittest.TestCase): x = np.array([[1.0, 1.0, 1.0, 0.0, 1.0], [1.0, 1.0, 0.0, 0.0, 1.0], [1.0, 0.0, 0.0, 0.0, 1.0], [1.0, 1.0, 0.0, 0.0, 1.0], [0.0, 1.0, 0.0, 0.0, 1.0]]) - self.assertEqual(sparsity.density(x), 0.56) + self.assertEqual(sparsity.calculate_density(x), 0.56) x[:, 0] = 0.0 - self.assertEqual(sparsity.density(x), 0.4) + self.assertEqual(sparsity.calculate_density(x), 0.4) def test_check_mask_1d(self): x = np.array([[1.0, 0.0, 0.0, 1.0, 1.0], [1.0, 1.0, 0.0, 0.0, 1.0], @@ -114,11 +114,11 @@ class TestASPUtils(unittest.TestCase): for _ in range(4): computing_thread = threading.Thread( target=paddle.fluid.contrib.sparsity.utils. 
- compute_valid_2d_patterns, + _compute_valid_2d_patterns, args=(2, 4)) computing_thread.start() time.sleep(3) - patterns_map = paddle.fluid.contrib.sparsity.utils.valid_2d_patterns + patterns_map = paddle.fluid.contrib.sparsity.utils._valid_2d_patterns reference_patterns = get_reference() reference_key = '4_2' diff --git a/python/paddle/fluid/tests/unittests/check_nan_inf_base_dygraph.py b/python/paddle/fluid/tests/unittests/check_nan_inf_base_dygraph.py new file mode 100644 index 0000000000000000000000000000000000000000..08bab306df1b111b0b27814cd27a6f0243adfdf4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/check_nan_inf_base_dygraph.py @@ -0,0 +1,112 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import unicode_literals +from __future__ import print_function + +import os +import sys +import time +import numpy as np + +os.environ[str("FLAGS_check_nan_inf")] = str("1") +os.environ[str("GLOG_vmodule")] = str("nan_inf_utils_detail=10") + +import paddle +import paddle.nn as nn + +np.random.seed(0) + + +def generator(): + batch_size = 5 + for i in range(5): + curr_train_x = np.random.randint( + batch_size, size=(batch_size, 3)).astype("float32") + if i >= 2: + curr_train_x[0, :] = np.nan + curr_train_x[-1, :] = np.inf + res = [] + for i in range(batch_size): + y = i % 3 + res.append([y]) + y_label = np.array(res).astype('int64') + yield [curr_train_x, y_label] + + +class TestLayer(nn.Layer): + def __init__(self): + super(TestLayer, self).__init__() + self.linear1 = nn.Linear(3, 400) + self.linear2 = nn.Linear(400, 400) + self.linear3 = nn.Linear(400, 3) + + def forward(self, x): + x = self.linear1(x) + x = nn.functional.sigmoid(x) + x = self.linear2(x) + x = nn.functional.sigmoid(x) + x = self.linear3(x) + x = nn.functional.softmax(x) + + return x + + +def check(use_cuda): + paddle.set_device('gpu' if use_cuda else 'cpu') + + net = TestLayer() + sgd = paddle.optimizer.SGD(learning_rate=0.05, parameters=net.parameters()) + + for step, (x, y) in enumerate(generator()): + x = paddle.to_tensor(x) + y = paddle.to_tensor(y) + + zero = paddle.zeros(shape=[1], dtype='int64') + fp16_zero = paddle.cast(zero, dtype='float16') + + y = y + zero + + y_pred = net(x) + + cost = nn.functional.cross_entropy(y_pred, y, use_softmax=False) + avg_cost = paddle.mean(cost) + + acc_top1 = paddle.metric.accuracy(input=y_pred, label=y, k=1) + + print('iter={:.0f}, cost={}, acc1={}'.format( + step, avg_cost.numpy(), acc_top1.numpy())) + + sgd.step() + sgd.clear_grad() + + +if __name__ == '__main__': + if paddle.is_compiled_with_cuda(): + try: + check(use_cuda=True) + assert False + except Exception as e: + print(e) + print(type(e)) + # Note. 
Enforce in cuda kernel may not catch in paddle, and + # Exception type will be RuntimeError + assert type(e) == OSError or type(e) == RuntimeError + try: + check(use_cuda=False) + assert False + except Exception as e: + print(e) + print(type(e)) + assert type(e) == RuntimeError diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py index 4f35befda8e2cdd7e238dc22f2cea78b68fc70e6..affec2f7dfefc3d8f385077055de5028df28bea7 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py @@ -18,6 +18,7 @@ import paddle.fluid.param_attr as attr from functools import reduce from paddle.fluid.dygraph import declarative, to_variable from paddle.fluid.dygraph import Embedding, Layer, Linear +from paddle.static import Variable class EmbeddingLayer(object): diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model_v2.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model_v2.py index e0b7e9033dd5e62110dde39e3f8d399b0f3b1662..5cbaeb0f4046e3e9e6404586ca59f2b2d7f3c8fc 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model_v2.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model_v2.py @@ -14,6 +14,7 @@ from functools import reduce import paddle +from paddle.static import Variable class EmbeddingLayer(object): diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_op_attr.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_op_attr.py new file mode 100644 index 0000000000000000000000000000000000000000..a39b5d7cd1a44b0193e10158b8fbe7de87850fde --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_op_attr.py @@ -0,0 +1,148 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
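# Minimal sketch of the debugging pattern exercised by check_nan_inf_base_dygraph.py
# above: with FLAGS_check_nan_inf enabled, an operator whose output contains NaN or
# Inf raises instead of propagating silently.  The exact exception type depends on
# device and build (the test accepts RuntimeError, plus OSError on CUDA).
import os
os.environ["FLAGS_check_nan_inf"] = "1"   # set before importing paddle, as the test does

import numpy as np
import paddle

x = paddle.to_tensor(np.array([1.0, float("nan")], dtype="float32"))
try:
    print((x * 2.0).mean().numpy())
except (RuntimeError, OSError) as err:
    print("nan/inf detected:", type(err).__name__)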
+ +import six +import paddle +import unittest +import numpy as np + +from paddle.static import InputSpec + + +class MySub(paddle.nn.Layer): + def __init__(self): + super(MySub, self).__init__() + + def forward(self, x, y, name=None): + return paddle.subtract(x, y, name) + + +class NetWithOpAttr(paddle.nn.Layer): + def __init__(self, in_num, out_num): + super(NetWithOpAttr, self).__init__() + + self.linear = paddle.nn.Linear(in_num, out_num) + self.bn = paddle.nn.BatchNorm(out_num) + self.sub = MySub() + + def forward(self, x): + out = self.linear(x) + out = self.sub(out, x) + out = self.bn(out) + return out + + @paddle.jit.to_static(input_spec=[InputSpec([10, 16])]) + def with_cond(self, x): + if paddle.mean(x) > 0.: + out = self.linear(x) + else: + out = self.sub(x, x) + out = self.bn(out) + return out + + +class CheckOpAttr(unittest.TestCase): + def setUp(self): + self.in_num = 16 + self.out_num = 16 + self.x = paddle.randn([10, self.in_num]) + self.expected_results() + + def expected_results(self): + self.fc_attrs = { + "int_val": 10, + "int_vals": [10, 20], + "float_val": 3.8, + "float_vals": [3.8, -0.2] + } + self.bn_attrs = {"bool_val": True, "bool_vals": [True, False]} + self.sub_attrs = {"int_vals": [10, 20], "bool_vals": [True, False]} + + self.infos = { + 'matmul': self.fc_attrs, + 'elementwise_add': self.fc_attrs, + 'batch_norm': self.bn_attrs, + 'tanh': self.bn_attrs, + 'elementwise_sub': self.sub_attrs + } + + def test_set_op_attrs(self): + net = NetWithOpAttr(self.in_num, self.out_num) + # set attrs + net.linear._set_op_attrs(self.fc_attrs) + net.bn._set_op_attrs({"bool_val": False}) # test overwrite behavior + net.bn._set_op_attrs(self.bn_attrs) + net.sub._set_op_attrs(self.sub_attrs) + # assert hooks exist. + self.assertEqual(len(net.linear._forward_pre_hooks), 1) + self.assertEqual(len(net.linear._forward_post_hooks), 1) + # to_static + net = paddle.jit.to_static( + net, input_spec=[InputSpec.from_tensor(self.x)]) + + # assert attrs have be set. + self.check_op_attrs(net.forward.concrete_program.main_program) + + # assert hooks have be clean. + self.assertEqual(len(net.linear._forward_pre_hooks), 0) + self.assertEqual(len(net.linear._forward_post_hooks), 0) + + def check_op_attrs(self, main_program): + for cur_block in main_program.blocks: + ops = cur_block.ops + for op in ops: + if op.type not in self.infos: continue + for attr_name, expect_vals in six.iteritems(self.infos[ + op.type]): + op_vals = op.desc.attr(attr_name) + if not isinstance(expect_vals, list): + expect_vals = [expect_vals] + op_vals = [op_vals] + + for (op_val, expect_val) in zip(op_vals, expect_vals): + if isinstance(op_val, float): + # C++ vs python: 3.799999952316284 ~= 3.8 + self.assertAlmostEqual(op_val, expect_val) + else: + self.assertEqual(op_val, expect_val) + + def test_set_op_attrs_with_sub_block(self): + net = NetWithOpAttr(self.in_num, self.out_num) + # set attrs + net.linear._set_op_attrs({ + "int_vals": [0, 0] + }) # test overwrite behavior + net.linear._set_op_attrs(self.fc_attrs) + net.bn._set_op_attrs(self.bn_attrs) + net.sub._set_op_attrs(self.sub_attrs) + # assert hooks exist. + self.assertEqual(len(net.linear._forward_pre_hooks), 1) + self.assertEqual(len(net.linear._forward_post_hooks), 1) + + # assert attrs have be set. + self.check_op_attrs(net.with_cond.concrete_program.main_program) + + # assert hooks have be clean. 
+ self.assertEqual(len(net.linear._forward_pre_hooks), 0) + self.assertEqual(len(net.linear._forward_post_hooks), 0) + + def test_type_error(self): + net = NetWithOpAttr(self.in_num, self.out_num) + # attrs should be dict + with self.assertRaises(TypeError): + net.linear._set_op_attrs([self.fc_attrs]) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_static_analysis.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_static_analysis.py index e72688d800ba59f63503248f2a5d385da23d6882..0fffb0c985375be5f126852b8fa890bb0c32ab9d 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_static_analysis.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_static_analysis.py @@ -17,6 +17,7 @@ from __future__ import print_function import gast import inspect import numpy as np +import paddle import paddle.fluid as fluid import unittest diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py index 70749c2e24447e67f267dcfe396dec18d2dcebab..f7cdb12a1ab673824dad137cbd2186c08b631d60 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py @@ -29,10 +29,10 @@ def dyfunc_tensor_shape_1(x): def dyfunc_tensor_shape_2(x): - x = fluid.dygraph.to_variable(x) + x = paddle.to_tensor(x) shape = x.shape shape2 = shape - res = fluid.layers.reshape(x, shape2) + res = paddle.reshape(x, shape2) return res @@ -85,6 +85,13 @@ def dyfunc_tuple_shape_2(x): return res +def dyfunc_tuple_shape_3(x): + x = paddle.to_tensor(x) + a, b = paddle.shape(x) + res = paddle.reshape(x, shape=(b, a)) + return res + + def dyfunc_paddle_shape_api(x): x = paddle.to_tensor(x) # paddle.shape will not be converted. 
@@ -190,7 +197,7 @@ def dyfunc_with_while_3(x): def dyfunc_with_while_4(x): - x = fluid.dygraph.to_variable(x) + x = paddle.to_tensor(x) y = numpy.ones(5) y_shape_0 = y.shape[0] i = 1 @@ -337,6 +344,18 @@ class TestTupleShape2(TestTensorShapeBasic): self.expected_slice_op_num = 2 +class TestTupleShape3(TestTensorShapeBasic): + def init_test_func(self): + self.input = numpy.ones((5, 7)).astype("int32") + self.input_spec = [paddle.static.InputSpec(shape=[5, 7], dtype="int32")] + self.dygraph_func = dyfunc_tuple_shape_3 + + def _set_expected_op_num(self): + self.expected_op_num = 5 + self.expected_shape_op_num = 1 + self.expected_slice_op_num = 2 + + class TestPaddleShapeApi(TestTensorShapeBasic): def init_test_func(self): self.input = numpy.ones((5, 7)).astype("int32") diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_layers.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_layers.py index e69cf7d267bccb4e302e904158c6f04b4287f9f0..23dae3173869183fa460aa020f9544c07df60b3f 100644 --- a/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_layers.py +++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_layers.py @@ -269,6 +269,63 @@ class TestDistTraning(unittest.TestCase): np.testing.assert_allclose(loss_a.numpy(), loss_b.numpy()) + def test_parallel_cross_entropy(self): + batch_size = 2 + seq_length = 1 + class_size_per_card = 2 + vocab_size = class_size_per_card * self.model_parallel_size + seed = 1025 + + set_random_seed(seed) + rank_id = dist.get_rank() + + # model_a + model_a = fleet.meta_parallel.ParallelCrossEntropy() + + model_b = paddle.nn.CrossEntropyLoss(reduction="none") + + paddle.seed(rank_id * 10) + random.seed(seed) + np.random.seed(seed) + + for _ in range(5): + np_label = np.random.randint(0, vocab_size, + (batch_size, seq_length)) + label = paddle.to_tensor(np_label, dtype="int64") + + data = paddle.randn( + shape=[batch_size, seq_length, class_size_per_card], + dtype='float32') + data.stop_gradient = False + + check_group = dist.new_group(list(range(self.model_parallel_size))) + integral_data = [] + partial_data = data.clone().detach() + paddle.distributed.all_gather( + integral_data, partial_data, group=check_group) + integral_data = paddle.concat(integral_data, axis=-1) + integral_data = integral_data.detach().clone() + integral_data.stop_gradient = False + + loss_a = model_a(data, label).sum() / batch_size + loss_b = model_b(integral_data, label).sum() / batch_size + print("loss_a: ", loss_a.numpy(), "loss_b: ", loss_b.numpy()) + + np.testing.assert_allclose( + loss_a.numpy(), loss_b.numpy(), rtol=1e-6) + + loss_a.backward() + loss_b.backward() + + integral_grad = [] + partial_grad = data.grad.clone().detach() + paddle.distributed.all_gather( + integral_grad, partial_grad, group=check_group) + integral_grad = paddle.concat(integral_grad, axis=-1) + + np.testing.assert_allclose( + integral_data.grad.numpy(), integral_grad.numpy(), rtol=1e-6) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py b/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py index 010086bfbbc47ffe65b6379b65b05900235e83d3..e3c21eaa78d716863c01996e2e5371c2f90f0643 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py @@ -160,7 +160,8 @@ class InferencePassTest(unittest.TestCase): use_gpu, atol=1e-5, flatten=False, - quant=False): + quant=False, + rtol=1e-5): ''' 
Check whether calculating on CPU and GPU, enable TensorRT or disable TensorRT, enable MKLDNN or disable MKLDNN @@ -260,7 +261,7 @@ class InferencePassTest(unittest.TestCase): self.assertTrue( np.allclose( - out, tensorrt_output, atol=atol), + out, tensorrt_output, rtol=rtol, atol=atol), "Output has diff between GPU and TensorRT. ") # Check whether the mkldnn results and the CPU results are the same. diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py index d895ac44d89319c396b95af1994c5d99f4555ea5..0406e03f54bd4cb70f99d21dcb94b8d380da8954 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py @@ -366,6 +366,61 @@ class TensorRTSubgraphPassLayerNormTest(InferencePassTest): PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')) +class TensorRTSubgraphPassLayerNormDynamicTest(InferencePassTest): + def setUp(self): + self.set_params() + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=[-1, 3, 64, 64], dtype="float32") + out = fluid.layers.layer_norm( + data, begin_norm_axis=self.begin_norm_axis) + self.feeds = { + "data": np.random.random([1, 3, 64, 64]).astype("float32"), + } + self.set_trt_params() + self.fetch_list = [out] + + def set_trt_params(self): + self.enable_trt = True + self.trt_parameters = TensorRTSubgraphPassLayerNormDynamicTest.TensorRTParam( + 1 << 30, 32, 0, self.precision, self.serialize, False) + self.dynamic_shape_params = TensorRTSubgraphPassLayerNormDynamicTest.DynamicShapeParam( + { + 'data': [1, 3, 64, 64], + }, {'data': [8, 8, 64, 64], }, {'data': [4, 4, 64, 64], }, False) + + def set_params(self): + self.begin_norm_axis = 2 + self.precision = AnalysisConfig.Precision.Float32 + self.serialize = True + + def test_check_output(self): + if os.path.exists(self.path + "_opt_cache"): + shutil.rmtree(self.path + "_opt_cache") + if core.is_compiled_with_cuda(): + use_gpu = True + self.check_output_with_option(use_gpu) + self.assertTrue( + PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')) + + +class TensorRTSubgraphPassLayerNormDynamicFP16Test( + TensorRTSubgraphPassLayerNormDynamicTest): + def set_params(self): + self.begin_norm_axis = 2 + self.precision = AnalysisConfig.Precision.Half + self.serialize = True + + def test_check_output(self): + if os.path.exists(self.path + "_opt_cache"): + shutil.rmtree(self.path + "_opt_cache") + if core.is_compiled_with_cuda(): + use_gpu = True + self.check_output_with_option(use_gpu, atol=0.01, rtol=0.01) + self.assertTrue( + PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')) + + class TensorRTSubgraphPassLayerNormBeginNormAxis2Test( TensorRTSubgraphPassLayerNormTest): def set_params(self): diff --git a/python/paddle/fluid/tests/unittests/ir_memory_optimize_net_base.py b/python/paddle/fluid/tests/unittests/ir_memory_optimize_net_base.py index 0e4fd8f69dcd3fb5ecca5635c8b04df86d1e6bab..ea125ccf3fc6c09f3fff2a5ba97fff5ac279bab9 100644 --- a/python/paddle/fluid/tests/unittests/ir_memory_optimize_net_base.py +++ b/python/paddle/fluid/tests/unittests/ir_memory_optimize_net_base.py @@ -13,7 +13,7 @@ # limitations under the License. 
import os - +import sys import six import unittest import time diff --git a/python/paddle/fluid/tests/unittests/mkldnn/check_flags_mkldnn_ops_on_off.py b/python/paddle/fluid/tests/unittests/mkldnn/check_flags_mkldnn_ops_on_off.py index c93201946b275715cf70ae549339cd3f41f5cac7..90614ccb3bc1543073c808a1a424227736c794e3 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/check_flags_mkldnn_ops_on_off.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/check_flags_mkldnn_ops_on_off.py @@ -19,18 +19,19 @@ import numpy as np import paddle.fluid as fluid import os from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid.framework import _global_flags def check(): - print("check: fluid.core.globals()['FLAGS_use_mkldnn']=", - fluid.core.globals()["FLAGS_use_mkldnn"]) + print("check: _global_flags()['FLAGS_use_mkldnn']=", + _global_flags()["FLAGS_use_mkldnn"]) print("check: fluid.get_flags('FLAGS_use_mkldnn')=", fluid.get_flags(['FLAGS_use_mkldnn'])) print("check: DNNL_VERBOSE=", os.environ['DNNL_VERBOSE']) print("check: FLAGS_tracer_mkldnn_ops_on=", - fluid.core.globals()['FLAGS_tracer_mkldnn_ops_on']) + _global_flags()['FLAGS_tracer_mkldnn_ops_on']) print("check: FLAGS_tracer_mkldnn_ops_off=", - fluid.core.globals()['FLAGS_tracer_mkldnn_ops_off']) + _global_flags()['FLAGS_tracer_mkldnn_ops_off']) a_np = np.random.uniform(-2, 2, (10, 20, 30)).astype(np.float32) b_np = np.random.uniform(-5, 5, (10, 20, 30)).astype(np.float32) helper = LayerHelper(fluid.unique_name.generate(str("test")), act="relu") diff --git a/python/paddle/fluid/tests/unittests/mkldnn/check_flags_use_mkldnn.py b/python/paddle/fluid/tests/unittests/mkldnn/check_flags_use_mkldnn.py index 8f5715a0d0afcf59ebbe1cc95a6b06dead64c6e2..3d9ef39680dc059d3823e6ad89081549e9f693a2 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/check_flags_use_mkldnn.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/check_flags_use_mkldnn.py @@ -19,11 +19,12 @@ import numpy as np import paddle.fluid as fluid import os from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid.framework import _global_flags def check(): - print("check: fluid.core.globals()['FLAGS_use_mkldnn']=", - fluid.core.globals()["FLAGS_use_mkldnn"]) + print("check: _global_flags()['FLAGS_use_mkldnn']=", + _global_flags()["FLAGS_use_mkldnn"]) print("check: fluid.get_flags('FLAGS_use_mkldnn')=", fluid.get_flags(['FLAGS_use_mkldnn'])) print("check: DNNL_VERBOSE=", os.environ['DNNL_VERBOSE']) diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_transpose_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_transpose_mkldnn_op.py index f31ddf921f819c9b377d950bff8c7a77ea352cae..b473d2643d3788fc38318f33d13a96211616a881 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_transpose_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_transpose_mkldnn_op.py @@ -18,6 +18,7 @@ import unittest import numpy as np import paddle.fluid.core as core from paddle.fluid.tests.unittests.op_test import OpTest +from paddle import enable_static from paddle.fluid.tests.unittests.test_conv2d_transpose_op import conv2dtranspose_forward_naive, TestConv2DTransposeOp diff --git a/python/paddle/fluid/tests/unittests/npu/test_coalesce_tensor_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_coalesce_tensor_op_npu.py new file mode 100644 index 0000000000000000000000000000000000000000..37fa5f8cad2abee06438f2d27da5e27ff5bbf963 --- /dev/null +++ 
b/python/paddle/fluid/tests/unittests/npu/test_coalesce_tensor_op_npu.py @@ -0,0 +1,110 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +from paddle.fluid import core + +paddle.enable_static() +SEED = 2021 +alignment = 512 + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestAllocContinuousSpace(OpTest): + def setUp(self): + self.__class__.use_npu = True + self.op_type = "coalesce_tensor" + self.dtype, self.fluid_dtype = self.init_dtype() + attrs = self.init_attr() + self.copy_data = attrs["copy_data"] + self.constant = attrs["constant"] + self.set_constant = attrs["set_constant"] + self.Inputs = self.init_input() + self.Outputs, self.FusedOutput = self.init_output( + self.Inputs, self.set_constant, self.constant) + self.inputs = {'Input': self.Inputs} + self.attrs = attrs + self.outputs = {'Output': self.Outputs, 'FusedOutput': self.FusedOutput} + + def init_dtype(self): + return np.float32, int(core.VarDesc.VarType.FP32) + + def init_input(self): + inputs = [] + inputs.append(("x1", np.zeros([20, 3]).astype(self.dtype))) + inputs.append(("x2", np.zeros([20, 3]).astype(self.dtype))) + return inputs + + def init_attr(self): + return { + "copy_data": False, + "set_constant": False, + "constant": 0.0, + "use_align": True, + "dtype": self.fluid_dtype + } + + def init_output(self, input_list, set_constant, constant): + inputs = [] + outputs = input_list + + for input in input_list: + length = len(input[1].flatten()) + aligned_len = (length + alignment) / alignment * alignment + out = np.zeros(int(aligned_len), dtype=self.dtype) + out[0:length] = input[1].flatten() + inputs.append(out) + + coalesce_tensor_var = np.concatenate([input for input in inputs]) + return outputs, coalesce_tensor_var + + def test_check_output(self): + self.check_output_with_place( + place=paddle.NPUPlace(0), + no_check_set=["FusedOutput"], + atol=1e-5, + check_dygraph=False) + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestAllocContinuousSpace2(TestAllocContinuousSpace): + def init_attr(self): + return { + "copy_data": True, + "set_constant": False, + "constant": 0.5, + "use_align": True, + "dtype": self.fluid_dtype + } + + def test_check_output(self): + self.check_output_with_place( + place=paddle.NPUPlace(0), + no_check_set=["FusedOutput"], + atol=1e-5, + check_dygraph=False) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/parallel_embedding_api.py b/python/paddle/fluid/tests/unittests/parallel_embedding_api.py index 7460577403fb12915a4d0b0e68333392a4c2c43b..8907adbf46a971e09d86824c5b758ff14f6f767c 100644 --- a/python/paddle/fluid/tests/unittests/parallel_embedding_api.py +++ 
b/python/paddle/fluid/tests/unittests/parallel_embedding_api.py @@ -48,23 +48,27 @@ class TestParallelEmbeddingAPI(TestCollectiveAPIRunnerBase): with fluid.program_guard(main_prog, startup_program): fleet.init(is_collective=True) np.random.seed(2020) - np_array = np.random.rand(10, 8) + # (num_embeddings, embedding_dim) = (12, 8) + size = (12, 8) + np_array = np.random.rand(size[0], size[1]) paddle.seed(2020) - data_in = paddle.randint(0, 8, shape=(10, 4)) + data_in = paddle.randint(0, size[0], shape=(10, 4)) data = paddle.static.data( name='tindata', shape=[10, 1000], dtype="float32") + per_part_size = size[0] // 2 if rank == 0: param_attr = paddle.fluid.ParamAttr( initializer=paddle.fluid.initializer.NumpyArrayInitializer( - np_array[0:5, :]), ) + np_array[0:per_part_size, :]), ) else: param_attr = paddle.fluid.ParamAttr( initializer=paddle.fluid.initializer.NumpyArrayInitializer( - np_array[5:10, :]), ) + np_array[per_part_size:size[0], :]), ) emb_out = paddle.distributed.split( - data_in, (8, 8), + data_in, + size, operation="embedding", num_partitions=2, weight_attr=param_attr) diff --git a/python/paddle/fluid/tests/unittests/parallel_embedding_api_none_divisible.py b/python/paddle/fluid/tests/unittests/parallel_embedding_api_none_divisible.py deleted file mode 100644 index 75b966fdc57272fb8dd905cf7ba6fff52dc743bf..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/unittests/parallel_embedding_api_none_divisible.py +++ /dev/null @@ -1,76 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import numpy as np -import argparse -import os -import sys -import signal -import time -import socket -from contextlib import closing -from six import string_types -import math -import paddle -import paddle.fluid as fluid -import paddle.fluid.profiler as profiler -import paddle.fluid.unique_name as nameGen -from paddle.fluid import core -import paddle.distributed.fleet as fleet -from paddle.fluid.incubate.fleet.base import role_maker -import unittest -from multiprocessing import Process -import paddle.fluid.layers as layers -from functools import reduce -from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main - -paddle.enable_static() - - -class TestParallelEmbeddingAPINoneDivisible(TestCollectiveAPIRunnerBase): - def __init__(self): - self.global_ring_id = 0 - - def get_model(self, main_prog, startup_program, rank): - with fluid.program_guard(main_prog, startup_program): - fleet.init(is_collective=True) - np.random.seed(2020) - np_array = np.random.rand(9, 8) - paddle.seed(2020) - data_in = paddle.randint(0, 7, shape=(10, 4)) - - data = paddle.static.data( - name='tindata', shape=[10, 1000], dtype="float32") - if rank == 0: - param_attr = paddle.fluid.ParamAttr( - initializer=paddle.fluid.initializer.NumpyArrayInitializer( - np_array[0:5, :]), ) - else: - param_attr = paddle.fluid.ParamAttr( - initializer=paddle.fluid.initializer.NumpyArrayInitializer( - np_array[5:9, :]), ) - - emb_out = paddle.distributed.split( - data_in, (7, 8), - operation="embedding", - num_partitions=2, - weight_attr=param_attr) - - return [data_in, emb_out] - - -if __name__ == "__main__": - runtime_main(TestParallelEmbeddingAPINoneDivisible, "parallel_embedding") diff --git a/python/paddle/fluid/tests/unittests/pipeline_mnist_multi_device.py b/python/paddle/fluid/tests/unittests/pipeline_mnist_multi_device.py new file mode 100644 index 0000000000000000000000000000000000000000..7211bd3e92f790201a9cea7512a01079764bc677 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/pipeline_mnist_multi_device.py @@ -0,0 +1,159 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import numpy as np +import argparse +import time +import math + +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +from paddle.fluid import core +import unittest +from multiprocessing import Process +import os +import signal +from functools import reduce +from test_dist_base import TestDistRunnerBase, runtime_main +import paddle.distributed.fleet as fleet + +paddle.enable_static() + +DTYPE = "float32" +paddle.dataset.mnist.fetch() + +# Fix seed for test +fluid.default_startup_program().random_seed = 1 +fluid.default_main_program().random_seed = 1 + + +def cnn_model(data): + conv_pool_1 = fluid.nets.simple_img_conv_pool( + input=data, + filter_size=5, + num_filters=20, + pool_size=2, + pool_stride=2, + act="relu", + param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant( + value=0.01))) + conv_pool_2 = fluid.nets.simple_img_conv_pool( + input=conv_pool_1, + filter_size=5, + num_filters=50, + pool_size=2, + pool_stride=2, + act="relu", + param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant( + value=0.01))) + + SIZE = 10 + input_shape = conv_pool_2.shape + param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE] + scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5 + + with fluid.device_guard("gpu:1"): + predict = fluid.layers.fc( + input=conv_pool_2, + size=SIZE, + act="softmax", + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01))) + # To cover @RENAMED@GRADIENT + predict2 = fluid.layers.fc( + input=conv_pool_1, + size=SIZE, + act="softmax", + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01))) + predict += predict2 + return predict + + +class TestDistMnist2x2(TestDistRunnerBase): + def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None): + # Input data + with fluid.device_guard("gpu:0"): + images = fluid.layers.data( + name='pixel', shape=[1, 28, 28], dtype=DTYPE) + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + if dist_strategy: + data_loader = fluid.io.DataLoader.from_generator( + feed_list=[images, label], + capacity=64, + use_double_buffer=False, + iterable=False) + # Train program + predict = cnn_model(images) + with fluid.device_guard("gpu:1"): + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + + # Evaluator + with fluid.device_guard("gpu:1"): + batch_size_tensor = fluid.layers.create_tensor(dtype='int64') + batch_acc = fluid.layers.accuracy( + input=predict, label=label, total=batch_size_tensor) + + inference_program = fluid.default_main_program().clone() + base_lr = self.lr + passes = [30, 60, 80, 90] + steps_per_pass = 10 + bd = [steps_per_pass * p for p in passes] + lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] + lr_val = fluid.layers.piecewise_decay(boundaries=bd, values=lr) + opt = fluid.optimizer.Momentum( + learning_rate=lr_val, + momentum=0.9, + grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)) + + acc_steps = 2 # accumulated steps for pipeline + if dist_strategy: + # Reader + train_reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=batch_size) + test_reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=batch_size) + fleet.init(is_collective=True) + strategy = fleet.DistributedStrategy() + strategy.pipeline = True + strategy.amp = True + strategy.pipeline_configs = { + 'micro_batch_size': batch_size, + 'schedule_mode': 'F-then-B', + 
'accumulate_steps': acc_steps + } + dist_opt = fleet.distributed_optimizer( + optimizer=opt, strategy=strategy) + dist_opt.minimize(avg_cost) + else: + opt.minimize(avg_cost) + # Reader + train_reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=batch_size * acc_steps) + test_reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=batch_size * acc_steps) + + if dist_strategy: + return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict, data_loader + else: + return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict + + +if __name__ == "__main__": + runtime_main(TestDistMnist2x2) diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py index aea2a074aedd58a1152efbaa8d276f7d1c82387c..715e66e563337f2115800acb19ba76aa22ca7e0b 100644 --- a/python/paddle/fluid/tests/unittests/test_adam_op.py +++ b/python/paddle/fluid/tests/unittests/test_adam_op.py @@ -804,11 +804,48 @@ class TestNetWithEpsilonTensor(unittest.TestCase): adam.minimize(b) state_dict = adam.state_dict() fluid.save_dygraph(state_dict, "paddle_dy") - para_state_dict, opti_state_dict = fluid.load_dygraph("paddle_dy") - adam.set_state_dict(opti_state_dict) + para_state_dict, opt_state_dict = fluid.load_dygraph("paddle_dy") + adam.set_state_dict(opt_state_dict) paddle.enable_static() + def test_adam_save_load_error(self): + paddle.disable_static() + + def get_opt(dtype, shape): + with paddle.utils.unique_name.guard(): + paddle.set_default_dtype(dtype) + a = paddle.rand([4, 10]) + linear = paddle.nn.Linear(10, 10) + b = linear(a) + state_dict = linear.state_dict() + fluid.save_dygraph(state_dict, "paddle_dy") + + scheduler = paddle.optimizer.lr.NoamDecay( + d_model=0.01, warmup_steps=100, verbose=True) + adam = paddle.fluid.optimizer.Adam( + learning_rate=scheduler, + parameter_list=linear.parameters(), + use_global_beta_pow=True) + adam.minimize(b) + return adam + + adam = get_opt('float32', [10, 10]) + + state_dict = adam.state_dict() + fluid.save_dygraph(state_dict, "paddle_dy") + para_state_dict, opt_state_dict = fluid.load_dygraph("paddle_dy") + adam.set_state_dict(opt_state_dict) + + adam2 = get_opt('float64', [10, 10]) # dtype not match + self.assertRaises(AssertionError, adam2.set_state_dict, opt_state_dict) + + adam3 = get_opt('float32', [10, 10]) # shape not match + opt_state_dict['beta1_pow_acc_0'] = np.array( + [0.9, 0.9], dtype='float32') + self.assertRaises(AssertionError, adam3.set_state_dict, opt_state_dict) + paddle.enable_static() + class TestAdamOpV2Group(TestAdamOpV2): def test_adam_op(self): diff --git a/python/paddle/fluid/tests/unittests/test_auto_checkpoint.py b/python/paddle/fluid/tests/unittests/test_auto_checkpoint.py index 3f33120d1f79f089d7511621611141683f0a03cd..3faf7f6862058d056ee43f2603873a4fc834334d 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_checkpoint.py +++ b/python/paddle/fluid/tests/unittests/test_auto_checkpoint.py @@ -268,7 +268,7 @@ class AutoCheckpointTest(AutoCheckPointACLBase): def test_checker(self): os.environ.pop("PADDLE_JOB_ID", None) try: - checker = AutoCheckpointChecker() + checker = acp.AutoCheckpointChecker() self.assertFalse(True) except Exception as e: pass diff --git a/python/paddle/fluid/tests/unittests/test_collective_api_base.py b/python/paddle/fluid/tests/unittests/test_collective_api_base.py index f0c042eb7e95b69e7fa894df6c06e5a6fb649588..81d246d35b8bbacc2950c4fa80c7e46950c40666 100644 --- 
a/python/paddle/fluid/tests/unittests/test_collective_api_base.py +++ b/python/paddle/fluid/tests/unittests/test_collective_api_base.py @@ -257,11 +257,10 @@ class TestDistBase(unittest.TestCase): elif col_type == "parallel_embedding": result_data = tr0_out[0] np.random.seed(2020) - need_result = np.random.rand(10, 8) + need_result = np.random.rand(12, 8) for i in range(result_data.shape[0]): for j in range(result_data.shape[1]): data = result_data[i][j] - if data >= 4: data += 1 assert np.allclose( tr0_out[1][i][j], need_result[data], atol=1e-08) elif col_type == "row_parallel_linear": diff --git a/python/paddle/fluid/tests/unittests/test_collective_split_embedding_none_divisible.py b/python/paddle/fluid/tests/unittests/test_collective_split_embedding_none_divisible.py index fc9775b3566b112a7d6c0c203147a1522383e3e4..955adf08c482418e8d1e02db7f87ab3e3cb5b700 100644 --- a/python/paddle/fluid/tests/unittests/test_collective_split_embedding_none_divisible.py +++ b/python/paddle/fluid/tests/unittests/test_collective_split_embedding_none_divisible.py @@ -16,20 +16,24 @@ from __future__ import print_function import unittest import numpy as np import paddle - -from test_collective_api_base import TestDistBase +from paddle.distributed import fleet paddle.enable_static() -class TestParallelEmbeddingNoneDivisibleAPI(TestDistBase): - def _setup_config(self): - pass +class TestCollectiveSplitAssert(unittest.TestCase): + def network(self): + fleet.init() + data = paddle.static.data( + name='tindata', shape=[10, 1000], dtype="float32") + emb_out = paddle.distributed.split( + data, (7, 8), operation="embedding", num_partitions=2) - def test_parallel_embedding_none_divisible(self): - self.check_with_place("parallel_embedding_api_none_divisible.py", - "parallel_embedding", "nccl") + def test_assert(self): + with self.assertRaises(AssertionError): + self.network() if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_communicator_geo.py b/python/paddle/fluid/tests/unittests/test_communicator_geo.py index ea59e070cbd51da440d81a3eb2236edb38385f2b..d9c6406422277c72f18bde341855f66dff7f3555 100644 --- a/python/paddle/fluid/tests/unittests/test_communicator_geo.py +++ b/python/paddle/fluid/tests/unittests/test_communicator_geo.py @@ -28,6 +28,8 @@ import paddle.fluid as fluid import paddle.distributed.fleet.base.role_maker as role_maker import paddle.distributed.fleet as fleet +from paddle.distributed.utils import find_free_ports + paddle.enable_static() @@ -101,12 +103,9 @@ class TestCommunicatorGeoEnd2End(unittest.TestCase): os.environ["PADDLE_PSERVER_NUMS"] = "1" os.environ["PADDLE_TRAINERS_NUM"] = "1" - os.environ["POD_IP"] = "127.0.0.1" - os.environ["PADDLE_PORT"] = "36001" os.environ["PADDLE_TRAINER_ID"] = "0" os.environ["PADDLE_TRAINERS_NUM"] = "1" - os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \ - "127.0.0.1:36001" + os.environ["POD_IP"] = "127.0.0.1" role = role_maker.PaddleCloudRoleMaker() @@ -150,8 +149,6 @@ class RunServer(TestCommunicatorGeoEnd2End): pass os.environ["TRAINING_ROLE"] = "PSERVER" -os.environ["http_proxy"] = "" -os.environ["https_proxy"] = "" half_run_server = RunServer() half_run_server.run_ut() @@ -160,9 +157,12 @@ half_run_server.run_ut() server_file = "run_server_for_communicator_geo.py" with open(server_file, "w") as wb: wb.write(run_server_cmd) + + port = find_free_ports(1).pop() + os.environ["TRAINING_ROLE"] = "PSERVER" - os.environ["http_proxy"] = "" - os.environ["https_proxy"] = "" + os.environ["PADDLE_PORT"] = 
str(port) + os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:{}".format(port) _python = sys.executable @@ -173,17 +173,14 @@ half_run_server.run_ut() stdout=subprocess.PIPE, stderr=subprocess.PIPE) - outs, errs = ps_proc.communicate(timeout=15) - - time.sleep(1) + time.sleep(5) os.environ["TRAINING_ROLE"] = "TRAINER" - os.environ["http_proxy"] = "" - os.environ["https_proxy"] = "" self.run_ut() ps_proc.kill() ps_proc.wait() + outs, errs = ps_proc.communicate() if os.path.exists(server_file): os.remove(server_file) diff --git a/python/paddle/fluid/tests/unittests/test_dataloader_dataset.py b/python/paddle/fluid/tests/unittests/test_dataloader_dataset.py index b8c498fe4a3c71296101bc08e6bbbe0887ac8b6c..08589f0191d8c698ecd8a4017d6c14fa476610b1 100644 --- a/python/paddle/fluid/tests/unittests/test_dataloader_dataset.py +++ b/python/paddle/fluid/tests/unittests/test_dataloader_dataset.py @@ -14,9 +14,12 @@ from __future__ import division +import sys import unittest import numpy as np +import paddle +import paddle.vision.transforms as transforms import paddle.fluid as fluid from paddle.io import * @@ -37,5 +40,48 @@ class TestDatasetAbstract(unittest.TestCase): pass +class TestDatasetWithDiffOutputPlace(unittest.TestCase): + def get_dataloader(self, num_workers): + dataset = paddle.vision.datasets.MNIST( + mode='test', transform=transforms.ToTensor()) + loader = paddle.io.DataLoader( + dataset, batch_size=32, num_workers=num_workers, shuffle=True) + return loader + + def run_check_on_cpu(self): + paddle.set_device('cpu') + loader = self.get_dataloader(0) + for image, label in loader: + self.assertTrue(image.place.is_cpu_place()) + self.assertTrue(label.place.is_cpu_place()) + break + + def test_single_process(self): + self.run_check_on_cpu() + if paddle.is_compiled_with_cuda(): + # Get (image, label) tuple from MNIST dataset + # - the image is on CUDAPlace, label is on CPUPlace + paddle.set_device('gpu') + loader = self.get_dataloader(0) + for image, label in loader: + self.assertTrue(image.place.is_gpu_place()) + self.assertTrue(label.place.is_cuda_pinned_place()) + break + + def test_multi_process(self): + # DataLoader with multi-process mode is not supported on MacOs and Windows currently + if sys.platform != 'darwin' and sys.platform != 'win32': + self.run_check_on_cpu() + if paddle.is_compiled_with_cuda(): + # Get (image, label) tuple from MNIST dataset + # - the image and label are on CPUPlace + paddle.set_device('gpu') + loader = self.get_dataloader(1) + for image, label in loader: + self.assertTrue(image.place.is_cuda_pinned_place()) + self.assertTrue(label.place.is_cuda_pinned_place()) + break + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_diagflat.py b/python/paddle/fluid/tests/unittests/test_diagflat.py new file mode 100644 index 0000000000000000000000000000000000000000..ec74855ba25232e6f96f4fb062bb80aaf0c6b44a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_diagflat.py @@ -0,0 +1,109 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import paddle +from paddle.static import Program, program_guard + + +class TestDiagFlatError(unittest.TestCase): + def test_errors(self): + paddle.enable_static() + with program_guard(Program(), Program()): + + def test_diagflat_type(): + x = [1, 2, 3] + output = paddle.diagflat(x) + + self.assertRaises(TypeError, test_diagflat_type) + + x = paddle.static.data('data', [3, 3]) + self.assertRaises(TypeError, paddle.diagflat, x, offset=2.5) + + +class TestDiagFlatAPI(unittest.TestCase): + def setUp(self): + self.input_np = np.random.random(size=(10, 10)).astype(np.float64) + self.expected0 = np.diagflat(self.input_np) + self.expected1 = np.diagflat(self.input_np, k=1) + self.expected2 = np.diagflat(self.input_np, k=-1) + + self.input_np2 = np.random.random(size=(20)).astype(np.float64) + self.expected3 = np.diagflat(self.input_np2) + self.expected4 = np.diagflat(self.input_np2, k=1) + self.expected5 = np.diagflat(self.input_np2, k=-1) + + def run_imperative(self): + x = paddle.to_tensor(self.input_np) + y = paddle.diagflat(x) + self.assertTrue(np.allclose(y.numpy(), self.expected0)) + + y = paddle.diagflat(x, offset=1) + self.assertTrue(np.allclose(y.numpy(), self.expected1)) + + y = paddle.diagflat(x, offset=-1) + self.assertTrue(np.allclose(y.numpy(), self.expected2)) + + x = paddle.to_tensor(self.input_np2) + y = paddle.diagflat(x) + self.assertTrue(np.allclose(y.numpy(), self.expected3)) + + y = paddle.diagflat(x, offset=1) + self.assertTrue(np.allclose(y.numpy(), self.expected4)) + + y = paddle.diagflat(x, offset=-1) + self.assertTrue(np.allclose(y.numpy(), self.expected5)) + + def run_static(self, use_gpu=False): + x = paddle.static.data(name='input', shape=[10, 10], dtype='float64') + x2 = paddle.static.data(name='input2', shape=[20], dtype='float64') + result0 = paddle.diagflat(x) + result3 = paddle.diagflat(x2) + + place = paddle.CUDAPlace(0) if use_gpu else paddle.CPUPlace() + exe = paddle.static.Executor(place) + exe.run(paddle.static.default_startup_program()) + res0, res3 = exe.run( + feed={"input": self.input_np, + 'input2': self.input_np2}, + fetch_list=[result0, result3]) + + self.assertTrue(np.allclose(res0, self.expected0)) + self.assertTrue(np.allclose(res3, self.expected3)) + + def test_cpu(self): + paddle.disable_static(place=paddle.CPUPlace()) + self.run_imperative() + paddle.enable_static() + + with paddle.static.program_guard(Program()): + self.run_static() + + def test_gpu(self): + if not paddle.is_compiled_with_cuda(): + return + + paddle.disable_static(place=paddle.CUDAPlace(0)) + self.run_imperative() + paddle.enable_static() + + with paddle.static.program_guard(Program()): + self.run_static(use_gpu=True) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dyn_rnn.py b/python/paddle/fluid/tests/unittests/test_dyn_rnn.py index 84fee8ace3ec427e81b12d36f32562dd0ab8c954..1cf0c145f830dec2c3438fa22f34b7bdf3522875 100644 --- a/python/paddle/fluid/tests/unittests/test_dyn_rnn.py +++ b/python/paddle/fluid/tests/unittests/test_dyn_rnn.py @@ -333,7 +333,7 @@ class TestDynamicRNNErrors(unittest.TestCase): hidden = fluid.layers.fc(input=[word, memory], size=10, act='tanh') - out = np.ones(1).astype('float32') + out = numpy.ones(1).astype('float32') drnn.update_memory(ex_mem=memory, new_mem=hidden) drnn.output(hidden, out) diff --git 
a/python/paddle/fluid/tests/unittests/test_eager_deletion_delete_vars.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_delete_vars.py index 835f693ab6d7d8c63dfea655ceca29ce6b056e61..1590d866b1c73c88ed2c3fbd1fceb90831954b13 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_delete_vars.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_delete_vars.py @@ -145,7 +145,7 @@ class TestExecutor(unittest.TestCase): def pe_main(self): image, label, loss = simple_fc_net() loss.persistable = False - persitables, non_persistables = get_persistables_and_non_persistables( + persistables, non_persistables = get_persistables_and_non_persistables( fluid.default_main_program(), [loss.name]) exe = fluid.Executor(self.place) diff --git a/python/paddle/fluid/tests/unittests/test_fc_op.py b/python/paddle/fluid/tests/unittests/test_fc_op.py index 3bbc8df1882275cb1361426dbe3dcd1f5c3424d5..22126ce41d05cc3991cc78bc21fa9ac47b816640 100644 --- a/python/paddle/fluid/tests/unittests/test_fc_op.py +++ b/python/paddle/fluid/tests/unittests/test_fc_op.py @@ -138,6 +138,7 @@ class TestFcOp_NumFlattenDims_NegOne(unittest.TestCase): def test_api(self): def run_program(num_flatten_dims): paddle.seed(SEED) + np.random.seed(SEED) startup_program = Program() main_program = Program() @@ -158,6 +159,7 @@ class TestFcOp_NumFlattenDims_NegOne(unittest.TestCase): exe = fluid.Executor(place=place) exe.run(startup_program) out = exe.run(main_program, feed={"x": input}, fetch_list=[out]) + return out res_1 = run_program(-1) res_2 = run_program(2) diff --git a/python/paddle/fluid/tests/unittests/test_gradient_clip.py b/python/paddle/fluid/tests/unittests/test_gradient_clip.py index f258e830b5fe5f89ec419e1192fae37c5205db34..14f5d4a41a1fed1a81436d0372759db86fc7d1a0 100644 --- a/python/paddle/fluid/tests/unittests/test_gradient_clip.py +++ b/python/paddle/fluid/tests/unittests/test_gradient_clip.py @@ -133,7 +133,7 @@ class TestGradientClip(unittest.TestCase): print(val) self.assertFalse(np.isnan(val)) - def backward_and_optimize(cost): + def backward_and_optimize(self, cost): pass diff --git a/python/paddle/fluid/tests/unittests/test_memcpy_op.py b/python/paddle/fluid/tests/unittests/test_memcpy_op.py index a089b33b8ea63239652ab9799896881a71128049..38e9379bc166777b4eb6b84a0ecea9306ad75343 100755 --- a/python/paddle/fluid/tests/unittests/test_memcpy_op.py +++ b/python/paddle/fluid/tests/unittests/test_memcpy_op.py @@ -171,6 +171,14 @@ class TestMemcpyOPError(unittest.TestCase): fetch_list=[lod_tensor_var.name, pinned_var.name]) +class TestMemcpyApi(unittest.TestCase): + def test_api(self): + a = paddle.ones([1024, 1024]) + b = paddle.tensor.creation._memcpy(a, paddle.CUDAPinnedPlace()) + self.assertEqual(b.place.__repr__(), "CUDAPinnedPlace") + self.assertTrue(np.array_equal(a.numpy(), b.numpy())) + + if __name__ == '__main__': paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_momentum_op.py b/python/paddle/fluid/tests/unittests/test_momentum_op.py index e31587b225ebae4e5a72faa43a2e3bc31263d0d1..e79f6e5eb4a0696440245bb60f46397b4629734a 100644 --- a/python/paddle/fluid/tests/unittests/test_momentum_op.py +++ b/python/paddle/fluid/tests/unittests/test_momentum_op.py @@ -613,6 +613,77 @@ class TestMomentumOpWithDecayAPI(unittest.TestCase): exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) +class TestFusedMomentumWithDecayAPI(unittest.TestCase): + def get_program(self, weight_attr, bias_attr=False): + main_program = paddle.static.Program() + 
startup_program = paddle.static.Program() + with paddle.static.program_guard( + main_program=main_program, startup_program=startup_program): + x = paddle.static.data(name='x', shape=[10, 10]) + linear = paddle.nn.Linear( + 10, 10, weight_attr=weight_attr, bias_attr=bias_attr) + out = linear(x) + loss = paddle.mean(out) + optimizer = paddle.optimizer.Momentum( + learning_rate=0.01, + momentum=0.9, + weight_decay=paddle.regularizer.L2Decay(0.5)) + optimizer.minimize(loss) + return main_program + + def test_param_has_l2decay(self): + paddle.enable_static() + weight_attr = paddle.ParamAttr( + name="weight", + initializer=paddle.nn.initializer.Constant(value=0.5), + regularizer=paddle.regularizer.L2Decay(0.1)) + program = self.get_program(weight_attr, bias_attr=False) + ops = program.global_block().ops + + self.assertEqual(ops[-1].attr('regularization_method'), 'l2_decay') + self.assertEqual(ops[-1].attr('regularization_coeff'), np.float32(0.1)) + for i in range(len(ops)): + self.assertTrue('sum' not in ops[i].type) + self.assertTrue('scale' not in ops[i].type) + + def test_param_has_l1decay(self): + paddle.enable_static() + weight_attr = paddle.ParamAttr( + name="weight", + initializer=paddle.nn.initializer.Constant(value=0.5), + regularizer=paddle.regularizer.L1Decay(0.1)) + bias_attr = paddle.ParamAttr( + name="bias", + initializer=paddle.nn.initializer.Constant(value=0.), + regularizer=None) + program = self.get_program(weight_attr, bias_attr) + ops = program.global_block().ops + + self.assertEqual(ops[-1].type, 'momentum') + self.assertEqual(ops[-2].type, 'momentum') + self.assertEqual(ops[-3].type, 'sum') + self.assertEqual(ops[-4].type, 'scale') + self.assertEqual(ops[-5].type, 'sign') + self.assertEqual(ops[-6].type, 'matmul_grad') + if 'weight' in ops[-1].input('Param'): + self.assertEqual(ops[-1].attr('regularization_method'), '') + self.assertEqual(ops[-1].attr('regularization_coeff'), 0) + if 'bias' in ops[-2].input('Param'): + self.assertEqual(ops[-2].attr('regularization_method'), 'l2_decay') + self.assertEqual(ops[-2].attr('regularization_coeff'), + np.float32(0.5)) + + def test_param_has_no_regularizer(self): + paddle.enable_static() + program = self.get_program(weight_attr=None) + ops = program.global_block().ops + self.assertEqual(ops[-1].attr('regularization_method'), 'l2_decay') + self.assertEqual(ops[-1].attr('regularization_coeff'), np.float32(0.5)) + for i in range(len(ops)): + self.assertTrue('sum' not in ops[i].type) + self.assertTrue('scale' not in ops[i].type) + + class TestMomentumOpVsMomentumOpWithDecayAPI(unittest.TestCase): def __update_params(self, momentum, linear): for i in range(10): diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py index 977882543a8886c38c6ed98290462664b397d013..4c69d003d80f8b7153f9409cbcf789dfe80d2d44 100755 --- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py +++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py @@ -330,5 +330,19 @@ class TestComplextDataset(unittest.TestCase): self.run_main(num_workers) +class TestDataLoaderGenerateStates(unittest.TestCase): + def setUp(self): + self.inputs = [(0, 1), (0, 2), (1, 3)] + self.outputs = [[1835504127, 1731038949, 1320224556, 2330041505], + [2834126987, 2358157858, 1860244682, 1437227251], + [457190280, 2660306227, 859341110, 354512857]] + + def test_main(self): + from paddle.fluid.dataloader.worker import _generate_states 
+ for inp, outp in zip(self.inputs, self.outputs): + out = _generate_states(*inp) + assert out == outp + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_reader_exception.py b/python/paddle/fluid/tests/unittests/test_multiprocess_reader_exception.py index 95e2462a2e2989d36f574ea81f7103e2188a068d..c3b53e81a66659863139f89cbac407022fa3c9b7 100644 --- a/python/paddle/fluid/tests/unittests/test_multiprocess_reader_exception.py +++ b/python/paddle/fluid/tests/unittests/test_multiprocess_reader_exception.py @@ -25,7 +25,7 @@ class ReaderException(Exception): pass -class TestMultiprocessReaderException(unittest.TestCase): +class TestMultiprocessReaderExceptionWithQueueSuccess(unittest.TestCase): def setUp(self): self.use_pipe = False self.raise_exception = False @@ -36,7 +36,7 @@ class TestMultiprocessReaderException(unittest.TestCase): else: return [fluid.CPUPlace()] - def main_impl(self, place, iterable, use_legacy_py_reader): + def main_impl(self, place, iterable): sample_num = 40 batch_size = 4 @@ -53,37 +53,25 @@ class TestMultiprocessReaderException(unittest.TestCase): return __impl__ with fluid.program_guard(fluid.Program(), fluid.Program()): - if not use_legacy_py_reader: - image = fluid.data( - name='image', dtype='float32', shape=[None, 10]) - - reader = fluid.io.PyReader( - feed_list=[image], capacity=2, iterable=iterable) - else: - reader = fluid.layers.py_reader( - capacity=2, shapes=[[-1, 10], ], dtypes=['float32', ]) - image = fluid.layers.read_file(reader) + image = fluid.data(name='image', dtype='float32', shape=[None, 10]) + reader = fluid.io.DataLoader.from_generator( + feed_list=[image], capacity=2, iterable=iterable) image_p_1 = image + 1 decorated_reader = multiprocess_reader( [fake_reader(), fake_reader()], use_pipe=self.use_pipe) - if use_legacy_py_reader: - reader.decorate_paddle_reader( - fluid.io.batch( - decorated_reader, batch_size=batch_size)) + if isinstance(place, fluid.CUDAPlace): + reader.set_sample_generator( + decorated_reader, + batch_size=batch_size, + places=fluid.cuda_places(0)) else: - if isinstance(place, fluid.CUDAPlace): - reader.decorate_sample_generator( - decorated_reader, - batch_size=batch_size, - places=fluid.cuda_places(0)) - else: - reader.decorate_sample_generator( - decorated_reader, - batch_size=batch_size, - places=fluid.cpu_places()) + reader.set_sample_generator( + decorated_reader, + batch_size=batch_size, + places=fluid.cpu_places(1)) exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) @@ -97,9 +85,9 @@ class TestMultiprocessReaderException(unittest.TestCase): for data in reader(): exe.run(feed=data, fetch_list=[image_p_1]) num += 1 - self.assertEquals(num, batch_num) + self.assertEqual(num, batch_num) except SystemError as ex: - self.assertEquals(num, 0) + self.assertEqual(num, 0) raise ReaderException() else: for _ in range(3): @@ -112,40 +100,40 @@ class TestMultiprocessReaderException(unittest.TestCase): except fluid.core.EOFException: reader.reset() self.assertFalse(self.raise_exception) - self.assertEquals(num, batch_num) + self.assertEqual(num, batch_num) except SystemError as ex: self.assertTrue(self.raise_exception) - self.assertEquals(num, 0) + self.assertEqual(num, 0) raise ReaderException() def test_main(self): for p in self.places(): for iterable in [False, True]: - use_legacy_py_reader_range = [False - ] if iterable else [False, True] - for use_legacy_py_reader in use_legacy_py_reader_range: - try: - with fluid.scope_guard(fluid.Scope()): - 
self.main_impl(p, iterable, use_legacy_py_reader) + try: + with fluid.scope_guard(fluid.Scope()): + self.main_impl(p, iterable) - self.assertTrue(not self.raise_exception) - except ReaderException: - self.assertTrue(self.raise_exception) + self.assertTrue(not self.raise_exception) + except ReaderException: + self.assertTrue(self.raise_exception) -class TestCase1(TestMultiprocessReaderException): +class TestMultiprocessReaderExceptionWithQueueFailed( + TestMultiprocessReaderExceptionWithQueueSuccess): def setUp(self): self.use_pipe = False self.raise_exception = True -class TestCase2(TestMultiprocessReaderException): +class TestMultiprocessReaderExceptionWithPipeSuccess( + TestMultiprocessReaderExceptionWithQueueSuccess): def setUp(self): self.use_pipe = True self.raise_exception = False -class TestCase3(TestMultiprocessReaderException): +class TestMultiprocessReaderExceptionWithPipeFailed( + TestMultiprocessReaderExceptionWithQueueSuccess): def setUp(self): self.use_pipe = True self.raise_exception = True diff --git a/python/paddle/fluid/tests/unittests/test_nan_inf.py b/python/paddle/fluid/tests/unittests/test_nan_inf.py index 1673002cb79045855cfc76d080a0697f8ef7b396..cb7e673c6ca29c7d089a9c4cdc033d3eae9cacd3 100644 --- a/python/paddle/fluid/tests/unittests/test_nan_inf.py +++ b/python/paddle/fluid/tests/unittests/test_nan_inf.py @@ -29,11 +29,10 @@ class TestNanInf(unittest.TestCase): self._python_interp = sys.executable if os.getenv('WITH_COVERAGE', 'OFF') == 'ON': self._python_interp += " -m coverage run --branch -p" - self._python_interp += " check_nan_inf_base.py" self.env = os.environ.copy() - def test_nan_inf(self): + def check_nan_inf(self): cmd = self._python_interp proc = subprocess.Popen( @@ -53,6 +52,14 @@ class TestNanInf(unittest.TestCase): assert (out + err ).find('There are `nan` or `inf` in tensor'.encode()) != -1 + def test_nan_inf_in_static_mode(self): + self._python_interp += " check_nan_inf_base.py" + self.check_nan_inf() + + def test_nan_inf_in_dynamic_mode(self): + self._python_interp += " check_nan_inf_base_dygraph.py" + self.check_nan_inf() + class TestNanInfEnv(TestNanInf): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/test_neg_op.py b/python/paddle/fluid/tests/unittests/test_neg_op.py new file mode 100644 index 0000000000000000000000000000000000000000..e7b16bde023578c7d63ff9c1168caebb2921523c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_neg_op.py @@ -0,0 +1,91 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import numpy as np +import paddle + + +class TestNegOp(unittest.TestCase): + def setUp(self): + self.init_dtype_type() + self.input = (np.random.random((32, 8)) * 100).astype(self.dtype) + + def init_dtype_type(self): + self.dtype = np.float64 + + def run_imperative(self): + input = paddle.to_tensor(self.input) + dy_result = paddle.neg(input) + expected_result = np.negative(self.input) + self.assertTrue(np.allclose(dy_result.numpy(), expected_result)) + + def run_static(self, use_gpu=False): + input = paddle.fluid.data(name='input', shape=[32, 8], dtype=self.dtype) + result = paddle.neg(input) + + place = paddle.CUDAPlace(0) if use_gpu else paddle.CPUPlace() + exe = paddle.static.Executor(place) + exe.run(paddle.static.default_startup_program()) + st_result = exe.run(feed={"input": self.input}, fetch_list=[result]) + expected_result = np.negative(self.input) + self.assertTrue(np.allclose(st_result[0], expected_result)) + + def test_cpu(self): + paddle.disable_static(place=paddle.CPUPlace()) + self.run_imperative() + paddle.enable_static() + + with paddle.static.program_guard(paddle.static.Program()): + self.run_static() + + def test_gpu(self): + if not paddle.fluid.core.is_compiled_with_cuda(): + return + + paddle.disable_static(place=paddle.CUDAPlace(0)) + self.run_imperative() + paddle.enable_static() + + with paddle.static.program_guard(paddle.static.Program()): + self.run_static(use_gpu=True) + + +class TestNegOpFp32(TestNegOp): + def init_dtype_type(self): + self.dtype = np.float32 + + +class TestNegOpInt64(TestNegOp): + def init_dtype_type(self): + self.dtype = np.int64 + + +class TestNegOpInt32(TestNegOp): + def init_dtype_type(self): + self.dtype = np.int32 + + +class TestNegOpInt16(TestNegOp): + def init_dtype_type(self): + self.dtype = np.int16 + + +class TestNegOpInt8(TestNegOp): + def init_dtype_type(self): + self.dtype = np.int8 + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_nn_quant_functional_layers.py b/python/paddle/fluid/tests/unittests/test_nn_quant_functional_layers.py new file mode 100644 index 0000000000000000000000000000000000000000..86dc43bacf86be8aff306ed630f4ad66a51adb55 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_nn_quant_functional_layers.py @@ -0,0 +1,87 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +import paddle + + +class TestFunctionalLayers(unittest.TestCase): + """ + """ + + def setUp(self): + paddle.disable_static() + np.random.seed(1) + + shape = [3, 100, 120] + self.x = paddle.to_tensor(np.random.random(shape)) + self.y = paddle.to_tensor(np.random.random(shape)) + + def check(self, x, y): + self.assertTrue(np.allclose(x.numpy(), y.numpy())) + + def test_quant_add(self): + out_1 = paddle.add(self.x, self.y) + out_2 = paddle.nn.quant.add()(self.x, self.y) + self.check(out_1, out_2) + + def test_quant_subtract(self): + out_1 = paddle.subtract(self.x, self.y) + out_2 = paddle.nn.quant.subtract()(self.x, self.y) + self.check(out_1, out_2) + + def test_quant_multiply(self): + out_1 = paddle.multiply(self.x, self.y) + out_2 = paddle.nn.quant.multiply()(self.x, self.y) + self.check(out_1, out_2) + + def test_quant_divide(self): + out_1 = paddle.divide(self.x, self.y) + out_2 = paddle.nn.quant.divide()(self.x, self.y) + self.check(out_1, out_2) + + def test_quant_reshape(self): + reshape = [120, 300] + out_1 = paddle.reshape(self.x, reshape) + out_2 = paddle.nn.quant.reshape()(self.x.clone(), reshape) + self.check(out_1, out_2) + self.assertTrue(out_1.shape == out_2.shape) + + def test_quant_transpose(self): + perm = [1, 2, 0] + out_1 = paddle.transpose(self.x, perm) + out_2 = paddle.nn.quant.transpose()(self.x.clone(), perm) + self.check(out_1, out_2) + self.assertTrue(out_1.shape == out_2.shape) + + def test_quant_concat(self): + out_1 = paddle.concat([self.x, self.y], axis=0) + out_2 = paddle.nn.quant.concat()([self.x, self.y], 0) + self.check(out_1, out_2) + self.assertTrue(out_1.shape == out_2.shape) + + def test_quant_flatten(self): + start_axis = 1 + end_axis = 2 + out_1 = paddle.flatten(self.x, start_axis, end_axis) + out_2 = paddle.nn.quant.flatten()(self.x.clone(), start_axis, end_axis) + self.check(out_1, out_2) + self.assertTrue(out_1.shape == out_2.shape) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_onnx_export.py b/python/paddle/fluid/tests/unittests/test_onnx_export.py index 79d36063d77d5b128fdf3ba4a8a4fd711b226779..0985ed33af376c40e64ee93e636650a881970783 100644 --- a/python/paddle/fluid/tests/unittests/test_onnx_export.py +++ b/python/paddle/fluid/tests/unittests/test_onnx_export.py @@ -47,7 +47,7 @@ class TestExportWithTensor(unittest.TestCase): self.x_spec = paddle.static.InputSpec( shape=[None, 128], dtype='float32') - def test_with_tensor(): + def test_with_tensor(self): model = LinearNet() paddle.onnx.export(model, 'linear_net', input_spec=[self.x_spec]) diff --git a/python/paddle/fluid/tests/unittests/test_paddle_save_load.py b/python/paddle/fluid/tests/unittests/test_paddle_save_load.py index be2a6a653cc6f40e6e606200b307c7755a8f9559..594d0db035c6a5f71a5e07ca9547e66cfe58771e 100644 --- a/python/paddle/fluid/tests/unittests/test_paddle_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_paddle_save_load.py @@ -19,6 +19,7 @@ import numpy as np import os import sys import six +from io import BytesIO import paddle import paddle.nn as nn @@ -760,6 +761,71 @@ class TestSaveLoadAny(unittest.TestCase): self.assertTrue(np.array_equal(origin_array, load_tensor_array)) +class TestSaveLoadToMemory(unittest.TestCase): + def test_dygraph_save_to_memory(self): + paddle.disable_static() + linear = LinearNet() + state_dict = linear.state_dict() + byio = BytesIO() + paddle.save(state_dict, byio) + tensor = paddle.randn([2, 3], 
dtype='float32') + paddle.save(tensor, byio) + byio.seek(0) + # load state_dict + dict_load = paddle.load(byio, return_numpy=True) + for k, v in state_dict.items(): + self.assertTrue(np.array_equal(v.numpy(), dict_load[k])) + # load tensor + tensor_load = paddle.load(byio, return_numpy=True) + self.assertTrue(np.array_equal(tensor_load, tensor.numpy())) + + with self.assertRaises(ValueError): + paddle.save(4, 3) + with self.assertRaises(ValueError): + paddle.save(state_dict, '') + with self.assertRaises(ValueError): + paddle.fluid.io._open_file_buffer('temp', 'b') + + def test_static_save_to_memory(self): + paddle.enable_static() + with new_program_scope(): + # create network + x = paddle.static.data( + name="x", shape=[None, IMAGE_SIZE], dtype='float32') + z = paddle.static.nn.fc(x, 10, bias_attr=False) + z = paddle.static.nn.fc(z, 128, bias_attr=False) + loss = fluid.layers.reduce_mean(z) + place = fluid.CPUPlace( + ) if not paddle.fluid.core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0) + prog = paddle.static.default_main_program() + exe = paddle.static.Executor(place) + exe.run(paddle.static.default_startup_program()) + + state_dict = prog.state_dict() + keys = list(state_dict.keys()) + tensor = state_dict[keys[0]] + + byio = BytesIO() + byio2 = BytesIO() + paddle.save(prog, byio2) + paddle.save(tensor, byio) + paddle.save(state_dict, byio) + byio.seek(0) + byio2.seek(0) + + prog_load = paddle.load(byio2) + self.assertTrue(prog.desc.serialize_to_string() == + prog_load.desc.serialize_to_string()) + + tensor_load = paddle.load(byio, return_numpy=True) + self.assertTrue(np.array_equal(tensor_load, np.array(tensor))) + + state_dict_load = paddle.load(byio, return_numpy=True) + for k, v in state_dict.items(): + self.assertTrue(np.array_equal(np.array(v), state_dict_load[k])) + + class TestSaveLoad(unittest.TestCase): def setUp(self): # enable dygraph mode diff --git a/python/paddle/fluid/tests/unittests/test_paddle_save_load_binary.py b/python/paddle/fluid/tests/unittests/test_paddle_save_load_binary.py index 7385da56beab3eb9517e10f7cd6e37741daffbdf..0b9e038f7cd95e0584c26b003af03f7238b0f6b4 100644 --- a/python/paddle/fluid/tests/unittests/test_paddle_save_load_binary.py +++ b/python/paddle/fluid/tests/unittests/test_paddle_save_load_binary.py @@ -16,6 +16,7 @@ from __future__ import print_function import unittest import numpy as np +from io import BytesIO import os import sys import six @@ -176,13 +177,27 @@ class TestSaveLoadBinaryFormat(unittest.TestCase): paddle.save(temp_lod, path, use_binary_format=True) with self.assertRaises(RuntimeError): - fluid.core._save_lod_tensor( + fluid.core.save_lod_tensor( temp_lod, 'test_save_load_error_not_exist_file/not_exist_file') with self.assertRaises(RuntimeError): - fluid.core._load_lod_tensor( + fluid.core.load_lod_tensor( temp_lod, 'test_save_load_error_not_exist_file/not_exist_file') + # save to memory + byio = BytesIO() + paddle.save(tensor, byio, use_binary_format=True) + byio.seek(0) + # load from memory + loaded_tensor_mem = paddle.load(byio) + to_array_mem = np.array(loaded_tensor_mem) + self.assertTrue(np.array_equal(np.array(tensor), to_array_mem)) + + with self.assertRaises(NotImplementedError): + paddle.framework.io._save_lod_tensor(tensor, 1) + with self.assertRaises(NotImplementedError): + paddle.framework.io._load_lod_tensor(1) + def test_save_load_selected_rows(self): paddle.enable_static() place = fluid.CPUPlace() if not paddle.fluid.core.is_compiled_with_cuda( @@ -210,10 +225,28 @@ class 
TestSaveLoadBinaryFormat(unittest.TestCase): np.array_equal(np.array(load_sr.get_tensor()), np_array)) with self.assertRaises(RuntimeError): - fluid.core._save_selected_rows( + fluid.core.save_selected_rows( selected_rows, 'test_paddle_save_load_selected_rows_not_exist_file/temp') with self.assertRaises(RuntimeError): - fluid.core._load_selected_rows( + fluid.core.load_selected_rows( selected_rows, 'test_paddle_save_load_selected_rows_not_exist_file/temp') + + # save to memory + byio = BytesIO() + paddle.save(selected_rows, byio, use_binary_format=True) + byio.seek(0) + # load from memory + selected_rows_mem = paddle.load(byio) + to_array_mem = np.array(selected_rows_mem) + self.assertTrue(isinstance(selected_rows_mem, fluid.core.SelectedRows)) + self.assertTrue(list(selected_rows_mem.rows()) == rows) + self.assertTrue(selected_rows_mem.height() == height) + self.assertTrue( + np.array_equal(np.array(selected_rows_mem.get_tensor()), np_array)) + + with self.assertRaises(NotImplementedError): + paddle.framework.io._save_selected_rows(selected_rows, 1) + with self.assertRaises(NotImplementedError): + paddle.framework.io._load_selected_rows(1) diff --git a/python/paddle/fluid/tests/unittests/test_pipeline.py b/python/paddle/fluid/tests/unittests/test_pipeline.py index cd592416c1a512a1fc95143efb5817b1d3a74561..1be10113a5591cc10671c1a63215d1f7617d4239 100644 --- a/python/paddle/fluid/tests/unittests/test_pipeline.py +++ b/python/paddle/fluid/tests/unittests/test_pipeline.py @@ -44,6 +44,15 @@ class TestPipeline(TestDistBase): check_error_log=True, log_name=flag_name) + def test_dist_train_multi_device(self): + import paddle.fluid as fluid + if fluid.core.is_compiled_with_cuda(): + self.check_with_place( + "pipeline_mnist_multi_device.py", + check_error_log=True, + delta=1e0, + log_name=flag_name) + def test_dist_train_one_device(self): import paddle.fluid as fluid if fluid.core.is_compiled_with_cuda(): diff --git a/python/paddle/fluid/tests/unittests/test_pylayer_op.py b/python/paddle/fluid/tests/unittests/test_pylayer_op.py index e058115d691993781d7f6d0fb9aa20b633ab60d9..a852b4c90421acb7865abf2aeb58f7a0b346bf41 100644 --- a/python/paddle/fluid/tests/unittests/test_pylayer_op.py +++ b/python/paddle/fluid/tests/unittests/test_pylayer_op.py @@ -21,6 +21,11 @@ import paddle from paddle.autograd import PyLayer +class FakeTensor(paddle.fluid.core.VarBase): + def __init__(self): + pass + + class TestPyLayer(unittest.TestCase): def test_simple_pylayer_multiple_output(self): class tanh(PyLayer): @@ -426,6 +431,129 @@ class TestPyLayer(unittest.TestCase): z = paddle.tanh(data) z = cus_tanh.apply(data) + def test_return_to_tensor(self): + class Tanh(PyLayer): + @staticmethod + def forward(ctx, x1): + y1 = paddle.tanh(x1) + ctx.save_for_backward(y1) + tensor_1 = paddle.to_tensor([1, 2], dtype='float32') + return y1, 5, None, "helloworld", tensor_1 + + @staticmethod + def backward(ctx, dy1, dy2): + y1, = ctx.saved_tensor() + re1 = dy1 * (1 - paddle.square(y1)) + return dy1 + + input1 = paddle.randn([2, 3]).astype("float32") + input2 = input1.detach().clone() + input1.stop_gradient = False + input2.stop_gradient = False + z, number, none_item, string_item, tensor1 = Tanh.apply(x1=input1) + z.mean().backward() + + +class TestPyLayerReturnType(unittest.TestCase): + def test_forward_args_fake_tensor(self): + class Tanh(PyLayer): + @staticmethod + def forward(ctx, x1): + y1 = FakeTensor() + return y1, x1 + + @staticmethod + def backward(ctx, dy1, dy2): + return dy1 + + input1 = FakeTensor() + + with 
self.assertRaises(ValueError): + y1, y2 = Tanh.apply(input1) + + def test_forward_kwargs_fake_tensor(self): + class Tanh(PyLayer): + @staticmethod + def forward(ctx, x1): + + return x1 + + @staticmethod + def backward(ctx, dy1, dy2): + return dy1 + + input1 = FakeTensor() + + with self.assertRaises(ValueError): + y = Tanh.apply(x1=input1) + + def test_forward_return_fake_tensor(self): + class Tanh(PyLayer): + @staticmethod + def forward(ctx, x1): + + return FakeTensor() + + @staticmethod + def backward(ctx, dy1, dy2): + return dy1 + + input1 = paddle.randn([3, 2]) + + with self.assertRaises(ValueError): + y = Tanh.apply(x1=input1) + + def test_forward_return_fake_tensor_tuple(self): + class Tanh(PyLayer): + @staticmethod + def forward(ctx, x1): + + return FakeTensor(), FakeTensor() + + @staticmethod + def backward(ctx, dy1, dy2): + return dy1 + + input1 = paddle.randn([3, 2]) + + with self.assertRaises(ValueError): + y = Tanh.apply(x1=input1) + + def test_backward_return_fake_tensor_tuple(self): + class Tanh(PyLayer): + @staticmethod + def forward(ctx, x1, x2): + return x1 + 1, x1 + 2 + + @staticmethod + def backward(ctx, dy1, dy2): + + return FakeTensor(), 2 + + input1 = paddle.randn([3, 2]) + input1.stop_gradient = False + y, _ = Tanh.apply(input1, 1 + input1) + + with self.assertRaises(ValueError): + y.mean().backward() + + def test_backward_return_fake_tensor(self): + class Tanh(PyLayer): + @staticmethod + def forward(ctx, x1): + return x1 + 1, x1 + 2 + + @staticmethod + def backward(ctx, dy1, dy2): + return FakeTensor() + + input1 = paddle.randn([3, 2]) + input1.stop_gradient = False + y, _ = Tanh.apply(input1) + + with self.assertRaises(ValueError): + y.mean().backward() + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_raw_program_optimizer.py b/python/paddle/fluid/tests/unittests/test_raw_program_optimizer.py new file mode 100644 index 0000000000000000000000000000000000000000..34930e3577b9b561e80f15ee336e31ec19987170 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_raw_program_optimizer.py @@ -0,0 +1,77 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
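+# This test builds a small MLP with fleet's `without_graph_optimization`
+# strategy enabled and runs one training step on a single GPU to make sure
+# the raw-program optimizer path executes end to end.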
+ +from __future__ import print_function + +import unittest + +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +import paddle.distributed.fleet as fleet +import numpy as np +import os + + +class TestRawProgramOptimizer(unittest.TestCase): + def setUp(self): + os.environ["PADDLE_TRAINER_ID"] = "0" + os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001" + + def mlp(self, input_x, input_y, hid_dim=128, label_dim=2): + fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim, activation='tanh') + fc_2 = paddle.static.nn.fc(x=fc_1, size=hid_dim, activation='tanh') + prediction = paddle.static.nn.fc(x=[fc_2], + size=label_dim, + activation='softmax') + cost = paddle.nn.functional.cross_entropy( + input=prediction, label=input_y) + avg_cost = paddle.mean(x=cost) + return avg_cost + + def gen_data(self): + return { + "x": np.random.random(size=(128, 32)).astype('float32'), + "y": np.random.randint( + 2, size=(128, 1)).astype('int64') + } + + def test_single_gpu(self): + paddle.enable_static() + fleet.init(is_collective=True) + sharding_program = paddle.static.Program() + sharding_startup_program = paddle.static.Program() + strategy = fleet.DistributedStrategy() + strategy.without_graph_optimization = True + with fluid.program_guard(sharding_program, sharding_startup_program): + with fluid.unique_name.guard(): + input_x = paddle.static.data( + name="x", shape=[None, 32], dtype='float32') + input_y = paddle.static.data( + name="y", shape=[None, 1], dtype='int64') + cost = self.mlp(input_x=input_x, input_y=input_y) + output_name = cost.name + optimizer = fleet.distributed_optimizer(fluid.optimizer.Adam(), + strategy) + optimizer.minimize(cost) + + trainer_id = fleet.worker_index() + exe = paddle.static.Executor(paddle.CUDAPlace(trainer_id)) + rank = fleet.worker_index() + exe.run(sharding_startup_program) + exe.run(program=sharding_program, feed=self.gen_data()) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_regularizer.py b/python/paddle/fluid/tests/unittests/test_regularizer.py index edd69d67aaf4b6df11f3f59ba21dca1a53609175..08a70fe1852d02cd94a10166158d05c01627111f 100644 --- a/python/paddle/fluid/tests/unittests/test_regularizer.py +++ b/python/paddle/fluid/tests/unittests/test_regularizer.py @@ -59,6 +59,7 @@ class TestL2DecayRegularizer(unittest.TestCase): params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) count_ops = len(block.ops) + optimizer = paddle.optimizer.Adam() params_grads = optimizer.append_regularization_ops(params_grads) self.assertEqual(len(params_grads), 1) self.assertEqual(len(block.ops), count_ops + 2) @@ -97,6 +98,7 @@ class TestL1DecayRegularizer(unittest.TestCase): params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) count_ops = len(block.ops) + optimizer = paddle.optimizer.Adam() params_grads = optimizer.append_regularization_ops(params_grads) self.assertEqual(len(params_grads), 1) self.assertEqual(len(block.ops), count_ops + 3) diff --git a/python/paddle/fluid/tests/unittests/test_roi_align_op.py b/python/paddle/fluid/tests/unittests/test_roi_align_op.py index 7d030855d114ee4fcc604a935b34866fe71e8a03..7fab4017ab0ba17accef790c225699dac58e848f 100644 --- a/python/paddle/fluid/tests/unittests/test_roi_align_op.py +++ b/python/paddle/fluid/tests/unittests/test_roi_align_op.py @@ -129,8 +129,9 @@ class TestROIAlignOp(OpTest): roi_width = roi_xmax - roi_xmin roi_height = roi_ymax - roi_ymin - roi_width = max(roi_width, 1) - roi_height = 
max(roi_height, 1) + if not self.aligned: + roi_width = max(roi_width, 1) + roi_height = max(roi_height, 1) bin_size_h = float(roi_height) / float(self.pooled_height) bin_size_w = float(roi_width) / float(self.pooled_width) @@ -138,7 +139,7 @@ class TestROIAlignOp(OpTest): math.ceil(roi_height / self.pooled_height) roi_bin_grid_w = self.sampling_ratio if self.sampling_ratio > 0 else \ math.ceil(roi_width / self.pooled_width) - count = int(roi_bin_grid_h * roi_bin_grid_w) + count = max(int(roi_bin_grid_h * roi_bin_grid_w), 1) pre_size = count * self.pooled_width * self.pooled_height bilinear_pos, bilinear_w = self.pre_calc(x_i, roi_xmin, roi_ymin, int(roi_bin_grid_h), diff --git a/python/paddle/fluid/tests/unittests/test_slice_op.py b/python/paddle/fluid/tests/unittests/test_slice_op.py index bd784b65c10f00ace463d21e3331af02096523fb..b83478a5b8b0b094ed959d011202216a0bb04b63 100644 --- a/python/paddle/fluid/tests/unittests/test_slice_op.py +++ b/python/paddle/fluid/tests/unittests/test_slice_op.py @@ -22,6 +22,8 @@ import paddle.fluid as fluid import paddle.fluid.layers as layers import paddle +paddle.enable_static() + # Situation 1: starts(list, no tensor), ends(list, no tensor) # 1.1 without attr(decrease) @@ -683,6 +685,16 @@ class TestImperativeVarBaseGetItem(unittest.TestCase): self.assertRaises(Exception, test_float_in_index) +class TestInferShape(unittest.TestCase): + def test(self): + x = paddle.ones(shape=[3, 4, 5]) + x.desc.set_shape([3, -1, 5]) + self.assertEqual(x.shape, (3, -1, 5)) + + out0 = paddle.slice(x, axes=[1], starts=[0], ends=[3]) + self.assertEqual(out0.shape, (3, 3, 5)) + + @unittest.skipIf(not core.is_compiled_with_cuda(), "core is not compiled with CUDA") class TestImperativeCUDAPinnedInput(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_strided_slice_op.py b/python/paddle/fluid/tests/unittests/test_strided_slice_op.py index 71550c8f24753cbe5ba31a7eb83a67d35fb1efe6..ebf7c01e2cae5f30b45c44e20e0395b48cf57ed8 100644 --- a/python/paddle/fluid/tests/unittests/test_strided_slice_op.py +++ b/python/paddle/fluid/tests/unittests/test_strided_slice_op.py @@ -216,6 +216,71 @@ class TestStrideSliceOp13(TestStrideSliceOp): self.infer_flags = [1, 1, 1, 1, 1] +class TestStrideSliceOpBool(TestStrideSliceOp): + def test_check_grad(self): + pass + + +class TestStrideSliceOpBool1D(TestStrideSliceOpBool): + def initTestCase(self): + self.input = np.random.rand(100).astype("bool") + self.axes = [0] + self.starts = [3] + self.ends = [8] + self.strides = [1] + self.infer_flags = [1] + + +class TestStrideSliceOpBool2D(TestStrideSliceOpBool): + def initTestCase(self): + self.input = np.random.rand(10, 10).astype("bool") + self.axes = [0, 1] + self.starts = [1, 0] + self.ends = [2, 2] + self.strides = [1, 1] + self.infer_flags = [1, 1] + + +class TestStrideSliceOpBool3D(TestStrideSliceOpBool): + def initTestCase(self): + self.input = np.random.rand(3, 4, 10).astype("bool") + self.axes = [0, 1, 2] + self.starts = [0, -1, 0] + self.ends = [2, -3, 5] + self.strides = [1, -1, 1] + self.infer_flags = [1, 1, 1] + + +class TestStrideSliceOpBool4D(TestStrideSliceOpBool): + def initTestCase(self): + self.input = np.random.rand(3, 3, 3, 4).astype("bool") + self.axes = [0, 1, 2, 3] + self.starts = [1, 0, 0, 0] + self.ends = [2, 2, 3, 4] + self.strides = [1, 1, 1, 2] + self.infer_flags = [1, 1, 1, 1] + + +class TestStrideSliceOpBool5D(TestStrideSliceOpBool): + def initTestCase(self): + self.input = np.random.rand(3, 3, 3, 4, 5).astype("bool") + self.axes = [0, 1, 2, 3, 4] + 
self.starts = [1, 0, 0, 0, 0] + self.ends = [2, 2, 3, 4, 4] + self.strides = [1, 1, 1, 1, 1] + self.infer_flags = [1, 1, 1, 1] + + +class TestStrideSliceOpBool6D(TestStrideSliceOpBool): + def initTestCase(self): + self.input = np.random.rand(3, 3, 3, 6, 7, 8).astype("bool") + self.axes = [0, 1, 2, 3, 4, 5] + self.starts = [1, 0, 0, 0, 1, 2] + self.ends = [2, 2, 3, 1, 2, 8] + self.strides = [1, 1, 1, 1, 1, 2] + self.infer_flags = [1, 1, 1, 1, 1] + + class TestStridedSliceOp_starts_ListTensor(OpTest): def setUp(self): self.op_type = "strided_slice" diff --git a/python/paddle/fluid/tests/unittests/test_traced_layer_err_msg.py b/python/paddle/fluid/tests/unittests/test_traced_layer_err_msg.py index cb5186468890d8108042faba56f16d641adb663e..85d830485e23f16487bebc9cc033fae2951ed7f8 100644 --- a/python/paddle/fluid/tests/unittests/test_traced_layer_err_msg.py +++ b/python/paddle/fluid/tests/unittests/test_traced_layer_err_msg.py @@ -72,7 +72,7 @@ class TestTracedLayerErrMsg(unittest.TestCase): self.layer, 3) self.assertEqual( "The type of 'each element of inputs' in fluid.dygraph.jit.TracedLayer.trace must be fluid.Variable, but received <{} 'int'>.". - format(self.type_str, self.type_str), str(e.exception)) + format(self.type_str), str(e.exception)) with self.assertRaises(TypeError) as e: dygraph_out, traced_layer = fluid.dygraph.TracedLayer.trace( self.layer, [True, 1]) diff --git a/python/paddle/fluid/tests/unittests/test_variable.py b/python/paddle/fluid/tests/unittests/test_variable.py index 6ffecd33f8f48d69ffc7593cb684a93f2d4be226..c1956545f55ad1333124bca03608d35d43cf3fd6 100644 --- a/python/paddle/fluid/tests/unittests/test_variable.py +++ b/python/paddle/fluid/tests/unittests/test_variable.py @@ -17,6 +17,7 @@ from __future__ import print_function import unittest import paddle from paddle.fluid.framework import default_main_program, Program, convert_np_dtype_to_dtype_, in_dygraph_mode +import paddle import paddle.fluid as fluid import paddle.fluid.layers as layers import paddle.fluid.core as core @@ -218,12 +219,60 @@ class TestVariable(unittest.TestCase): self.assertTrue((result[2] == expected[2]).all()) self.assertTrue((result[3] == expected[3]).all()) + def _test_slice_index_ellipsis(self, place): + data = np.random.rand(2, 3, 4).astype("float32") + prog = paddle.static.Program() + with paddle.static.program_guard(prog): + x = paddle.assign(data) + out1 = x[0:, ..., 1:] + out2 = x[0:, ...] + out3 = x[..., 1:] + out4 = x[...] 
+ + exe = paddle.static.Executor(place) + result = exe.run(prog, fetch_list=[out1, out2, out3, out4]) + + expected = [data[0:, ..., 1:], data[0:, ...], data[..., 1:], data[...]] + + self.assertTrue((result[0] == expected[0]).all()) + self.assertTrue((result[1] == expected[1]).all()) + self.assertTrue((result[2] == expected[2]).all()) + self.assertTrue((result[3] == expected[3]).all()) + with self.assertRaises(IndexError): res = x[[1, 0], [0, 0]] with self.assertRaises(TypeError): res = x[[1.2, 0]] + def _test_slice_index_list_bool(self, place): + data = np.random.rand(2, 3).astype("float32") + prog = paddle.static.Program() + with paddle.static.program_guard(prog): + x = paddle.assign(data) + idx0 = [True, False] + idx1 = [False, True] + idx2 = [False, False] + idx3 = [True, True] + + out0 = x[idx0] + out1 = x[idx1] + out2 = x[idx2] + out3 = x[idx3] + + exe = paddle.static.Executor(place) + result = exe.run(prog, fetch_list=[out0, out1, out2, out3]) + + expected = [data[idx0], data[idx1], data[idx2], data[idx3]] + + self.assertTrue((result[0] == expected[0]).all()) + self.assertTrue((result[1] == expected[1]).all()) + self.assertTrue((result[2] == expected[2]).all()) + self.assertTrue((result[3] == expected[3]).all()) + + with self.assertRaises(TypeError): + res = x[[True, 0]] + def test_slice(self): places = [fluid.CPUPlace()] if core.is_compiled_with_cuda(): @@ -233,6 +282,8 @@ class TestVariable(unittest.TestCase): self._test_slice(place) self._test_slice_index_tensor(place) self._test_slice_index_list(place) + self._test_slice_index_ellipsis(place) + self._test_slice_index_list_bool(place) def _tostring(self): b = default_main_program().current_block() diff --git a/python/paddle/fluid/tests/unittests/test_yolo_box_op.py b/python/paddle/fluid/tests/unittests/test_yolo_box_op.py index 24c463ebfc9a1336c6a7eea19d4190db17d6f08c..5793f0148fc5475a89c3b53831bc2019af542b61 100644 --- a/python/paddle/fluid/tests/unittests/test_yolo_box_op.py +++ b/python/paddle/fluid/tests/unittests/test_yolo_box_op.py @@ -35,10 +35,16 @@ def YoloBox(x, img_size, attrs): downsample = attrs['downsample'] clip_bbox = attrs['clip_bbox'] scale_x_y = attrs['scale_x_y'] + iou_aware = attrs['iou_aware'] + iou_aware_factor = attrs['iou_aware_factor'] bias_x_y = -0.5 * (scale_x_y - 1.) input_h = downsample * h input_w = downsample * w + if iou_aware: + ioup = x[:, :an_num, :, :] + ioup = np.expand_dims(ioup, axis=-1) + x = x[:, an_num:, :, :] x = x.reshape((n, an_num, 5 + class_num, h, w)).transpose((0, 1, 3, 4, 2)) pred_box = x[:, :, :, :, :4].copy() @@ -57,7 +63,11 @@ def YoloBox(x, img_size, attrs): pred_box[:, :, :, :, 2] = np.exp(pred_box[:, :, :, :, 2]) * anchor_w pred_box[:, :, :, :, 3] = np.exp(pred_box[:, :, :, :, 3]) * anchor_h - pred_conf = sigmoid(x[:, :, :, :, 4:5]) + if iou_aware: + pred_conf = sigmoid(x[:, :, :, :, 4:5])**( + 1 - iou_aware_factor) * sigmoid(ioup)**iou_aware_factor + else: + pred_conf = sigmoid(x[:, :, :, :, 4:5]) pred_conf[pred_conf < conf_thresh] = 0. pred_score = sigmoid(x[:, :, :, :, 5:]) * pred_conf pred_box = pred_box * (pred_conf > 0.).astype('float32') @@ -97,6 +107,8 @@ class TestYoloBoxOp(OpTest): "downsample": self.downsample, "clip_bbox": self.clip_bbox, "scale_x_y": self.scale_x_y, + "iou_aware": self.iou_aware, + "iou_aware_factor": self.iou_aware_factor } self.inputs = { @@ -123,6 +135,8 @@ class TestYoloBoxOp(OpTest): self.x_shape = (self.batch_size, an_num * (5 + self.class_num), 13, 13) self.imgsize_shape = (self.batch_size, 2) self.scale_x_y = 1. 
+ self.iou_aware = False + self.iou_aware_factor = 0.5 class TestYoloBoxOpNoClipBbox(TestYoloBoxOp): @@ -137,6 +151,8 @@ class TestYoloBoxOpNoClipBbox(TestYoloBoxOp): self.x_shape = (self.batch_size, an_num * (5 + self.class_num), 13, 13) self.imgsize_shape = (self.batch_size, 2) self.scale_x_y = 1. + self.iou_aware = False + self.iou_aware_factor = 0.5 class TestYoloBoxOpScaleXY(TestYoloBoxOp): @@ -151,19 +167,36 @@ class TestYoloBoxOpScaleXY(TestYoloBoxOp): self.x_shape = (self.batch_size, an_num * (5 + self.class_num), 13, 13) self.imgsize_shape = (self.batch_size, 2) self.scale_x_y = 1.2 + self.iou_aware = False + self.iou_aware_factor = 0.5 + + +class TestYoloBoxOpIoUAware(TestYoloBoxOp): + def initTestCase(self): + self.anchors = [10, 13, 16, 30, 33, 23] + an_num = int(len(self.anchors) // 2) + self.batch_size = 32 + self.class_num = 2 + self.conf_thresh = 0.5 + self.downsample = 32 + self.clip_bbox = True + self.x_shape = (self.batch_size, an_num * (6 + self.class_num), 13, 13) + self.imgsize_shape = (self.batch_size, 2) + self.scale_x_y = 1. + self.iou_aware = True + self.iou_aware_factor = 0.5 class TestYoloBoxDygraph(unittest.TestCase): def test_dygraph(self): paddle.disable_static() - x = np.random.random([2, 14, 8, 8]).astype('float32') img_size = np.ones((2, 2)).astype('int32') - - x = paddle.to_tensor(x) img_size = paddle.to_tensor(img_size) + x1 = np.random.random([2, 14, 8, 8]).astype('float32') + x1 = paddle.to_tensor(x1) boxes, scores = paddle.vision.ops.yolo_box( - x, + x1, img_size=img_size, anchors=[10, 13, 16, 30], class_num=2, @@ -172,16 +205,30 @@ class TestYoloBoxDygraph(unittest.TestCase): clip_bbox=True, scale_x_y=1.) assert boxes is not None and scores is not None + + x2 = np.random.random([2, 16, 8, 8]).astype('float32') + x2 = paddle.to_tensor(x2) + boxes, scores = paddle.vision.ops.yolo_box( + x2, + img_size=img_size, + anchors=[10, 13, 16, 30], + class_num=2, + conf_thresh=0.01, + downsample_ratio=8, + clip_bbox=True, + scale_x_y=1., + iou_aware=True, + iou_aware_factor=0.5) paddle.enable_static() class TestYoloBoxStatic(unittest.TestCase): def test_static(self): - x = paddle.static.data('x', [2, 14, 8, 8], 'float32') + x1 = paddle.static.data('x1', [2, 14, 8, 8], 'float32') img_size = paddle.static.data('img_size', [2, 2], 'int32') boxes, scores = paddle.vision.ops.yolo_box( - x, + x1, img_size=img_size, anchors=[10, 13, 16, 30], class_num=2, @@ -191,6 +238,20 @@ class TestYoloBoxStatic(unittest.TestCase): scale_x_y=1.) assert boxes is not None and scores is not None + x2 = paddle.static.data('x2', [2, 16, 8, 8], 'float32') + boxes, scores = paddle.vision.ops.yolo_box( + x2, + img_size=img_size, + anchors=[10, 13, 16, 30], + class_num=2, + conf_thresh=0.01, + downsample_ratio=8, + clip_bbox=True, + scale_x_y=1., + iou_aware=True, + iou_aware_factor=0.5) + assert boxes is not None and scores is not None + class TestYoloBoxOpHW(TestYoloBoxOp): def initTestCase(self): @@ -204,6 +265,8 @@ class TestYoloBoxOpHW(TestYoloBoxOp): self.x_shape = (self.batch_size, an_num * (5 + self.class_num), 13, 9) self.imgsize_shape = (self.batch_size, 2) self.scale_x_y = 1. 
+ self.iou_aware = False + self.iou_aware_factor = 0.5 if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/xpu/test_pool2d_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_pool2d_op_xpu.py index bebb5c762649145cab666633ac91371ab679f551..53a91af3a716ba1f48446923e71b3f34d6df3c06 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_pool2d_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_pool2d_op_xpu.py @@ -25,6 +25,7 @@ from op_test_xpu import XPUOpTest import paddle.fluid as fluid from paddle.fluid import Program, program_guard import paddle +from test_pool2d_op import adaptive_start_index, adaptive_end_index paddle.enable_static() diff --git a/python/paddle/fluid/transpiler/collective.py b/python/paddle/fluid/transpiler/collective.py index ef6975c3d241e5de0a4dab17e88ebf6896472f32..308a876977cf4f11ba3a79f08c4729268121e6e2 100644 --- a/python/paddle/fluid/transpiler/collective.py +++ b/python/paddle/fluid/transpiler/collective.py @@ -434,9 +434,10 @@ class MultiThread(GradAllReduce): print("total endpoints: ", self.endpoints) print("rank: %d, ring_id: %d" % (self.rank, self.nrings)) for ring_id in range(self.nrings): - self._init_communicator( - self.startup_program, self.current_endpoint, self.endpoints, - self.rank, ring_id, self.wait_port, True) + self._init_communicator(self.startup_program, + self.current_endpoint, self.endpoints, + self.rank, ring_id, self.wait_port) + else: print("begin to _transpile_startup_program for single-node") block = self.startup_program.global_block() diff --git a/python/paddle/fluid/variable_index.py b/python/paddle/fluid/variable_index.py index aed8c82d43b4dda373d30916ac291b4eff8a1064..c9363dff13d81cb8817b28c7036678c7f906f7e6 100644 --- a/python/paddle/fluid/variable_index.py +++ b/python/paddle/fluid/variable_index.py @@ -112,9 +112,23 @@ def _getitem_impl_(var, item): use_strided_slice = False item, none_axes = replace_none(item) + item = replace_ellipsis(var, item) for dim, slice_item in enumerate(item): if is_integer_or_scalar_tensor(slice_item): + if isinstance(slice_item, + int) and var.shape[dim] is not None and var.shape[ + dim] >= 0 and slice_item >= var.shape[dim]: + # For python, if users write a, b = var, the __getitem__ + # method will iterate through 0, 1, 2 ... until __getitem__ + # throws an IndexError, then stop. The var[0], var[1] will + # be given to a, b respectively. If more values are given, + # the unpack size would cause error. + # + # We raises IndexError here to support grammar like `a, b = var` + raise IndexError( + "slice_item %d at dim %d should be >= 0 and < var.shape[%d]: %d" + % (slice_item, dim, dim, var.shape[dim])) decrease_axes.append(dim) start = slice_item step = 1 @@ -139,19 +153,36 @@ def _getitem_impl_(var, item): end = MAX_INTEGER if end is None else end elif isinstance(slice_item, list): + is_bool_list = False for i in slice_item: - if not isinstance(i, int): - raise TypeError("Only support int value in list") + if not isinstance(i, (int, bool)): + raise TypeError("Only support int or bool in index list.") + + if isinstance(i, bool): + is_bool_list = True + break if len(item) != 1: raise IndexError( "When index contains a list, its length must be 1, but received {}". format(len(item))) + if is_bool_list: + new_slice_item = [] + for idx, ele in enumerate(slice_item): + if not isinstance(ele, bool): + raise TypeError( + "Mixed bool index with other types is not supported." 
+ ) + + if ele is True: + new_slice_item.append(idx) + slice_item = new_slice_item + from .layers import assign from ..tensor import index_select - idx = assign(np.array(slice_item)) + idx = assign(np.array(slice_item).astype("int32")) return index_select(var, index=idx, axis=0) elif isinstance(slice_item, Variable): diff --git a/python/paddle/framework/framework.py b/python/paddle/framework/framework.py index 93056a60c371c3889868ac200f278406688bdee8..e9d690c28d60ec84cc53e7b21ec34b983828d350 100644 --- a/python/paddle/framework/framework.py +++ b/python/paddle/framework/framework.py @@ -87,8 +87,6 @@ def get_default_dtype(): @contextmanager def set_grad_enabled(mode): """ - :api_attr: imperative - Create a context which enables or disables dygraph gradient calculation. Args: @@ -96,6 +94,7 @@ def set_grad_enabled(mode): Examples: .. code-block:: python + import paddle x = paddle.ones([3, 2]) x.stop_gradient = False diff --git a/python/paddle/framework/io.py b/python/paddle/framework/io.py index 1705db50d391a9d507e25c0b8f8aa6081b170923..5f1ffa81eab17b720f9f02a9d55a8720d64aa27d 100644 --- a/python/paddle/framework/io.py +++ b/python/paddle/framework/io.py @@ -32,6 +32,7 @@ from paddle import fluid from paddle.fluid import core from paddle.fluid.io import _unpack_saved_dict, _pack_loaded_dict, _pickle_loads_mac from paddle.fluid.io import _legacy_save as _legacy_static_save +from paddle.fluid.io import _open_file_buffer, _is_file_path, _is_memory_buffer from paddle.fluid.framework import Variable, _varbase_creator, _dygraph_tracer, in_dygraph_mode, ParamBase, _current_expected_place, Program from paddle.fluid.dygraph.jit import _SaveLoadConfig @@ -450,30 +451,81 @@ def _parse_load_result(obj, return_numpy): def _save_lod_tensor(tensor, file_name): if not tensor._is_initialized(): raise ValueError("The saved tensor is not initialized.") - _seek = core._save_lod_tensor(tensor, file_name) - # '_seek' is the end position of this tensor in the file. + if _is_file_path(file_name): + _seek = core.save_lod_tensor(tensor, file_name) + # '_seek' is the end position of this tensor in the file. + + elif _is_memory_buffer(file_name): + tensor_bytes = core.save_lod_tensor_to_memory(tensor) + + with _open_file_buffer(file_name, 'wb') as f: + f.write(tensor_bytes) + _seek = f.tell() + + else: + raise NotImplementedError( + 'Only supports saving objects to file or BytesIO, but received {}'. + format(type(file_name))) return _seek def _load_lod_tensor(file_name): temp_t = paddle.fluid.core.LoDTensor() - # '_seek' is the end position of this tensor in the file. - _seek = paddle.fluid.core._load_lod_tensor(temp_t, file_name) + if _is_file_path(file_name): + # '_seek' is the end position of this tensor in the file. + _seek = paddle.fluid.core.load_lod_tensor(temp_t, file_name) + + elif _is_memory_buffer(file_name): + with _open_file_buffer(file_name, 'rb') as f: + tensor_bytes = f.read() + paddle.fluid.core.load_lod_tensor_from_memory(temp_t, tensor_bytes) + _seek = f.tell() + + else: + raise NotImplementedError( + 'Only supports load objects from file or BytesIO, but received {}'. + format(type(file_name))) + return temp_t, _seek def _save_selected_rows(selected_rows, file_name): - # '_seek' is the end position of this SelectedRows in the file. if not selected_rows.get_tensor()._is_initialized(): raise ValueError("The saved tensor is not initialized.") - _seek = core._save_selected_rows(selected_rows, file_name) + if _is_file_path(file_name): + # '_seek' is the end position of this SelectedRows in the file. 
+ _seek = core.save_selected_rows(selected_rows, file_name) + + elif _is_memory_buffer(file_name): + selected_rows_bytes = core.save_selected_rows_to_memory(selected_rows) + with _open_file_buffer(file_name, 'wb') as f: + f.write(selected_rows_bytes) + _seek = f.tell() + else: + raise NotImplementedError( + 'Only supports saving objects to file or BytesIO, but received {}'. + format(type(file_name))) return _seek def _load_selected_rows(file_name): temp_sr = core.SelectedRows() - # '_seek' is the end position of this SelectedRows in the file. - _seek = core._load_selected_rows(temp_sr, file_name) + if _is_file_path(file_name): + # '_seek' is the end position of this SelectedRows in the file. + _seek = core.load_selected_rows(temp_sr, file_name) + + elif _is_memory_buffer(file_name): + with _open_file_buffer(file_name, 'rb') as f: + selected_rows_bytes = f.read() + paddle.fluid.core.load_selected_rows_from_memory( + temp_sr, selected_rows_bytes) + _seek = f.tell() + + else: + raise NotImplementedError( + 'Only supports load objects from file or BytesIO, but received {}'. + format(type(file_name))) + return temp_sr, _seek @@ -509,7 +561,7 @@ def save(obj, path, protocol=4, **configs): Args: obj(Object) : The object to be saved. - path(str) : The path of the object to be saved. + path(str|BytesIO) : The path/buffer of the object to be saved. If saved in the current directory, the input path string will be used as the file name. protocol(int, optional): The protocol version of pickle module must be greater than 1 and less than 5. Default: 4 @@ -593,18 +645,39 @@ def save(obj, path, protocol=4, **configs): main_program = paddle.static.default_main_program() path = "example/main_program.pdmodel" paddle.save(main_program, path) - ''' - # 1. input check - filename = os.path.basename(path) - if filename == "": - raise ValueError("The input path MUST be format of dirname/filename " - "[dirname\\filename in Windows system], but received " - "filename is empty string.") - # 2. save object - dirname = os.path.dirname(path) - if dirname and not os.path.exists(dirname): - os.makedirs(dirname) + + # example 5: save object to memory + from io import BytesIO + import paddle + from paddle.nn import Linear + paddle.disable_static() + + linear = Linear(5, 10) + state_dict = linear.state_dict() + byio = BytesIO() + paddle.save(state_dict, byio) + tensor = paddle.randn([2, 3], dtype='float32') + paddle.save(tensor, byio) + + ''' + if _is_file_path(path): + # 1. input check + filename = os.path.basename(path) + if filename == "": + raise ValueError( + "The input path MUST be format of dirname/filename " + "[dirname\\filename in Windows system], but received " + "filename is empty string.") + + # 2. save object + dirname = os.path.dirname(path) + if dirname and not os.path.exists(dirname): + os.makedirs(dirname) + elif not _is_memory_buffer(path): + raise ValueError( + "only supports saving objects to file and `BytesIO`, but got {}". 
+ format(type(path))) config = _parse_save_config(configs) @@ -625,7 +698,7 @@ def save(obj, path, protocol=4, **configs): if isinstance(obj, Program): obj.desc.flush() - with open(path, "wb") as f: + with _open_file_buffer(path, "wb") as f: f.write(obj.desc.serialize_to_string()) elif _is_state_dict(obj): @@ -634,7 +707,7 @@ def save(obj, path, protocol=4, **configs): else: _legacy_static_save(obj, path, protocol) else: - with open(path, 'wb') as f: + with _open_file_buffer(path, 'wb') as f: _pickle_save(obj, f, protocol) @@ -648,12 +721,6 @@ def _legacy_save(obj, path, protocol=2): if len(obj) == 0: warnings.warn("The input state dict is empty, no need to save.") - filename = os.path.basename(path) - if filename == "": - raise ValueError("The input path MUST be format of dirname/filename " - "[dirname\\filename in Windows system], but received " - "filename is empty string.") - if not isinstance(protocol, int): raise ValueError("The 'protocol' MUST be `int`, but received {}".format( type(protocol))) @@ -662,26 +729,33 @@ def _legacy_save(obj, path, protocol=2): raise ValueError("Expected 1<'protocol'<5, but received protocol={}". format(protocol)) - # 2. save object - dirname = os.path.dirname(path) - if dirname and not os.path.exists(dirname): - os.makedirs(dirname) + if _is_file_path(path): + filename = os.path.basename(path) + if filename == "": + raise ValueError( + "The input path MUST be format of dirname/filename " + "[dirname\\filename in Windows system], but received " + "filename is empty string.") + # 2. save object + dirname = os.path.dirname(path) + if dirname and not os.path.exists(dirname): + os.makedirs(dirname) - # TODO(chenweihang): supports save other object if isinstance(obj, dict): saved_obj = _build_saved_state_dict(obj) saved_obj = _unpack_saved_dict(saved_obj, protocol) # When value of dict is lager than 4GB ,there is a Bug on 'MAC python3' - if sys.platform == 'darwin' and sys.version_info.major == 3: + if _is_file_path( + path) and sys.platform == 'darwin' and sys.version_info.major == 3: pickle_bytes = pickle.dumps(saved_obj, protocol=protocol) with open(path, 'wb') as f: max_bytes = 2**30 for i in range(0, len(pickle_bytes), max_bytes): f.write(pickle_bytes[i:i + max_bytes]) else: - with open(path, 'wb') as f: + with _open_file_buffer(path, 'wb') as f: pickle.dump(saved_obj, f, protocol=protocol) @@ -716,7 +790,7 @@ def load(path, **configs): ``Layer.set_state_dict`` later. Args: - path(str) : The path to load the target object. Generally, the path is the target + path(str|BytesIO) : The path/buffer to load the target object. Generally, the path is the target file path. When loading state_dict from the saved result of the API used to save the inference model, the path may be a file prefix or directory. **configs (dict, optional): other load configuration options for compatibility. 
We do not @@ -822,18 +896,36 @@ def load(path, **configs): print(load_main) + # example 5: save object to memory + from io import BytesIO + import paddle + from paddle.nn import Linear + paddle.disable_static() + + linear = Linear(5, 10) + state_dict = linear.state_dict() + byio = BytesIO() + paddle.save(state_dict, byio) + tensor = paddle.randn([2, 3], dtype='float32') + paddle.save(tensor, byio) + byio.seek(0) + # load state_dict + dict_load = paddle.load(byio) + ''' - if os.path.isfile(path): + if _is_memory_buffer(path) or os.path.isfile(path): config = _parse_load_config(configs) if six.PY2: exception_type = KeyError else: exception_type = pickle.UnpicklingError try: - with open(path, 'rb') as f: + with _open_file_buffer(path, 'rb') as f: # When value of dict is lager than 4GB ,there is a Bug on 'MAC python3' - if sys.platform == 'darwin' and sys.version_info.major == 3: + if _is_file_path( + path + ) and sys.platform == 'darwin' and sys.version_info.major == 3: load_result = _pickle_loads_mac(path, f) else: load_result = pickle.load(f) if six.PY2 else pickle.load( @@ -875,7 +967,7 @@ def load(path, **configs): return tensor except: try: - with open(path, "rb") as f: + with _open_file_buffer(path, "rb") as f: program_desc_str = f.read() program = Program.parse_from_string( program_desc_str) @@ -895,9 +987,9 @@ def _legacy_load(path, **configs): load_result = None config = _parse_load_config(configs) - if os.path.isfile(path): + if os.path.isfile(path) or _is_memory_buffer(path): # we think path is file means this file is created by paddle.save - with open(path, 'rb') as f: + with _open_file_buffer(path, 'rb') as f: load_result = pickle.load(f) if six.PY2 else pickle.load( f, encoding='latin1') load_result = _pack_loaded_dict(load_result) diff --git a/python/paddle/hapi/callbacks.py b/python/paddle/hapi/callbacks.py index 834b92f9fe6a0c0abaa0946d1676029d4849ae45..5f1f383438287106bd6ee8448f7948fcba8c185b 100644 --- a/python/paddle/hapi/callbacks.py +++ b/python/paddle/hapi/callbacks.py @@ -324,7 +324,7 @@ class ProgBarLogger(Callback): ]) train_dataset = MNIST(mode='train', transform=transform) - lenet = paddle.vision.LeNet() + lenet = paddle.vision.models.LeNet() model = paddle.Model(lenet, inputs, labels) @@ -558,7 +558,7 @@ class ModelCheckpoint(Callback): ]) train_dataset = MNIST(mode='train', transform=transform) - lenet = paddle.vision.LeNet() + lenet = paddle.vision.models.LeNet() model = paddle.Model(lenet, inputs, labels) @@ -618,7 +618,7 @@ class LRScheduler(Callback): ]) train_dataset = paddle.vision.datasets.MNIST(mode='train', transform=transform) - lenet = paddle.vision.LeNet() + lenet = paddle.vision.models.LeNet() model = paddle.Model(lenet, inputs, labels) @@ -634,7 +634,7 @@ class LRScheduler(Callback): boundaries=boundaries, values=values) learning_rate = paddle.optimizer.lr.LinearWarmup( learning_rate=learning_rate, - warmup_steps=wamup_epochs, + warmup_steps=wamup_steps, start_lr=base_lr / 5., end_lr=base_lr, verbose=True) @@ -860,7 +860,7 @@ class VisualDL(Callback): train_dataset = paddle.vision.datasets.MNIST(mode='train', transform=transform) eval_dataset = paddle.vision.datasets.MNIST(mode='test', transform=transform) - net = paddle.vision.LeNet() + net = paddle.vision.models.LeNet() model = paddle.Model(net, inputs, labels) optim = paddle.optimizer.Adam(0.001, parameters=net.parameters()) diff --git a/python/paddle/hapi/hub.py b/python/paddle/hapi/hub.py index 243bd79c191dd6af08a0dae769ca2de630d42b40..b491bc0271bec733f1e7d331780a8e70badf59d8 100644 --- 
a/python/paddle/hapi/hub.py +++ b/python/paddle/hapi/hub.py @@ -110,7 +110,11 @@ def _get_cache_or_reload(repo, force_reload, verbose=True, source='github'): url = _git_archive_link(repo_owner, repo_name, branch, source=source) fpath = get_path_from_url( - url, hub_dir, check_exist=not force_reload, decompress=False) + url, + hub_dir, + check_exist=not force_reload, + decompress=False, + method=('wget' if source == 'gitee' else 'get')) shutil.move(fpath, cached_file) with zipfile.ZipFile(cached_file) as cached_zipfile: diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py index 160d6c54759d901e2529221c99dce63b29f06810..3cba75fd526910be1fa6f452133fd7bcd1377787 100644 --- a/python/paddle/hapi/model.py +++ b/python/paddle/hapi/model.py @@ -30,20 +30,28 @@ from collections import Iterable import paddle from paddle import fluid from paddle.fluid import core -from paddle.fluid.framework import in_dygraph_mode, Variable, ParamBase, _current_expected_place -from paddle.fluid.framework import in_dygraph_mode, Variable, _get_paddle_place +from paddle.fluid.framework import in_dygraph_mode +from paddle.fluid.framework import Variable +from paddle.fluid.framework import ParamBase +from paddle.fluid.framework import _current_expected_place +from paddle.fluid.framework import _get_paddle_place from paddle.fluid.framework import _current_expected_place as _get_device from paddle.fluid.executor import global_scope from paddle.fluid.io import is_belong_to_optimizer from paddle.fluid.dygraph.base import to_variable from paddle.fluid.dygraph.parallel import ParallelEnv -from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTranslator, FunctionSpec -from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX +from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTranslator +from paddle.fluid.dygraph.dygraph_to_static.program_translator import FunctionSpec +from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX +from paddle.fluid.dygraph.io import INFER_PARAMS_SUFFIX from paddle.fluid.layers.utils import flatten from paddle.fluid.layers import collective -from paddle.io import DataLoader, Dataset, DistributedBatchSampler -from paddle.fluid.executor import scope_guard, Executor +from paddle.io import DataLoader +from paddle.io import Dataset +from paddle.io import DistributedBatchSampler +from paddle.fluid.executor import scope_guard +from paddle.fluid.executor import Executor from paddle.fluid.dygraph.layers import Layer from paddle.metric import Metric from paddle.static import InputSpec as Input @@ -163,10 +171,9 @@ def init_communicator(program, rank, nranks, wait_port, current_endpoint, }) elif core.is_compiled_with_npu(): hccl_id_var = block.create_var( - name=unique_name.generate('hccl_id'), + name=fluid.unique_name.generate('hccl_id'), persistable=True, type=core.VarDesc.VarType.RAW) - endpoint_to_index_map = {e: idx for idx, e in enumerate(endpoints)} block.append_op( type='c_gen_hccl_id', inputs={}, @@ -710,10 +717,10 @@ class DynamicGraphAdapter(object): enable=self._amp_level != 'O0', **self._amp_custom_lists): if self._nranks > 1: outputs = self.ddp_model.forward( - * [to_variable(x) for x in inputs]) + *[to_variable(x) for x in inputs]) else: outputs = self.model.network.forward( - * [to_variable(x) for x in inputs]) + *[to_variable(x) for x in inputs]) losses = self.model._loss(*(to_list(outputs) + labels)) losses = to_list(losses) @@ -732,7 +739,7 @@ class DynamicGraphAdapter(object): metrics = [] for metric in 
self.model._metrics: metric_outs = metric.compute(*(to_list(outputs) + labels)) - m = metric.update(* [to_numpy(m) for m in to_list(metric_outs)]) + m = metric.update(*[to_numpy(m) for m in to_list(metric_outs)]) metrics.append(m) return ([to_numpy(l) for l in losses], metrics) \ @@ -746,7 +753,7 @@ class DynamicGraphAdapter(object): labels = labels or [] labels = [to_variable(l) for l in to_list(labels)] - outputs = self.model.network.forward(* [to_variable(x) for x in inputs]) + outputs = self.model.network.forward(*[to_variable(x) for x in inputs]) if self.model._loss: losses = self.model._loss(*(to_list(outputs) + labels)) losses = to_list(losses) @@ -777,7 +784,7 @@ class DynamicGraphAdapter(object): self._merge_count[self.mode + '_batch'] = samples metric_outs = metric.compute(*(to_list(outputs) + labels)) - m = metric.update(* [to_numpy(m) for m in to_list(metric_outs)]) + m = metric.update(*[to_numpy(m) for m in to_list(metric_outs)]) metrics.append(m) if self.model._loss and len(metrics): @@ -1363,8 +1370,9 @@ class Model(object): # pure float16 training has some restricts now if self._adapter._amp_level == "O2": if in_dygraph_mode(): - warnings.warn("Pure float16 training is not supported in dygraph mode now, "\ - "and it will be supported in future version.") + warnings.warn( + "Pure float16 training is not supported in dygraph mode now, and it will be supported in future version." + ) else: # grad clip is not supported in pure fp16 training now assert self._optimizer._grad_clip is None, \ @@ -1398,8 +1406,7 @@ class Model(object): if 'use_pure_fp16' in amp_configs: raise ValueError( - "''use_pure_fp16' is an invalid parameter, " - "the level of mixed precision training only depends on 'O1' or 'O2'." + "'use_pure_fp16' is an invalid parameter, the level of mixed precision training only depends on 'O1' or 'O2'." ) _check_pure_fp16_configs() @@ -1427,9 +1434,8 @@ class Model(object): } if amp_config_key_set - accepted_param_set: raise ValueError( - "Except for 'level', the keys of 'amp_configs' must be accepted by mixed precision APIs, " - "but {} could not be recognized.".format( - tuple(amp_config_key_set - accepted_param_set))) + "Except for 'level', the keys of 'amp_configs' must be accepted by mixed precision APIs, but {} could not be recognized.". + format(tuple(amp_config_key_set - accepted_param_set))) if 'use_fp16_guard' in amp_config_key_set: if in_dygraph_mode(): @@ -1501,8 +1507,9 @@ class Model(object): self._optimizer = optimizer if loss is not None: if not isinstance(loss, paddle.nn.Layer) and not callable(loss): - raise TypeError("'loss' must be sub classes of " \ - "`paddle.nn.Layer` or any callable function.") + raise TypeError( + "'loss' must be sub classes of `paddle.nn.Layer` or any callable function." + ) self._loss = loss metrics = metrics or [] @@ -1831,6 +1838,7 @@ class Model(object): batch_size=1, num_workers=0, stack_outputs=False, + verbose=1, callbacks=None): """ Compute the output predictions on testing data. @@ -1851,7 +1859,10 @@ class Model(object): be a length N list in shape [[X, Y], [X, Y], ....[X, Y]] if stack_outputs is False. stack_outputs as False is used for LoDTensor output situation, it is recommended set as True if outputs contains no LoDTensor. Default: False. + verbose (int): The verbosity mode, should be 0, 1, or 2. 0 = silent, + 1 = progress bar, 2 = one line per batch. Default: 1. callbacks(Callback): A Callback instance, default None. + Returns: list: output of models. 
@@ -1911,7 +1922,7 @@ class Model(object): self._test_dataloader = test_loader - cbks = config_callbacks(callbacks, model=self, verbose=1) + cbks = config_callbacks(callbacks, model=self, verbose=verbose) test_steps = self._len_data_loader(test_loader) logs = {'steps': test_steps} @@ -2080,7 +2091,7 @@ class Model(object): input = InputSpec([None, 1, 28, 28], 'float32', 'image') label = InputSpec([None, 1], 'int64', 'label') - model = paddle.Model(paddle.vision.LeNet(), + model = paddle.Model(paddle.vision.models.LeNet(), input, label) optim = paddle.optimizer.Adam( learning_rate=0.001, parameters=model.parameters()) @@ -2122,9 +2133,11 @@ class Model(object): else: out_specs = to_list(specs) elif isinstance(specs, dict): - assert is_input == False - out_specs = [specs[n] \ - for n in extract_args(self.network.forward) if n != 'self'] + assert is_input is False + out_specs = [ + specs[n] for n in extract_args(self.network.forward) + if n != 'self' + ] else: out_specs = to_list(specs) # Note: checks each element has specificed `name`. diff --git a/python/paddle/incubate/__init__.py b/python/paddle/incubate/__init__.py index 03e5a88624086b8781a1d8bee4437d9a17c98f76..22769053b1ac97508027910db28d91c526adead3 100644 --- a/python/paddle/incubate/__init__.py +++ b/python/paddle/incubate/__init__.py @@ -12,10 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -from . import optimizer -from . import checkpoint -from ..fluid.layer_helper import LayerHelper +from .optimizer import LookAhead # noqa: F401 +from .optimizer import ModelAverage # noqa: F401 +from .checkpoint import auto_checkpoint # noqa: F401 +from ..fluid.layer_helper import LayerHelper # noqa: F401 -__all__ = [] -__all__ += optimizer.__all__ -__all__ += checkpoint.__all__ +__all__ = [ # noqa + 'LookAhead', 'ModelAverage' +] diff --git a/python/paddle/incubate/checkpoint/__init__.py b/python/paddle/incubate/checkpoint/__init__.py index 7ddd256df747981019e3afb0bb1dd839cf3ea550..79e6259de0275410664b9bfb2c34c33e21c5d529 100644 --- a/python/paddle/incubate/checkpoint/__init__.py +++ b/python/paddle/incubate/checkpoint/__init__.py @@ -12,6 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from ...fluid.incubate.checkpoint import auto_checkpoint +from ...fluid.incubate.checkpoint import auto_checkpoint # noqa: F401 -__all__ = ["auto_checkpoint"] +__all__ = [] diff --git a/python/paddle/incubate/optimizer/__init__.py b/python/paddle/incubate/optimizer/__init__.py index 4a3889d0ee1a905a534f33909b4241f5c91be2f5..d966d187f288ac0865109cf361dd310328792aaf 100644 --- a/python/paddle/incubate/optimizer/__init__.py +++ b/python/paddle/incubate/optimizer/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from .lookahead import LookAhead -from .modelaverage import ModelAverage +from .lookahead import LookAhead # noqa: F401 +from .modelaverage import ModelAverage # noqa: F401 -__all__ = ['LookAhead', 'ModelAverage'] +__all__ = [] diff --git a/python/paddle/incubate/optimizer/lookahead.py b/python/paddle/incubate/optimizer/lookahead.py index f90d520a5dfe8adc524ef20b7489ea008fb9c51a..720a84a24f0aa65c833939844c53e871b4e0680b 100644 --- a/python/paddle/incubate/optimizer/lookahead.py +++ b/python/paddle/incubate/optimizer/lookahead.py @@ -20,7 +20,7 @@ import paddle import numpy as np from paddle.fluid.dygraph import base as imperative_base -__all__ = ["LookAhead"] +__all__ = [] class LookAhead(Optimizer): @@ -99,7 +99,7 @@ class LookAhead(Optimizer): layer = LinearNet() loss_fn = nn.CrossEntropyLoss() optimizer = paddle.optimizer.SGD(learning_rate=0.1, parameters=layer.parameters()) - lookahead = paddle.incubate.optimizer.LookAhead(optimizer, alpha=0.2, k=5) + lookahead = paddle.incubate.LookAhead(optimizer, alpha=0.2, k=5) # create data loader dataset = RandomDataset(BATCH_NUM * BATCH_SIZE) @@ -163,7 +163,7 @@ class LookAhead(Optimizer): out = linear(inp) loss = paddle.mean(out) sgd = paddle.optimizer.SGD(learning_rate=0.1,parameters=linear.parameters()) - lookahead = paddle.incubate.optimizer.LookAhead(sgd, alpha=0.2, k=5) + lookahead = paddle.incubate.LookAhead(sgd, alpha=0.2, k=5) loss.backward() lookahead.step() lookahead.clear_grad() @@ -274,7 +274,7 @@ class LookAhead(Optimizer): out = linear(inp) loss = paddle.mean(out) sgd = paddle.optimizer.SGD(learning_rate=0.1,parameters=linear.parameters()) - lookahead = paddle.incubate.optimizer.LookAhead(sgd, alpha=0.2, k=5) + lookahead = paddle.incubate.LookAhead(sgd, alpha=0.2, k=5) loss.backward() lookahead.minimize(loss) lookahead.clear_grad() @@ -282,9 +282,6 @@ class LookAhead(Optimizer): """ assert isinstance(loss, Variable), "The loss should be an Tensor." 
- parameter_list = parameters if parameters \ - else self._parameter_list - # Apply inner optimizer to the main_program optimize_ops, params_grads = self.inner_optimizer.minimize( loss, diff --git a/python/paddle/incubate/optimizer/modelaverage.py b/python/paddle/incubate/optimizer/modelaverage.py index 8afcaf9207e7cc84d143356b4f5efb74a175f2bd..8ffc3bdac62d040ccd45fe9768fdf566e784dcc4 100644 --- a/python/paddle/incubate/optimizer/modelaverage.py +++ b/python/paddle/incubate/optimizer/modelaverage.py @@ -21,7 +21,7 @@ import numpy as np from paddle.fluid.dygraph import base as imperative_base from paddle.fluid.wrapped_decorator import signature_safe_contextmanager -__all__ = ["ModelAverage"] +__all__ = [] class ModelAverage(Optimizer): @@ -129,7 +129,7 @@ class ModelAverage(Optimizer): layer = LinearNet() loss_fn = nn.CrossEntropyLoss() optimizer = opt.Momentum(learning_rate=0.2, momentum=0.1, parameters=layer.parameters()) - model_average = paddle.incubate.optimizer.ModelAverage(0.15, + model_average = paddle.incubate.ModelAverage(0.15, parameters=layer.parameters(), min_average_window=2, max_average_window=10) @@ -313,7 +313,7 @@ class ModelAverage(Optimizer): sgd = paddle.optimizer.SGD(learning_rate=0.1,parameters=linear.parameters()) sgd.minimize(loss) - modelaverage = paddle.incubate.optimizer.ModelAverage(0.15, + modelaverage = paddle.incubate.ModelAverage(0.15, parameters=linear.parameters(), min_average_window=2, max_average_window=4) @@ -345,7 +345,7 @@ class ModelAverage(Optimizer): out = linear(inp) loss = paddle.mean(out) sgd = paddle.optimizer.SGD(learning_rate=0.1,parameters=linear.parameters()) - modelaverage = paddle.incubate.optimizer.ModelAverage(0.15, + modelaverage = paddle.incubate.ModelAverage(0.15, parameters=linear.parameters(), min_average_window=2, max_average_window=4) @@ -395,7 +395,7 @@ class ModelAverage(Optimizer): sgd = paddle.optimizer.SGD(learning_rate=0.1,parameters=linear.parameters()) - modelaverage = paddle.incubate.optimizer.ModelAverage(0.15, + modelaverage = paddle.incubate.ModelAverage(0.15, parameters=linear.parameters(), min_average_window=2, max_average_window=4) @@ -415,7 +415,6 @@ class ModelAverage(Optimizer): param) old_num_accumulates = self._get_accumulator( 'old_num_accumulates', param) - num_updates = self._get_accumulator('num_updates', param) sum_1 = self._get_accumulator('sum_1', param) sum_2 = self._get_accumulator('sum_2', param) sum_3 = self._get_accumulator('sum_3', param) @@ -467,7 +466,7 @@ class ModelAverage(Optimizer): sgd = paddle.optimizer.SGD(learning_rate=0.1,parameters=linear.parameters()) - modelaverage = paddle.incubate.optimizer.ModelAverage(0.15, + modelaverage = paddle.incubate.ModelAverage(0.15, parameters=linear.parameters(), min_average_window=2, max_average_window=4) @@ -506,17 +505,15 @@ class ModelAverage(Optimizer): self._get_accumulator('num_accumulates', param)) old_num_accumulates = block._clone_variable( self._get_accumulator('old_num_accumulates', param)) - num_updates = block._clone_variable( - self._get_accumulator('num_updates', param)) # backup param value to grad layers.assign(input=param, output=grad) # param = (sum_1 + sum_2 + sum_3) / (num_accumulates + old_num_accumulates) tmp = layers.sum(x=[num_accumulates, old_num_accumulates]) sum = layers.sum(x=[sum_1, sum_2, sum_3]) tmp = layers.cast( - x=tmp, dtype='float32' if self._dtype == None else self._dtype) + x=tmp, dtype='float32' if self._dtype is None else self._dtype) sum = layers.cast( - x=sum, dtype='float32' if self._dtype == None else 
self._dtype) + x=sum, dtype='float32' if self._dtype is None else self._dtype) layers.ops._elementwise_div(x=sum, y=tmp, out=param) def _add_average_restore_op(self, block, param): diff --git a/python/paddle/inference/__init__.py b/python/paddle/inference/__init__.py index c388301ec3408e436eacb2567e8e529d0bbc03bb..4e172039716628a157c8324c17ff2d4be3666349 100644 --- a/python/paddle/inference/__init__.py +++ b/python/paddle/inference/__init__.py @@ -12,5 +12,26 @@ # See the License for the specific language governing permissions and # limitations under the License. -from ..fluid.inference import Config, DataType, PlaceType, PrecisionType, Tensor, \ - Predictor, create_predictor, get_version, get_num_bytes_of_data_type, PredictorPool +from ..fluid.inference import Config # noqa: F401 +from ..fluid.inference import DataType # noqa: F401 +from ..fluid.inference import PlaceType # noqa: F401 +from ..fluid.inference import PrecisionType # noqa: F401 +from ..fluid.inference import Tensor # noqa: F401 +from ..fluid.inference import Predictor # noqa: F401 +from ..fluid.inference import create_predictor # noqa: F401 +from ..fluid.inference import get_version # noqa: F401 +from ..fluid.inference import get_num_bytes_of_data_type # noqa: F401 +from ..fluid.inference import PredictorPool # noqa: F401 + +__all__ = [ # noqa + 'Config', + 'DataType', + 'PlaceType', + 'PrecisionType', + 'Tensor', + 'Predictor', + 'create_predictor', + 'get_version', + 'get_num_bytes_of_data_type', + 'PredictorPool' +] diff --git a/python/paddle/jit/__init__.py b/python/paddle/jit/__init__.py index 650837b2d7702c70131250b9da94abd62b369e7a..576989e8e0d2aa019dc9ec7c7d69afa941f1dcb7 100644 --- a/python/paddle/jit/__init__.py +++ b/python/paddle/jit/__init__.py @@ -14,19 +14,26 @@ from __future__ import print_function -from ..fluid.dygraph.jit import save #DEFINE_ALIAS -from ..fluid.dygraph.jit import load #DEFINE_ALIAS -from ..fluid.dygraph.jit import TracedLayer #DEFINE_ALIAS -from ..fluid.dygraph.jit import set_code_level #DEFINE_ALIAS -from ..fluid.dygraph.jit import set_verbosity #DEFINE_ALIAS -from ..fluid.dygraph.jit import declarative as to_static #DEFINE_ALIAS -from ..fluid.dygraph.jit import not_to_static #DEFINE_ALIAS -from ..fluid.dygraph import ProgramTranslator #DEFINE_ALIAS -from ..fluid.dygraph.io import TranslatedLayer #DEFINE_ALIAS +from ..fluid.dygraph.jit import save # noqa: F401 +from ..fluid.dygraph.jit import load # noqa: F401 +from ..fluid.dygraph.jit import TracedLayer # noqa: F401 +from ..fluid.dygraph.jit import set_code_level # noqa: F401 +from ..fluid.dygraph.jit import set_verbosity # noqa: F401 +from ..fluid.dygraph.jit import declarative as to_static # noqa: F401 +from ..fluid.dygraph.jit import not_to_static # noqa: F401 +from ..fluid.dygraph import ProgramTranslator # noqa: F401 +from ..fluid.dygraph.io import TranslatedLayer # noqa: F401 -from . import dy2static +from . 
import dy2static # noqa: F401 -__all__ = [ - 'save', 'load', 'TracedLayer', 'to_static', 'ProgramTranslator', - 'TranslatedLayer', 'set_code_level', 'set_verbosity', 'not_to_static' +__all__ = [ # noqa + 'save', + 'load', + 'TracedLayer', + 'to_static', + 'ProgramTranslator', + 'TranslatedLayer', + 'set_code_level', + 'set_verbosity', + 'not_to_static' ] diff --git a/python/paddle/jit/dy2static/__init__.py b/python/paddle/jit/dy2static/__init__.py index 239b554180b1bd74517b152dfdf079082600b806..030d5499c2ca96d997dfe571b81c039bb0eb2c99 100644 --- a/python/paddle/jit/dy2static/__init__.py +++ b/python/paddle/jit/dy2static/__init__.py @@ -12,18 +12,28 @@ # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import print_function - -from . import convert_operators -from .convert_operators import * - -from . import convert_call_func -from .convert_call_func import * - -from . import variable_trans_func -from .variable_trans_func import * +from .convert_call_func import convert_call # noqa: F401 +from .convert_operators import cast_bool_if_necessary # noqa: F401 +from .convert_operators import convert_assert # noqa: F401 +from .convert_operators import convert_ifelse # noqa: F401 +from .convert_operators import convert_len # noqa: F401 +from .convert_operators import convert_logical_and # noqa: F401 +from .convert_operators import convert_logical_not # noqa: F401 +from .convert_operators import convert_logical_or # noqa: F401 +from .convert_operators import convert_pop # noqa: F401 +from .convert_operators import convert_print # noqa: F401 +from .convert_operators import convert_shape_compare # noqa: F401 +from .convert_operators import convert_var_dtype # noqa: F401 +from .convert_operators import convert_var_shape # noqa: F401 +from .convert_operators import convert_var_shape_simple # noqa: F401 +from .convert_operators import eval_if_exist_else_none # noqa: F401 +from .convert_operators import choose_shape_attr_or_api # noqa: F401 +from .convert_operators import convert_while_loop # noqa: F401 +from .variable_trans_func import create_bool_as_type # noqa: F401 +from .variable_trans_func import create_fill_constant_node # noqa: F401 +from .variable_trans_func import create_static_variable_gast_node # noqa: F401 +from .variable_trans_func import data_layer_not_check # noqa: F401 +from .variable_trans_func import to_static_variable # noqa: F401 +from .variable_trans_func import to_static_variable_gast_node # noqa: F401 __all__ = [] -__all__ += convert_operators.__all__ -__all__ += convert_call_func.__all__ -__all__ += variable_trans_func.__all__ diff --git a/python/paddle/jit/dy2static/convert_call_func.py b/python/paddle/jit/dy2static/convert_call_func.py index be2377608e36c75d95cb2c1c609e99cef7d438a7..4f6197a3cba6ae811998def0d59a221d2265ce0c 100644 --- a/python/paddle/jit/dy2static/convert_call_func.py +++ b/python/paddle/jit/dy2static/convert_call_func.py @@ -13,6 +13,6 @@ # limitations under the License. 
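The paddle.jit surface listed above is unchanged by the import cleanup; the entries are simply enumerated one per line. A minimal sketch of the usual to_static flow under that API, assuming a toy layer (class name, shapes and the save path are illustrative):

    import paddle
    from paddle.static import InputSpec

    class Net(paddle.nn.Layer):
        def __init__(self):
            super(Net, self).__init__()
            self.fc = paddle.nn.Linear(8, 2)

        def forward(self, x):
            return self.fc(x)

    net = paddle.jit.to_static(Net(), input_spec=[InputSpec([None, 8], 'float32', 'x')])
    out = net(paddle.rand([4, 8]))     # runs the dygraph-to-static translated forward
    # paddle.jit.save(net, './net')    # would persist it for later paddle.jit.load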
from __future__ import print_function -from ...fluid.dygraph.dygraph_to_static.convert_call_func import convert_call #DEFINE_ALIAS +from ...fluid.dygraph.dygraph_to_static.convert_call_func import convert_call # noqa: F401 -__all__ = ['convert_call'] +__all__ = [] diff --git a/python/paddle/jit/dy2static/convert_operators.py b/python/paddle/jit/dy2static/convert_operators.py index 9321cf4a0b8324cf5e312b36a17b8ab1edc72809..8d67e06d9b27a56e9aa0fc7bc57844290d1c83e1 100644 --- a/python/paddle/jit/dy2static/convert_operators.py +++ b/python/paddle/jit/dy2static/convert_operators.py @@ -13,27 +13,21 @@ # limitations under the License. from __future__ import print_function -from ...fluid.dygraph.dygraph_to_static.convert_operators import cast_bool_if_necessary #DEFINE_ALIAS -from ...fluid.dygraph.dygraph_to_static.convert_operators import convert_assert #DEFINE_ALIAS -from ...fluid.dygraph.dygraph_to_static.convert_operators import convert_ifelse #DEFINE_ALIAS -from ...fluid.dygraph.dygraph_to_static.convert_operators import convert_len #DEFINE_ALIAS -from ...fluid.dygraph.dygraph_to_static.convert_operators import convert_logical_and #DEFINE_ALIAS -from ...fluid.dygraph.dygraph_to_static.convert_operators import convert_logical_not #DEFINE_ALIAS -from ...fluid.dygraph.dygraph_to_static.convert_operators import convert_logical_or #DEFINE_ALIAS -from ...fluid.dygraph.dygraph_to_static.convert_operators import convert_pop #DEFINE_ALIAS -from ...fluid.dygraph.dygraph_to_static.convert_operators import convert_print #DEFINE_ALIAS -from ...fluid.dygraph.dygraph_to_static.convert_operators import convert_shape_compare #DEFINE_ALIAS -from ...fluid.dygraph.dygraph_to_static.convert_operators import convert_var_dtype #DEFINE_ALIAS -from ...fluid.dygraph.dygraph_to_static.convert_operators import convert_var_shape #DEFINE_ALIAS -from ...fluid.dygraph.dygraph_to_static.convert_operators import convert_var_shape_simple #DEFINE_ALIAS -from ...fluid.dygraph.dygraph_to_static.convert_operators import eval_if_exist_else_none #DEFINE_ALIAS -from ...fluid.dygraph.dygraph_to_static.convert_operators import choose_shape_attr_or_api #DEFINE_ALIAS -from ...fluid.dygraph.dygraph_to_static.convert_operators import convert_while_loop #DEFINE_ALIAS +from ...fluid.dygraph.dygraph_to_static.convert_operators import cast_bool_if_necessary # noqa: F401 +from ...fluid.dygraph.dygraph_to_static.convert_operators import convert_assert # noqa: F401 +from ...fluid.dygraph.dygraph_to_static.convert_operators import convert_ifelse # noqa: F401 +from ...fluid.dygraph.dygraph_to_static.convert_operators import convert_len # noqa: F401 +from ...fluid.dygraph.dygraph_to_static.convert_operators import convert_logical_and # noqa: F401 +from ...fluid.dygraph.dygraph_to_static.convert_operators import convert_logical_not # noqa: F401 +from ...fluid.dygraph.dygraph_to_static.convert_operators import convert_logical_or # noqa: F401 +from ...fluid.dygraph.dygraph_to_static.convert_operators import convert_pop # noqa: F401 +from ...fluid.dygraph.dygraph_to_static.convert_operators import convert_print # noqa: F401 +from ...fluid.dygraph.dygraph_to_static.convert_operators import convert_shape_compare # noqa: F401 +from ...fluid.dygraph.dygraph_to_static.convert_operators import convert_var_dtype # noqa: F401 +from ...fluid.dygraph.dygraph_to_static.convert_operators import convert_var_shape # noqa: F401 +from ...fluid.dygraph.dygraph_to_static.convert_operators import convert_var_shape_simple # noqa: F401 +from 
...fluid.dygraph.dygraph_to_static.convert_operators import eval_if_exist_else_none # noqa: F401 +from ...fluid.dygraph.dygraph_to_static.convert_operators import choose_shape_attr_or_api # noqa: F401 +from ...fluid.dygraph.dygraph_to_static.convert_operators import convert_while_loop # noqa: F401 -__all__ = [ - 'cast_bool_if_necessary', 'convert_assert', 'convert_ifelse', 'convert_len', - 'convert_logical_and', 'convert_logical_not', 'convert_logical_or', - 'convert_pop', 'convert_print', 'convert_shape_compare', - 'convert_var_dtype', 'convert_var_shape', 'convert_var_shape_simple', - 'eval_if_exist_else_none', 'choose_shape_attr_or_api', 'convert_while_loop' -] +__all__ = [] diff --git a/python/paddle/jit/dy2static/variable_trans_func.py b/python/paddle/jit/dy2static/variable_trans_func.py index 2deb1bbb0eef2542d8f8890a7fa476f370ba5e5a..9ce2bc2da381655e65225397831faa228c613ca6 100644 --- a/python/paddle/jit/dy2static/variable_trans_func.py +++ b/python/paddle/jit/dy2static/variable_trans_func.py @@ -14,15 +14,11 @@ from __future__ import print_function -from ...fluid.dygraph.dygraph_to_static.variable_trans_func import create_bool_as_type #DEFINE_ALIAS -from ...fluid.dygraph.dygraph_to_static.variable_trans_func import create_fill_constant_node #DEFINE_ALIAS -from ...fluid.dygraph.dygraph_to_static.variable_trans_func import create_static_variable_gast_node #DEFINE_ALIAS -from ...fluid.dygraph.dygraph_to_static.variable_trans_func import data_layer_not_check #DEFINE_ALIAS -from ...fluid.dygraph.dygraph_to_static.variable_trans_func import to_static_variable #DEFINE_ALIAS -from ...fluid.dygraph.dygraph_to_static.variable_trans_func import to_static_variable_gast_node #DEFINE_ALIAS +from ...fluid.dygraph.dygraph_to_static.variable_trans_func import create_bool_as_type # noqa: F401 +from ...fluid.dygraph.dygraph_to_static.variable_trans_func import create_fill_constant_node # noqa: F401 +from ...fluid.dygraph.dygraph_to_static.variable_trans_func import create_static_variable_gast_node # noqa: F401 +from ...fluid.dygraph.dygraph_to_static.variable_trans_func import data_layer_not_check # noqa: F401 +from ...fluid.dygraph.dygraph_to_static.variable_trans_func import to_static_variable # noqa: F401 +from ...fluid.dygraph.dygraph_to_static.variable_trans_func import to_static_variable_gast_node # noqa: F401 -__all__ = [ - 'create_bool_as_type', 'create_fill_constant_node', - 'create_static_variable_gast_node', 'data_layer_not_check', - 'to_static_variable', 'to_static_variable_gast_node' -] +__all__ = [] diff --git a/python/paddle/metric/__init__.py b/python/paddle/metric/__init__.py index e41f6d76dd22159ab189654c6d30818c600b8286..2f2ef4c6f54269067406763a02e8f0772e86bc82 100644 --- a/python/paddle/metric/__init__.py +++ b/python/paddle/metric/__init__.py @@ -12,7 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .metrics import * -from . 
import metrics +from .metrics import Metric # noqa: F401 +from .metrics import Accuracy # noqa: F401 +from .metrics import Precision # noqa: F401 +from .metrics import Recall # noqa: F401 +from .metrics import Auc # noqa: F401 +from .metrics import accuracy # noqa: F401 -__all__ = metrics.__all__ +__all__ = [ #noqa + 'Metric', + 'Accuracy', + 'Precision', + 'Recall', + 'Auc', + 'accuracy' +] diff --git a/python/paddle/metric/metrics.py b/python/paddle/metric/metrics.py index 61d1eb0e373341374199b811f198f7e295026ecc..40758fb8dc3e0f034e4d5ea9ccf6e8d2897287e1 100644 --- a/python/paddle/metric/metrics.py +++ b/python/paddle/metric/metrics.py @@ -26,7 +26,7 @@ from ..fluid.layers.nn import topk from ..fluid.framework import core, _varbase_creator, in_dygraph_mode import paddle -__all__ = ['Metric', 'Accuracy', 'Precision', 'Recall', 'Auc', 'accuracy'] +__all__ = [] def _is_numpy_(var): @@ -222,7 +222,7 @@ class Accuracy(Metric): transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])]) train_dataset = MNIST(mode='train', transform=transform) - model = paddle.Model(paddle.vision.LeNet(), input, label) + model = paddle.Model(paddle.vision.models.LeNet(), input, label) optim = paddle.optimizer.Adam( learning_rate=0.001, parameters=model.parameters()) model.prepare( diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index 7cf3f94872de17c12910ae5453f74a18c4a1502d..5fe17e8c193e3ea99eddbd8bfb2668e3a1228286 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -138,6 +138,7 @@ from ..fluid.dygraph.container import Sequential # noqa: F401 from . import utils # noqa: F401 from . import functional # noqa: F401 from . import initializer # noqa: F401 +from . import quant # noqa: F401 #TODO: remove 'diag_embed', 'remove_weight_norm', 'weight_norm' months later. import paddle.utils.deprecated as deprecated @@ -286,5 +287,6 @@ __all__ = [ #noqa 'Swish', 'PixelShuffle', 'ELU', - 'ReLU6' + 'ReLU6', + 'LayerDict' ] diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index d4c17a27a61780b431916b2634585de035778ce8..ff18afa9d20282a6e1147cbca0d17580456c0c95 100644 --- a/python/paddle/nn/functional/__init__.py +++ b/python/paddle/nn/functional/__init__.py @@ -194,5 +194,6 @@ __all__ = [ #noqa 'embedding', 'gather_tree', 'one_hot', - 'normalize' + 'normalize', + 'temporal_shift' ] diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py index 67958b8683fe174d2c9e387668ab8c7ee4a39276..66913f3ad2f659456d8def39852f3423fdd3dd6c 100644 --- a/python/paddle/nn/functional/conv.py +++ b/python/paddle/nn/functional/conv.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
from __future__ import print_function +from paddle.fluid.framework import _global_flags import numpy as np from ...device import get_cudnn_version @@ -537,7 +538,7 @@ def conv2d(x, use_cudnn = True if (core.is_compiled_with_cuda() and cudnn_version is not None) else False - use_mkldnn = core.globals()["FLAGS_use_mkldnn"] + use_mkldnn = _global_flags()["FLAGS_use_mkldnn"] # update attrs padding, padding_algorithm = _update_padding_nd(padding, channel_last, 2) diff --git a/python/paddle/nn/layer/conv.py b/python/paddle/nn/layer/conv.py index eecea3034a752ec0203764219fbdcd8c671c02cf..fc98157273447f61c3c08020ce04991f48ab259f 100644 --- a/python/paddle/nn/layer/conv.py +++ b/python/paddle/nn/layer/conv.py @@ -98,7 +98,7 @@ class _ConvNd(layers.Layer): 'kernel_size') self._padding = padding self._padding_mode = padding_mode - self.output_padding = output_padding + self._output_padding = output_padding if dims != 1: self._updated_padding, self._padding_algorithm = _update_padding_nd( padding, channel_last, dims) @@ -163,7 +163,7 @@ class _ConvNd(layers.Layer): main_str += ', padding={_padding}' if self._padding_mode is not 'zeros': main_str += ', padding_mode={_padding_mode}' - if self.output_padding != 0: + if self._output_padding != 0: main_str += ', output_padding={_output_padding}' if self._dilation != [1] * len(self._dilation): main_str += ', dilation={_dilation}' @@ -508,7 +508,7 @@ class Conv1DTranspose(_ConvNd): self.weight, bias=self.bias, output_size=output_size, - output_padding=self.output_padding, + output_padding=self._output_padding, padding=self._padding, stride=self._stride, dilation=self._dilation, @@ -824,7 +824,7 @@ class Conv2DTranspose(_ConvNd): def forward(self, x, output_size=None): if output_size is None: - output_padding = self.output_padding + output_padding = self._output_padding else: output_padding = 0 @@ -1161,7 +1161,7 @@ class Conv3DTranspose(_ConvNd): def forward(self, x, output_size=None): if output_size is None: - output_padding = self.output_padding + output_padding = self._output_padding else: output_padding = 0 diff --git a/python/paddle/nn/quant/__init__.py b/python/paddle/nn/quant/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c7f9a5073def836bfaded41e5d17715e42ffc42e --- /dev/null +++ b/python/paddle/nn/quant/__init__.py @@ -0,0 +1,25 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
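The new paddle.nn.quant package whose contents follow wraps a handful of tensor operations as FloatFunctionalLayer subclasses, presumably so quantization passes can observe them like ordinary layers. A usage sketch under that assumption (shapes are illustrative):

    import paddle
    from paddle.nn import quant

    x = paddle.rand([2, 3])
    y = paddle.rand([2, 3])

    out = quant.add()(x, y)            # layer-style wrapper around elementwise add
    cat = quant.concat()([x, y], 0)    # wrapper around concat along axis 0
    flat = quant.flatten()(cat)        # wrapper around flatten(start_axis=0, stop_axis=-1)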
+ +from .functional_layers import FloatFunctionalLayer # noqa: F401 +from .functional_layers import add # noqa: F401 +from .functional_layers import subtract # noqa: F401 +from .functional_layers import multiply # noqa: F401 +from .functional_layers import divide # noqa: F401 +from .functional_layers import reshape # noqa: F401 +from .functional_layers import transpose # noqa: F401 +from .functional_layers import concat # noqa: F401 +from .functional_layers import flatten # noqa: F401 + +__all__ = [] diff --git a/python/paddle/nn/quant/functional_layers.py b/python/paddle/nn/quant/functional_layers.py new file mode 100644 index 0000000000000000000000000000000000000000..ce5fb3e616eb5960da6c2535c10d34d0fbf4766d --- /dev/null +++ b/python/paddle/nn/quant/functional_layers.py @@ -0,0 +1,87 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...fluid.dygraph import layers +from ...tensor import math, manipulation + +__all__ = [] + + +class FloatFunctionalLayer(layers.Layer): + def __init__(self): + super(FloatFunctionalLayer, self).__init__() + + +class add(FloatFunctionalLayer): + def __init__(self): + super(add, self).__init__() + + def forward(self, x, y, name=None): + return math.add(x, y, name) + + +class subtract(FloatFunctionalLayer): + def __init__(self): + super(subtract, self).__init__() + + def forward(self, x, y, name=None): + return math.subtract(x, y, name) + + +class multiply(FloatFunctionalLayer): + def __init__(self): + super(multiply, self).__init__() + + def forward(self, x, y, name=None): + return math.multiply(x, y, name) + + +class divide(FloatFunctionalLayer): + def __init__(self): + super(divide, self).__init__() + + def forward(self, x, y, name=None): + return math.divide(x, y, name) + + +class reshape(FloatFunctionalLayer): + def __init__(self): + super(reshape, self).__init__() + + def forward(self, x, shape, name=None): + return manipulation.reshape(x, shape, name) + + +class transpose(FloatFunctionalLayer): + def __init__(self): + super(transpose, self).__init__() + + def forward(self, x, perm, name=None): + return manipulation.transpose(x, perm, name) + + +class concat(FloatFunctionalLayer): + def __init__(self): + super(concat, self).__init__() + + def forward(self, x, axis=0, name=None): + return manipulation.concat(x, axis, name) + + +class flatten(FloatFunctionalLayer): + def __init__(self): + super(flatten, self).__init__() + + def forward(self, x, start_axis=0, stop_axis=-1, name=None): + return manipulation.flatten(x, start_axis, stop_axis, name) diff --git a/python/paddle/nn/utils/spectral_norm_hook.py b/python/paddle/nn/utils/spectral_norm_hook.py index 250eb235fd7d43d96c46ab97c9cbabca16744429..75266abdf0d13257c48e11113e9474cb9847b6ea 100644 --- a/python/paddle/nn/utils/spectral_norm_hook.py +++ b/python/paddle/nn/utils/spectral_norm_hook.py @@ -20,7 +20,7 @@ from ..layer.conv import Conv1DTranspose, Conv2DTranspose, Conv3DTranspose from ..layer.common import Linear from .. 
import functional as F -__all__ = ['spectral_norm'] +__all__ = [] def normal_(x, mean=0., std=1.): diff --git a/python/paddle/onnx/__init__.py b/python/paddle/onnx/__init__.py index 885d1968ce1ae1ef4f6a4ff79f8ac40acb971baa..8853e78bf3d808108d540496f1d7d9e1d09121c4 100644 --- a/python/paddle/onnx/__init__.py +++ b/python/paddle/onnx/__init__.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import print_function -from .export import export +from .export import export # noqa: F401 __all__ = ['export'] diff --git a/python/paddle/onnx/export.py b/python/paddle/onnx/export.py index 4b99b42bb0423c676e8d08b1931c6488b8ab1e98..b8a217a5134fb8007f7563349c3efd40e132b0b2 100644 --- a/python/paddle/onnx/export.py +++ b/python/paddle/onnx/export.py @@ -15,7 +15,7 @@ import os from paddle.utils import try_import -__all__ = ['export'] +__all__ = [] def export(layer, path, input_spec=None, opset_version=9, **configs): diff --git a/python/paddle/optimizer/lr.py b/python/paddle/optimizer/lr.py index 7da933a9b72798db2606242c02065b66b333812f..db4e80d8d9a59b4de1d926fb95cdd9dfde696387 100644 --- a/python/paddle/optimizer/lr.py +++ b/python/paddle/optimizer/lr.py @@ -1349,7 +1349,7 @@ class ReduceOnPlateau(LRScheduler): if isinstance(metrics, (Tensor, numpy.ndarray)): assert len(metrics.shape) == 1 and metrics.shape[0] == 1, "the metrics.shape " \ "should be (1L,), but the current metrics.shape is {}. Maybe that " \ - "you should call paddle.mean to process it first.".format(loss.shape) + "you should call paddle.mean to process it first.".format(metrics.shape) elif not isinstance(metrics, (int, float, numpy.float32, numpy.float64)): raise TypeError( diff --git a/python/paddle/optimizer/momentum.py b/python/paddle/optimizer/momentum.py index faff090bcb1f4ec2e906d2a3071930176a9c339f..85c5c60a34c50083e30eee65c3ad151d4516edc5 100644 --- a/python/paddle/optimizer/momentum.py +++ b/python/paddle/optimizer/momentum.py @@ -252,6 +252,19 @@ class Momentum(Optimizer): ) self._add_accumulator(self._velocity_acc_str, p) + def _create_regularization_of_grad(self, param, grad, regularization=None): + """ Create and add backward regularization Operators + + Function helper of append_regularization_ops. + """ + # If ParamAttr is set to L2Decay, we skip doing regularization here. And then we fused + # L2Decay with momentum which can refer to _append_optimize_op below. + if hasattr(param, 'regularizer') and isinstance(param.regularizer, + L2DecayRegularizer): + return grad + return super(Momentum, self)._create_regularization_of_grad( + param, grad, regularization) + def _append_optimize_op(self, block, param_and_grad): assert isinstance(block, framework.Block) if isinstance(param_and_grad, dict): @@ -261,6 +274,20 @@ class Momentum(Optimizer): param_and_grad[0]) lr = self._create_param_lr(param_and_grad) + # For fusion of momentum and l2decay + param = param_and_grad[0] + regularization_method = self._regularization_method + regularization_coeff = self._regularization_coeff + if hasattr(param, 'regularizer'): + # we skip param's l2decay before, so fuse it with momentum here. + if isinstance(param.regularizer, L2DecayRegularizer): + regularization_method = "l2_decay" + regularization_coeff = param.regularizer._regularization_coeff + # the param's regularization has been done before, we avoid do l2decay in momentum. 
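The Momentum changes in this hunk fuse a parameter's L2Decay regularizer into the momentum op itself (via regularization_method / regularization_coeff) instead of materializing a separate scale-and-sum. A sketch of the configuration the fusion targets, with an illustrative layer and decay coefficient:

    import paddle
    from paddle.regularizer import L2Decay

    linear = paddle.nn.Linear(
        10, 10,
        weight_attr=paddle.ParamAttr(regularizer=L2Decay(1e-4)))
    opt = paddle.optimizer.Momentum(
        learning_rate=0.1, momentum=0.9, parameters=linear.parameters())

    loss = paddle.mean(linear(paddle.rand([4, 10])))
    loss.backward()
    opt.step()         # weight decay for this parameter is applied inside the momentum kernel
    opt.clear_grad()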
+ elif param.regularizer is not None: + regularization_method = "" + regularization_coeff = 0 + if framework.in_dygraph_mode(): if isinstance(param_and_grad, dict): self._update_regularization(param_and_grad['weight_decay']) @@ -268,8 +295,8 @@ class Momentum(Optimizer): param_and_grad[0], param_and_grad[1], velocity_acc, lr, param_and_grad[0], velocity_acc, 'mu', self._momentum, 'use_nesterov', self._use_nesterov, 'regularization_method', - self._regularization_method, 'regularization_coeff', - self._regularization_coeff) + regularization_method, 'regularization_coeff', + regularization_coeff) return None find_master = self._multi_precision and param_and_grad[ @@ -280,8 +307,8 @@ class Momentum(Optimizer): attrs = { "mu": self._momentum, "use_nesterov": self._use_nesterov, - "regularization_method": self._regularization_method, - "regularization_coeff": self._regularization_coeff, + "regularization_method": regularization_method, + "regularization_coeff": regularization_coeff, "multi_precision": find_master, "rescale_grad": self._rescale_grad } diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index 0f22b920b17deba923b945115f4f274c84f2ddf6..93b618b7c9edc0f2efde1badcb691d6774e0b666 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -32,7 +32,6 @@ from ..fluid.framework import program_guard, Parameter from ..fluid.initializer import Constant from ..fluid.layer_helper import LayerHelper from ..fluid.layers import ops -from ..fluid.regularizer import append_regularization_ops from ..fluid.dygraph import base as imperative_base from ..fluid.dygraph import no_grad from paddle.fluid import core @@ -310,11 +309,11 @@ class Optimizer(object): assert model_np.shape == load_para_np.shape, \ "Parameter shape not match, Dygraph Parameter [ {} ] need tensor with shape {} but load tensor with shape {}".format( - item.name, model_np.shape, load_para_np.shape) + model_np.name, model_np.shape, load_para_np.shape) assert model_np.dtype == load_para_np.dtype, \ "Parameter dtype not match, Dygraph Parameter [ {} ] need tensor with dtype {} but load tensor with dtype {}".format( - item.name, model_np.dtype, load_para_np.dtype) + model_np.name, model_np.dtype, load_para_np.dtype) tensor.set(load_para_np, framework._current_expected_place()) @@ -850,8 +849,8 @@ class Optimizer(object): params_grads = append_gradient_clip_ops(params_grads) # Add regularization if any - params_grads = append_regularization_ops(params_grads, - self.regularization) + params_grads = self.append_regularization_ops(params_grads, + self.regularization) optimize_ops = self._create_optimization_pass(params_grads) return optimize_ops @@ -874,7 +873,7 @@ class Optimizer(object): if isinstance(params_grads, list): if self._grad_clip is not None: params_grads = self._grad_clip(params_grads) - params_grads = append_regularization_ops( + params_grads = self.append_regularization_ops( params_grads, self.regularization) else: grad_clip = params_grads['grad_clip'] @@ -882,7 +881,7 @@ class Optimizer(object): params_grads['params'] = grad_clip(params_grads[ 'params']) - params_grads['params'] = append_regularization_ops( + params_grads['params'] = self.append_regularization_ops( params_grads['params'], self.regularization) optimize_ops = self._create_optimization_pass(params_grads) else: @@ -891,6 +890,93 @@ class Optimizer(object): optimize_ops = self.apply_gradients(params_grads) return optimize_ops + def _create_regularization_of_grad(self, param, grad, 
regularization=None): + """ Create and add backward regularization Operators + + Function helper of append_regularization_ops. + """ + # If no gradient or no regularization is specified, then we don't need to do anything + if grad is None or ((not hasattr(param, 'regularizer') or + (hasattr(param, 'regularizer') and + param.regularizer is None)) and + regularization is None): + return grad + regularization_term = None + if hasattr(param, 'regularizer') and param.regularizer is not None: + # Add variable for regularization term in grad block + regularization_term = param.regularizer(param, grad, grad.block) + elif regularization is not None: + regularization_term = regularization(param, grad, grad.block) + + assert regularization_term is not None + + new_grad = grad + if grad.type == core.VarDesc.VarType.SELECTED_ROWS: + # FIXME(zcd): If the grad is SELECTED_ROWS, after regularization, + # the grad's type and name will be changed. But the gradient's name + # is used in ParallelExecutor Reduce mode, so I add a flag for + # the new_grad here. + new_grad = grad.block.create_var( + name=grad.name + core.kNewGradSuffix(), + dtype=param.dtype, + shape=param.shape, + lod_level=param.lod_level, + type=core.VarDesc.VarType.LOD_TENSOR) + + inputs = {"X": [grad, regularization_term]} + outputs = {"Out": [new_grad]} + if framework.in_dygraph_mode(): + new_grad = core.ops.sum([grad, regularization_term]) + else: + grad.block.append_op(type='sum', inputs=inputs, outputs=outputs) + + return new_grad + + def append_regularization_ops(self, + parameters_and_grads, + regularization=None): + r"""Create and add backward regularization Operators + + Creates and adds backward regularization operators in the BlockDesc. + This will add gradients of the regularizer function to the gradients + of the parameters and return these modified gradients. This is the + same as implementing weight decay in optimizers for regularization. + + Args: + parameters_and_grads: A list of (parameters, gradients) pairs + that need to be regularized. + regularization: A global regularizer. If the parameter is not + set. It will be applied with regularizer. + + Returns: + list[(Variable, Variable)]: list of (parameters, gradients) \ + pair with the regularized gradient + + Raises: + Exception: Unknown regularization type + """ + params_and_grads = [] + if framework.in_dygraph_mode(): + for param, grad in parameters_and_grads: + new_grad = self._create_regularization_of_grad(param, grad, + regularization) + params_and_grads.append((param, new_grad)) + else: + repeate_regularizer = False + with framework.name_scope('regularization'): + for param, grad in parameters_and_grads: + if not repeate_regularizer and param.regularizer is not None and regularization is not None: + repeate_regularizer = True + logging.info( + "If regularizer of a Parameter has been set by 'fluid.ParamAttr' or 'fluid.WeightNormParamAttr' already. " + "The Regularization[%s] in Optimizer will not take effect, and it will only be applied to other Parameters!" 
+ % regularization.__str__()) + with param.block.program._optimized_guard([param, grad]): + new_grad = self._create_regularization_of_grad( + param, grad, regularization) + params_and_grads.append((param, new_grad)) + return params_and_grads + def _get_no_grad_set(self, loss, no_grad_set=None): no_grad_set = _get_no_grad_set_name(no_grad_set) parameters = loss.block.program.global_block().all_parameters() diff --git a/python/paddle/reader/decorator.py b/python/paddle/reader/decorator.py index da9749722e132952e6a77ca82afae4580b427cee..66f971c59d7d5b893ee102c0ff416447080c05e9 100644 --- a/python/paddle/reader/decorator.py +++ b/python/paddle/reader/decorator.py @@ -18,6 +18,7 @@ import multiprocessing import six import sys import warnings +import logging from six.moves.queue import Queue from six.moves import zip_longest diff --git a/python/paddle/static/__init__.py b/python/paddle/static/__init__.py index 89da75ae91e40ed5fcf8d45dfca4e2628cc24f9e..93394f9b5afdef5cb8a4d4f355c36e59d974fd5a 100644 --- a/python/paddle/static/__init__.py +++ b/python/paddle/static/__init__.py @@ -85,10 +85,21 @@ __all__ = [ #noqa 'load', 'save_inference_model', 'load_inference_model', + 'serialize_program', + 'serialize_persistables', + 'save_to_file', + 'deserialize_program', + 'deserialize_persistables', + 'load_from_file', + 'normalize_program', 'load_program_state', 'set_program_state', 'cpu_places', 'cuda_places', 'Variable', - 'create_global_var' + 'create_global_var', + 'accuracy', + 'auc', + 'device_guard', + 'create_parameter' ] diff --git a/python/paddle/static/io.py b/python/paddle/static/io.py index 58e8ebc481d799955d8a738e4c8a581ccd319679..a9cae0c14e3b19f78e08f8069e8e33bd68077a28 100644 --- a/python/paddle/static/io.py +++ b/python/paddle/static/io.py @@ -157,7 +157,7 @@ def normalize_program(program, feed_vars, fetch_vars): exe.run(paddle.static.default_startup_program()) # normalize main program. 
- program = default_main_program() + program = paddle.static.default_main_program() normalized_program = paddle.static.normalize_program(program, [image], [predict]) """ diff --git a/python/paddle/static/nn/__init__.py b/python/paddle/static/nn/__init__.py index 416f6e4f3df06886dbd15c7a427b3620a1957842..b589d9f87895b73ba319499969c744f21f49c657 100644 --- a/python/paddle/static/nn/__init__.py +++ b/python/paddle/static/nn/__init__.py @@ -68,7 +68,6 @@ __all__ = [ #noqa 'conv2d_transpose', 'conv3d', 'conv3d_transpose', - 'create_parameter', 'crf_decoding', 'data_norm', 'deform_conv2d', diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index c8d80fc9bc68cbbff4e270bbab4d8203e663bb2e..ac4f8e07f702ebdd6fb80460a024067d48e1dd3d 100755 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -18,6 +18,7 @@ from .attribute import real # noqa: F401 from .attribute import imag # noqa: F401 from .creation import to_tensor # noqa: F401 from .creation import diag # noqa: F401 +from .creation import diagflat # noqa: F401 from .creation import eye # noqa: F401 from .creation import linspace # noqa: F401 from .creation import ones # noqa: F401 @@ -25,7 +26,6 @@ from .creation import ones_like # noqa: F401 from .creation import zeros # noqa: F401 from .creation import zeros_like # noqa: F401 from .creation import arange # noqa: F401 -from .creation import eye # noqa: F401 from .creation import full # noqa: F401 from .creation import full_like # noqa: F401 from .creation import triu # noqa: F401 @@ -82,7 +82,6 @@ from .manipulation import squeeze # noqa: F401 from .manipulation import squeeze_ # noqa: F401 from .manipulation import stack # noqa: F401 from .manipulation import strided_slice # noqa: F401 -from .manipulation import transpose # noqa: F401 from .manipulation import unique # noqa: F401 from .manipulation import unsqueeze # noqa: F401 from .manipulation import unsqueeze_ # noqa: F401 @@ -143,7 +142,6 @@ from .math import add # noqa: F401 from .math import add_ # noqa: F401 from .math import subtract # noqa: F401 from .math import subtract_ # noqa: F401 -from .math import atan # noqa: F401 from .math import logsumexp # noqa: F401 from .math import inverse # noqa: F401 from .math import log2 # noqa: F401 @@ -163,6 +161,8 @@ from .math import all # noqa: F401 from .math import any # noqa: F401 from .math import broadcast_shape # noqa: F401 from .math import conj # noqa: F401 +from .math import neg # noqa: F401 +from .math import lgamma # noqa: F401 from .random import multinomial # noqa: F401 from .random import standard_normal # noqa: F401 @@ -227,7 +227,6 @@ tensor_method_func = [ #noqa 'log2', 'log10', 'logsumexp', - 'mul', 'multiplex', 'pow', 'prod', @@ -280,6 +279,8 @@ tensor_method_func = [ #noqa 'isnan', 'broadcast_shape', 'conj', + 'neg', + 'lgamma', 'equal', 'equal_all', 'greater_equal', diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index fb0244a41499a0c794ad9eb3005794beb3b951b1..b446a5921b06739ebbcb860d0813cc70ac62bb4e 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -585,7 +585,7 @@ def tril(x, diagonal=0, name=None): Args: x (Tensor): The input x which is a Tensor. - Support data types: ``float64``, ``float32``, ``int32``, ``int64``. + Support data types: ``bool``, ``float64``, ``float32``, ``int32``, ``int64``. diagonal (int, optional): The diagonal to consider, default value is 0. If :attr:`diagonal` = 0, all elements on and below the main diagonal are retained. 
A positive value includes just as many diagonals above the main @@ -772,6 +772,131 @@ def meshgrid(*args, **kwargs): return out +def diagflat(x, offset=0, name=None): + """ + If ``x`` is a vector (1-D tensor), a 2-D square tensor whth the elements of ``x`` as the diagonal is returned. + + If ``x`` is a tensor (more than 1-D), a 2-D square tensor with the elements of flattened ``x`` as the diagonal is returned. + + The argument ``offset`` controls the diagonal offset. + + + If ``offset`` = 0, it is the main diagonal. + + If ``offset`` > 0, it is superdiagonal. + + If ``offset`` < 0, it is subdiagonal. + + Args: + x (Tensor): The input tensor. It can be any shape. Its data type should be float32, float64, int32, int64. + offset (int, optional): The diagonal offset. A positive value represents superdiagonal, 0 represents the main diagonal, and a negative value represents subdiagonal. Default: 0 (main diagonal). + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor, a square matrix. The output data type is the same as input data type. + + Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([1, 2, 3]) + y = paddle.diagflat(x) + print(y.numpy()) + # [[1 0 0] + # [0 2 0] + # [0 0 3]] + + y = paddle.diagflat(x, offset=1) + print(y.numpy()) + # [[0 1 0 0] + # [0 0 2 0] + # [0 0 0 3] + # [0 0 0 0]] + + y = paddle.diagflat(x, offset=-1) + print(y.numpy()) + # [[0 0 0 0] + # [1 0 0 0] + # [0 2 0 0] + # [0 0 3 0]] + + .. code-block:: python + + import paddle + + x = paddle.to_tensor([[1, 2], [3, 4]]) + y = paddle.diagflat(x) + print(y.numpy()) + # [[1 0 0 0] + # [0 2 0 0] + # [0 0 3 0] + # [0 0 0 4]] + + y = paddle.diagflat(x, offset=1) + print(y.numpy()) + # [[0 1 0 0 0] + # [0 0 2 0 0] + # [0 0 0 3 0] + # [0 0 0 0 4] + # [0 0 0 0 0]] + + y = paddle.diagflat(x, offset=-1) + print(y.numpy()) + # [[0 0 0 0 0] + # [1 0 0 0 0] + # [0 2 0 0 0] + # [0 0 3 0 0] + # [0 0 0 4 0]] + """ + padding_value = 0 + if in_dygraph_mode(): + if len(x.shape) == 1: + return core.ops.diag_v2(x, "offset", offset, "padding_value", + padding_value) + else: + y, _ = core.ops.flatten_contiguous_range(x, "start_axis", 0, + "stop_axis", -1) + return core.ops.diag_v2(y, "offset", offset, "padding_value", + padding_value) + + check_type(x, 'x', (Variable), 'diagflat') + check_dtype(x.dtype, 'x', ['float32', 'float64', 'int32', 'int64'], + 'diagflat') + check_type(offset, 'offset', (int), 'diagflat') + + helper = LayerHelper("diagflat", **locals()) + out1 = helper.create_variable_for_type_inference(dtype=x.dtype) + out1_shape = helper.create_variable_for_type_inference(x.dtype) + out2 = helper.create_variable_for_type_inference(dtype=x.dtype) + + if len(x.shape) == 1: + helper.append_op( + type='diag_v2', + inputs={'X': x}, + outputs={'Out': out2}, + attrs={'offset': offset, + 'padding_value': padding_value}) + else: + helper.append_op( + type='flatten_contiguous_range', + inputs={'X': x}, + outputs={'Out': out1, + 'XShape': out1_shape}, + attrs={'start_axis': 0, + 'stop_axis': -1}) + out1.stop_gradient = True + + helper.append_op( + type='diag_v2', + inputs={'X': out1}, + outputs={'Out': out2}, + attrs={'offset': offset, + 'padding_value': padding_value}) + out2.stop_gradient = True + return out2 + + def diag(x, offset=0, padding_value=0, name=None): """ If ``x`` is a vector (1-D tensor), a 2-D square tensor whth the elements of ``x`` as the diagonal is returned. 
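paddle.diagflat always flattens its argument before building a square matrix, which distinguishes it from paddle.diag when the input is already 2-D (diag then extracts a diagonal rather than constructing one). A short sketch of the contrast, with illustrative values:

    import paddle

    m = paddle.to_tensor([[1, 2], [3, 4]])
    d1 = paddle.diag(m)       # [1, 4]: the main diagonal of m
    d2 = paddle.diagflat(m)   # 4x4 matrix with [1, 2, 3, 4] on its main diagonal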
@@ -1054,3 +1179,64 @@ def assign(x, output=None): check_type(x, 'x', (Variable, np.ndarray, list, tuple, float, int, bool), 'assign') return tensor.assign(x, output) + + +#NOTE(zhiqiu): not public +def _memcpy(input, place=None, output=None): + """ + + The OP copies the :attr:`input` to the :attr:`output`. + NOTE: currently, only support CUDAPlace <-> CUDAPinnedPlace or NPUPlace <-> CPUPlace. + + Parameters: + input (Tensor): A tensor. Its data type supports float16, float32, float64, int32, int64, and bool. + device (Place): Target place for the output. + output (Tensor, optional): A tensor. If :attr:`output` is None, a new tensor will + be created as :attr:`output`. Default: None. + + Returns: + Tensor: A tensor with the same shape, data type and value as :attr:`input`. + + Examples: + .. code-block:: python + + import paddle + import numpy as np + data = paddle.full(shape=[3, 2], fill_value=2.5, dtype='float64') # [[2.5, 2.5], [2.5, 2.5], [2.5, 2.5]] + result = paddle._memcpy(data, place=paddle.CPUPlace()) # result2 = [[2.5, 2.5], [2.5, 2.5], [2.5, 2.5]] + """ + helper = LayerHelper('memcpy', **locals()) + check_type(input, 'input', (Variable), 'memcpy') + + if isinstance(input, (Variable, core.VarBase)): + check_dtype(input.dtype, 'input', [ + 'float16', 'uint16', 'float32', 'float64', 'int32', 'int64', + 'uint8', 'bool' + ], 'memcpy', '(When the type of input in memcpy is Variable.)') + if output is None: + output = helper.create_variable_for_type_inference(dtype=input.dtype) + + dst_place_type = -1 + if place is None: + dst_place_type = -1 + else: + p = core.Place() + p.set_place(place) + if p.is_cpu_place(): + dst_place_type = 0 + elif p.is_gpu_place(): + dst_place_type = 1 + elif p.is_cuda_pinned_place(): + dst_place_type = 2 + elif p.is_xpu_place(): + dst_place_type = 3 + elif p.is_npu_place(): + dst_place_type = 4 + + attrs = {'dst_place_type': dst_place_type} + helper.append_op( + type='memcpy', + inputs={'X': [input]}, + outputs={'Out': [output]}, + attrs=attrs) + return output diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 2f69946c52139be5c65bf5b2c38d0d17c9b58103..652c7c41fb8cc09f413ee943e9f1f5f4f2c28063 100755 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -62,6 +62,7 @@ from ..fluid.layers import erf # noqa: F401 from ..fluid.layers import sqrt # noqa: F401 from ..fluid.layers import sqrt_ # noqa: F401 from ..fluid.layers import sin # noqa: F401 +from ..fluid.layers import lgamma # noqa: F401 from ..fluid.layers import multiplex # noqa: F401 from ..fluid import layers @@ -2280,3 +2281,27 @@ def conj(x, name=None): helper.append_op(type='conj', inputs={'X': x}, outputs={'Out': [out]}) return out + +def neg(x, name=None): + """ + This function computes the negative of the Tensor elementwisely. + + Args: + x (Tensor): Input of neg operator, an N-D Tensor, with data type float32, float64, int8, int16, int32, or int64. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + out (Tensor): The negative of input Tensor. The shape and data type are the same with input Tensor. + + Examples: + .. 
code-block:: python + + import paddle + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.neg(x) + print(out) + # [0.4 0.2 -0.1 -0.3] + """ + + return layers.scale(x, scale=-1.0, bias=0.0, bias_after_scale=True, act=None, name=name) diff --git a/python/paddle/tests/test_callback_visualdl.py b/python/paddle/tests/test_callback_visualdl.py index 36316183104fe3a19bfa5e9868e26e54f5405dd1..db3b83f2b141417b363d5dbe2d4fedd2542d31ad 100644 --- a/python/paddle/tests/test_callback_visualdl.py +++ b/python/paddle/tests/test_callback_visualdl.py @@ -55,7 +55,7 @@ class TestCallbacks(unittest.TestCase): train_dataset = MnistDataset(mode='train', transform=transform) eval_dataset = MnistDataset(mode='test', transform=transform) - net = paddle.vision.LeNet() + net = paddle.vision.models.LeNet() model = paddle.Model(net, inputs, labels) optim = paddle.optimizer.Adam(0.001, parameters=net.parameters()) diff --git a/python/paddle/tests/test_dataset_cifar.py b/python/paddle/tests/test_dataset_cifar.py index abf79fb1e3974ce0c1d9de4efd1df05056ff3821..2e9efddf9712e35423e19dad02f738c40dbc8b51 100644 --- a/python/paddle/tests/test_dataset_cifar.py +++ b/python/paddle/tests/test_dataset_cifar.py @@ -32,6 +32,8 @@ class TestCifar10Train(unittest.TestCase): self.assertTrue(data.shape[2] == 3) self.assertTrue(data.shape[1] == 32) self.assertTrue(data.shape[0] == 32) + self.assertTrue(len(label.shape) == 1) + self.assertTrue(label.shape[0] == 1) self.assertTrue(0 <= int(label) <= 9) @@ -49,6 +51,8 @@ class TestCifar10Test(unittest.TestCase): self.assertTrue(data.shape[2] == 3) self.assertTrue(data.shape[1] == 32) self.assertTrue(data.shape[0] == 32) + self.assertTrue(len(label.shape) == 1) + self.assertTrue(label.shape[0] == 1) self.assertTrue(0 <= int(label) <= 9) # test cv2 backend @@ -63,6 +67,8 @@ class TestCifar10Test(unittest.TestCase): self.assertTrue(data.shape[2] == 3) self.assertTrue(data.shape[1] == 32) self.assertTrue(data.shape[0] == 32) + self.assertTrue(len(label.shape) == 1) + self.assertTrue(label.shape[0] == 1) self.assertTrue(0 <= int(label) <= 99) with self.assertRaises(ValueError): @@ -83,6 +89,8 @@ class TestCifar100Train(unittest.TestCase): self.assertTrue(data.shape[2] == 3) self.assertTrue(data.shape[1] == 32) self.assertTrue(data.shape[0] == 32) + self.assertTrue(len(label.shape) == 1) + self.assertTrue(label.shape[0] == 1) self.assertTrue(0 <= int(label) <= 99) @@ -100,6 +108,8 @@ class TestCifar100Test(unittest.TestCase): self.assertTrue(data.shape[2] == 3) self.assertTrue(data.shape[1] == 32) self.assertTrue(data.shape[0] == 32) + self.assertTrue(len(label.shape) == 1) + self.assertTrue(label.shape[0] == 1) self.assertTrue(0 <= int(label) <= 99) # test cv2 backend @@ -114,6 +124,8 @@ class TestCifar100Test(unittest.TestCase): self.assertTrue(data.shape[2] == 3) self.assertTrue(data.shape[1] == 32) self.assertTrue(data.shape[0] == 32) + self.assertTrue(len(label.shape) == 1) + self.assertTrue(label.shape[0] == 1) self.assertTrue(0 <= int(label) <= 99) with self.assertRaises(ValueError): diff --git a/python/paddle/tests/test_download.py b/python/paddle/tests/test_download.py index 4be2dde1bccb132041723df8af7f5f36f24e133c..986d84dd153b2f54624d3394fbb5a4b1b52b8953 100644 --- a/python/paddle/tests/test_download.py +++ b/python/paddle/tests/test_download.py @@ -77,6 +77,31 @@ class TestDownload(unittest.TestCase): 'www.baidu.com', './test', ) + def test_wget_download_error(self, ): + with self.assertRaises(RuntimeError): + from paddle.utils.download import _download + 
_download('www.baidu', './test', method='wget') + + def test_download_methods(self, ): + urls = [ + "https://paddle-hapi.bj.bcebos.com/unittest/files.tar", + "https://paddle-hapi.bj.bcebos.com/unittest/files.zip", + ] + + import sys + from paddle.utils.download import _download + if sys.platform == 'linux': + methods = ['wget', 'get'] + else: + methods = ['get'] + + for url in urls: + for method in methods: + _download( + url, + path='./test', + method=method, ) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/tests/test_model.py b/python/paddle/tests/test_model.py index ae574a8241bfffccea7c9d0e7fe71a83a710e778..0ced69c0f2ea9620918c3cd0e3b970a1226a3863 100644 --- a/python/paddle/tests/test_model.py +++ b/python/paddle/tests/test_model.py @@ -126,7 +126,7 @@ class TestModel(unittest.TestCase): @classmethod def setUpClass(cls): if not fluid.is_compiled_with_cuda(): - self.skipTest('module not tested when ONLY_CPU compling') + cls.skipTest('module not tested when ONLY_CPU compling') cls.device = paddle.set_device('gpu') fluid.enable_dygraph(cls.device) diff --git a/python/paddle/text/datasets/wmt14.py b/python/paddle/text/datasets/wmt14.py index 424a564216d19072d3927358fc1ff9c3a3af307b..38ca09bf299831add72d04ade5e5a30b4948344b 100644 --- a/python/paddle/text/datasets/wmt14.py +++ b/python/paddle/text/datasets/wmt14.py @@ -14,9 +14,11 @@ from __future__ import print_function +import six import tarfile import numpy as np import gzip +import six from paddle.io import Dataset import paddle.compat as cpt diff --git a/python/paddle/utils/download.py b/python/paddle/utils/download.py index 3ad627ddea927467caaa1524285724850a5cdc36..29baddff05af22df4f11e8e0fcb38b6d66983a47 100644 --- a/python/paddle/utils/download.py +++ b/python/paddle/utils/download.py @@ -21,6 +21,7 @@ import sys import os.path as osp import shutil import requests +import subprocess import hashlib import tarfile import zipfile @@ -121,7 +122,8 @@ def get_path_from_url(url, root_dir, md5sum=None, check_exist=True, - decompress=True): + decompress=True, + method='get'): """ Download from given url to root_dir. if file or directory specified by url is exists under root_dir, return the path directly, otherwise download @@ -132,7 +134,9 @@ def get_path_from_url(url, root_dir (str): root dir for downloading, it should be WEIGHTS_HOME or DATASET_HOME md5sum (str): md5 sum of download package - + decompress (bool): decompress zip or tar file. Default is `True` + method (str): which download method to use. Support `wget` and `get`. Default is `get`. + Returns: str: a local path to save downloaded models & weights & datasets. 
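get_path_from_url and _download gain a method switch in this change: 'get' keeps the existing requests-based path, while 'wget' shells out to the wget binary and raises if the command fails. A minimal sketch of the public entry point, reusing one of the test URLs above with an illustrative cache directory:

    from paddle.utils.download import get_path_from_url

    path = get_path_from_url(
        "https://paddle-hapi.bj.bcebos.com/unittest/files.tar",
        root_dir="./cache",       # downloaded archive is extracted under this dir
        method="wget")            # or the default method="get"
    print(path)                   # local path of the extracted files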
""" @@ -150,7 +154,7 @@ def get_path_from_url(url, logger.info("Found {}".format(fullpath)) else: if ParallelEnv().current_endpoint in unique_endpoints: - fullpath = _download(url, root_dir, md5sum) + fullpath = _download(url, root_dir, md5sum, method=method) else: while not os.path.exists(fullpath): time.sleep(1) @@ -163,13 +167,79 @@ def get_path_from_url(url, return fullpath -def _download(url, path, md5sum=None): +def _get_download(url, fullname): + # using requests.get method + fname = osp.basename(fullname) + try: + req = requests.get(url, stream=True) + except Exception as e: # requests.exceptions.ConnectionError + logger.info("Downloading {} from {} failed with exception {}".format( + fname, url, str(e))) + return False + + if req.status_code != 200: + raise RuntimeError("Downloading from {} failed with code " + "{}!".format(url, req.status_code)) + + # For protecting download interupted, download to + # tmp_fullname firstly, move tmp_fullname to fullname + # after download finished + tmp_fullname = fullname + "_tmp" + total_size = req.headers.get('content-length') + with open(tmp_fullname, 'wb') as f: + if total_size: + with tqdm(total=(int(total_size) + 1023) // 1024) as pbar: + for chunk in req.iter_content(chunk_size=1024): + f.write(chunk) + pbar.update(1) + else: + for chunk in req.iter_content(chunk_size=1024): + if chunk: + f.write(chunk) + shutil.move(tmp_fullname, fullname) + + return fullname + + +def _wget_download(url, fullname): + # using wget to download url + tmp_fullname = fullname + "_tmp" + # –user-agent + command = 'wget -O {} -t {} {}'.format(tmp_fullname, DOWNLOAD_RETRY_LIMIT, + url) + subprc = subprocess.Popen( + command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + _ = subprc.communicate() + + if subprc.returncode != 0: + raise RuntimeError( + '{} failed. Please make sure `wget` is installed or {} exists'. + format(command, url)) + + shutil.move(tmp_fullname, fullname) + + return fullname + + +_download_methods = { + 'get': _get_download, + 'wget': _wget_download, +} + + +def _download(url, path, md5sum=None, method='get'): """ Download from url, save to path. url (str): download url path (str): download to given path + md5sum (str): md5 sum of download package + method (str): which download method to use. Support `wget` and `get`. Default is `get`. + """ + assert method in _download_methods, 'make sure `{}` implemented'.format( + method) + if not osp.exists(path): os.makedirs(path) @@ -177,6 +247,7 @@ def _download(url, path, md5sum=None): fullname = osp.join(path, fname) retry_cnt = 0 + logger.info("Downloading {} from {}".format(fname, url)) while not (osp.exists(fullname) and _md5check(fullname, md5sum)): if retry_cnt < DOWNLOAD_RETRY_LIMIT: retry_cnt += 1 @@ -184,38 +255,10 @@ def _download(url, path, md5sum=None): raise RuntimeError("Download from {} failed. " "Retry limit reached".format(url)) - logger.info("Downloading {} from {}".format(fname, url)) - - try: - req = requests.get(url, stream=True) - except Exception as e: # requests.exceptions.ConnectionError - logger.info( - "Downloading {} from {} failed {} times with exception {}". 
- format(fname, url, retry_cnt + 1, str(e))) + if not _download_methods[method](url, fullname): time.sleep(1) continue - if req.status_code != 200: - raise RuntimeError("Downloading from {} failed with code " - "{}!".format(url, req.status_code)) - - # For protecting download interupted, download to - # tmp_fullname firstly, move tmp_fullname to fullname - # after download finished - tmp_fullname = fullname + "_tmp" - total_size = req.headers.get('content-length') - with open(tmp_fullname, 'wb') as f: - if total_size: - with tqdm(total=(int(total_size) + 1023) // 1024) as pbar: - for chunk in req.iter_content(chunk_size=1024): - f.write(chunk) - pbar.update(1) - else: - for chunk in req.iter_content(chunk_size=1024): - if chunk: - f.write(chunk) - shutil.move(tmp_fullname, fullname) - return fullname diff --git a/python/paddle/vision/__init__.py b/python/paddle/vision/__init__.py index aeb07bf281fb0a0289640e0591af4d864ca10b39..79fb7844dd58c664ce5c391788aacc384e49432c 100644 --- a/python/paddle/vision/__init__.py +++ b/python/paddle/vision/__init__.py @@ -11,22 +11,59 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import paddle +import paddle.nn as nn +from . import models # noqa: F401 +from . import transforms # noqa: F401 +from . import datasets # noqa: F401 +from . import ops # noqa: F401 +from .image import set_image_backend # noqa: F401 +from .image import get_image_backend # noqa: F401 +from .image import image_load # noqa: F401 +from .models import LeNet as models_LeNet +import paddle.utils.deprecated as deprecated -from . import models -from .models import * +__all__ = [ #noqa + 'set_image_backend', 'get_image_backend', 'image_load' +] -from . import transforms -from .transforms import * -from . import datasets -from .datasets import * +class LeNet(models_LeNet): + """LeNet model from + `"LeCun Y, Bottou L, Bengio Y, et al. Gradient-based learning applied to document recognition[J]. Proceedings of the IEEE, 1998, 86(11): 2278-2324.`_ -from . import image -from .image import * + Args: + num_classes (int): output dim of last fc layer. If num_classes <=0, last fc layer + will not be defined. Default: 10. -from . import ops + Examples: + .. code-block:: python -__all__ = models.__all__ \ - + transforms.__all__ \ - + datasets.__all__ \ - + image.__all__ + from paddle.vision.models import LeNet + + model = LeNet() + """ + + @deprecated( + since="2.0.0", + update_to="paddle.vision.models.LeNet", + level=1, + reason="Please use new API in models, paddle.vision.LeNet will be removed in future" + ) + def __init__(self, num_classes=10): + super(LeNet, self).__init__(num_classes=10) + self.num_classes = num_classes + self.features = nn.Sequential( + nn.Conv2D( + 1, 6, 3, stride=1, padding=1), + nn.ReLU(), + nn.MaxPool2D(2, 2), + nn.Conv2D( + 6, 16, 5, stride=1, padding=0), + nn.ReLU(), + nn.MaxPool2D(2, 2)) + + if num_classes > 0: + self.fc = nn.Sequential( + nn.Linear(400, 120), + nn.Linear(120, 84), nn.Linear(84, num_classes)) diff --git a/python/paddle/vision/datasets/__init__.py b/python/paddle/vision/datasets/__init__.py index 6703aa4197603be2d82d930e3cd2622ff6b4cd77..3ee7503e27979753ba97241256f769841b40b0c8 100644 --- a/python/paddle/vision/datasets/__init__.py +++ b/python/paddle/vision/datasets/__init__.py @@ -12,20 +12,22 @@ # See the License for the specific language governing permissions and # limitations under the License. -from . 
import folder
-from . import mnist
-from . import flowers
-from . import cifar
-from . import voc2012
+from .folder import DatasetFolder # noqa: F401
+from .folder import ImageFolder # noqa: F401
+from .mnist import MNIST # noqa: F401
+from .mnist import FashionMNIST # noqa: F401
+from .flowers import Flowers # noqa: F401
+from .cifar import Cifar10 # noqa: F401
+from .cifar import Cifar100 # noqa: F401
+from .voc2012 import VOC2012 # noqa: F401
 
-from .folder import *
-from .mnist import *
-from .flowers import *
-from .cifar import *
-from .voc2012 import *
-
-__all__ = folder.__all__ \
-          + mnist.__all__ \
-          + flowers.__all__ \
-          + cifar.__all__ \
-          + voc2012.__all__
+__all__ = [ #noqa
+    'DatasetFolder',
+    'ImageFolder',
+    'MNIST',
+    'FashionMNIST',
+    'Flowers',
+    'Cifar10',
+    'Cifar100',
+    'VOC2012'
+]
diff --git a/python/paddle/vision/datasets/cifar.py b/python/paddle/vision/datasets/cifar.py
index 0a0a48026af80eccc891df9202ab3a42f37ba06d..a70b0317fc27d93bab5d0fd2fd8163bbfc676406 100644
--- a/python/paddle/vision/datasets/cifar.py
+++ b/python/paddle/vision/datasets/cifar.py
@@ -24,7 +24,7 @@ import paddle
 from paddle.io import Dataset
 from paddle.dataset.common import _check_exists_and_download
 
-__all__ = ['Cifar10', 'Cifar100']
+__all__ = []
 
 URL_PREFIX = 'https://dataset.bj.bcebos.com/cifar/'
 CIFAR10_URL = URL_PREFIX + 'cifar-10-python.tar.gz'
@@ -151,7 +151,8 @@ class Cifar10(Dataset):
                     six.b('labels'), batch.get(six.b('fine_labels'), None))
                 assert labels is not None
                 for sample, label in six.moves.zip(data, labels):
-                    self.data.append((sample, label))
+                    self.data.append((sample,
+                                      np.array([label]).astype('int64')))
 
     def __getitem__(self, idx):
         image, label = self.data[idx]
@@ -164,9 +165,9 @@ class Cifar10(Dataset):
             image = self.transform(image)
 
         if self.backend == 'pil':
-            return image, np.array(label).astype('int64')
+            return image, label.astype('int64')
 
-        return image.astype(self.dtype), np.array(label).astype('int64')
+        return image.astype(self.dtype), label.astype('int64')
 
     def __len__(self):
         return len(self.data)
diff --git a/python/paddle/vision/datasets/flowers.py b/python/paddle/vision/datasets/flowers.py
index 65c0b604efd5d719cf9df313592b6e3561b5958a..0b006ada4a045ecab199bb09bed80c5b94d87dd8 100644
--- a/python/paddle/vision/datasets/flowers.py
+++ b/python/paddle/vision/datasets/flowers.py
@@ -25,7 +25,7 @@ from paddle.io import Dataset
 from paddle.utils import try_import
 from paddle.dataset.common import _check_exists_and_download
 
-__all__ = ["Flowers"]
+__all__ = []
 
 DATA_URL = 'http://paddlemodels.bj.bcebos.com/flowers/102flowers.tgz'
 LABEL_URL = 'http://paddlemodels.bj.bcebos.com/flowers/imagelabels.mat'
diff --git a/python/paddle/vision/datasets/folder.py b/python/paddle/vision/datasets/folder.py
index 718af041307a15e5c44ece79b478d7a47bf8729c..220b3d8ecb4b412a83e452381ef021afdf0e4940 100644
--- a/python/paddle/vision/datasets/folder.py
+++ b/python/paddle/vision/datasets/folder.py
@@ -20,7 +20,7 @@ import paddle
 from paddle.io import Dataset
 from paddle.utils import try_import
 
-__all__ = ["DatasetFolder", "ImageFolder"]
+__all__ = []
 
 
 def has_valid_extension(filename, extensions):
diff --git a/python/paddle/vision/datasets/mnist.py b/python/paddle/vision/datasets/mnist.py
index 1b998fd71a62e5bd21545e1548e628042fca833a..84760f9598b6adc60eb3873633db0bc87bf64785 100644
--- a/python/paddle/vision/datasets/mnist.py
+++ b/python/paddle/vision/datasets/mnist.py
@@ -24,7 +24,7 @@ import paddle
 from paddle.io import Dataset
 from paddle.dataset.common import
_check_exists_and_download -__all__ = ["MNIST", "FashionMNIST"] +__all__ = [] class MNIST(Dataset): diff --git a/python/paddle/vision/datasets/voc2012.py b/python/paddle/vision/datasets/voc2012.py index 1a42d143f0f72b21b6f431400713500f395b03f9..5a82d7864cb009da066929c830f6213818b7c203 100644 --- a/python/paddle/vision/datasets/voc2012.py +++ b/python/paddle/vision/datasets/voc2012.py @@ -23,7 +23,7 @@ import paddle from paddle.io import Dataset from paddle.dataset.common import _check_exists_and_download -__all__ = ["VOC2012"] +__all__ = [] VOC_URL = 'https://dataset.bj.bcebos.com/voc/VOCtrainval_11-May-2012.tar' diff --git a/python/paddle/vision/image.py b/python/paddle/vision/image.py index 19986816b7cc42282050057b5cc791faa8fd1c1f..5c260b1d90a891134d344bb364d065bca2518c5b 100644 --- a/python/paddle/vision/image.py +++ b/python/paddle/vision/image.py @@ -15,7 +15,7 @@ from PIL import Image from paddle.utils import try_import -__all__ = ['set_image_backend', 'get_image_backend', 'image_load'] +__all__ = [] _image_backend = 'pil' diff --git a/python/paddle/vision/models/__init__.py b/python/paddle/vision/models/__init__.py index 60d8c246ae10e2bcb2a6576ce13a99e5e984c5bc..d38f3b1722ee8c2f31d53a26b96d3320abd2e350 100644 --- a/python/paddle/vision/models/__init__.py +++ b/python/paddle/vision/models/__init__.py @@ -12,20 +12,38 @@ #See the License for the specific language governing permissions and #limitations under the License. -from . import resnet -from . import vgg -from . import mobilenetv1 -from . import mobilenetv2 -from . import lenet +from .resnet import ResNet # noqa: F401 +from .resnet import resnet18 # noqa: F401 +from .resnet import resnet34 # noqa: F401 +from .resnet import resnet50 # noqa: F401 +from .resnet import resnet101 # noqa: F401 +from .resnet import resnet152 # noqa: F401 +from .mobilenetv1 import MobileNetV1 # noqa: F401 +from .mobilenetv1 import mobilenet_v1 # noqa: F401 +from .mobilenetv2 import MobileNetV2 # noqa: F401 +from .mobilenetv2 import mobilenet_v2 # noqa: F401 +from .vgg import VGG # noqa: F401 +from .vgg import vgg11 # noqa: F401 +from .vgg import vgg13 # noqa: F401 +from .vgg import vgg16 # noqa: F401 +from .vgg import vgg19 # noqa: F401 +from .lenet import LeNet # noqa: F401 -from .resnet import * -from .mobilenetv1 import * -from .mobilenetv2 import * -from .vgg import * -from .lenet import * - -__all__ = resnet.__all__ \ - + vgg.__all__ \ - + mobilenetv1.__all__ \ - + mobilenetv2.__all__ \ - + lenet.__all__ +__all__ = [ #noqa + 'ResNet', + 'resnet18', + 'resnet34', + 'resnet50', + 'resnet101', + 'resnet152', + 'VGG', + 'vgg11', + 'vgg13', + 'vgg16', + 'vgg19', + 'MobileNetV1', + 'mobilenet_v1', + 'MobileNetV2', + 'mobilenet_v2', + 'LeNet' +] diff --git a/python/paddle/vision/models/lenet.py b/python/paddle/vision/models/lenet.py index 2fb50fc17b9e9f1f9c8af3d5c22d8f0e35c3958a..46212f46f3a487c4ea567d049d7bc200331d34b4 100644 --- a/python/paddle/vision/models/lenet.py +++ b/python/paddle/vision/models/lenet.py @@ -15,7 +15,7 @@ import paddle import paddle.nn as nn -__all__ = ['LeNet'] +__all__ = [] class LeNet(nn.Layer): diff --git a/python/paddle/vision/models/mobilenetv1.py b/python/paddle/vision/models/mobilenetv1.py index 22d177248e8b3708a37eb04b1b0eeeece8d154cf..671a2cd8dfd5f4cebf756edb397ad1f182b895ad 100644 --- a/python/paddle/vision/models/mobilenetv1.py +++ b/python/paddle/vision/models/mobilenetv1.py @@ -17,7 +17,7 @@ import paddle.nn as nn from paddle.utils.download import get_weights_path_from_url -__all__ = ['MobileNetV1', 'mobilenet_v1'] 
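# ---------------------------------------------------------------------------
# Illustrative sketch (annotation, not part of the patch): with the
# per-module __all__ lists emptied as in the hunks above and below, the
# supported entry points are the names re-exported by
# python/paddle/vision/models/__init__.py earlier in this diff.
# ---------------------------------------------------------------------------
from paddle.vision.models import LeNet, mobilenet_v1, resnet18

# constructed with default arguments only (no pretrained weights needed)
for net in (LeNet(), mobilenet_v1(), resnet18()):
    print(type(net).__name__)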
+__all__ = [] model_urls = { 'mobilenetv1_1.0': diff --git a/python/paddle/vision/models/mobilenetv2.py b/python/paddle/vision/models/mobilenetv2.py index f1cbaab1f90accc616f5a93bba8d3fd6126770fb..74071fc121688eafbf17833a6410b94d34191ec4 100644 --- a/python/paddle/vision/models/mobilenetv2.py +++ b/python/paddle/vision/models/mobilenetv2.py @@ -20,7 +20,7 @@ import paddle.nn.functional as F from paddle.utils.download import get_weights_path_from_url -__all__ = ['MobileNetV2', 'mobilenet_v2'] +__all__ = [] model_urls = { 'mobilenetv2_1.0': diff --git a/python/paddle/vision/models/resnet.py b/python/paddle/vision/models/resnet.py index 1f44e0bc6dfeb18cd1eb99489860500a390c33de..5be69c93e8b5f05f17f7d8c4503a794682a12d15 100644 --- a/python/paddle/vision/models/resnet.py +++ b/python/paddle/vision/models/resnet.py @@ -20,9 +20,7 @@ import paddle.nn as nn from paddle.utils.download import get_weights_path_from_url -__all__ = [ - 'ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152' -] +__all__ = [] model_urls = { 'resnet18': ('https://paddle-hapi.bj.bcebos.com/models/resnet18.pdparams', diff --git a/python/paddle/vision/models/vgg.py b/python/paddle/vision/models/vgg.py index f6b4c75e84f01379264fb2066b218747204fd6da..d526de8208329fb23ff4fad219db5dd706958ad8 100644 --- a/python/paddle/vision/models/vgg.py +++ b/python/paddle/vision/models/vgg.py @@ -17,13 +17,7 @@ import paddle.nn as nn from paddle.utils.download import get_weights_path_from_url -__all__ = [ - 'VGG', - 'vgg11', - 'vgg13', - 'vgg16', - 'vgg19', -] +__all__ = [] model_urls = { 'vgg16': ('https://paddle-hapi.bj.bcebos.com/models/vgg16.pdparams', diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index 60a7a90c9be89591e681192f5e886f9c5443a8c0..ef3c7efa5c7ab6ad4b3be349b80a4ab2759879e8 100644 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -22,8 +22,12 @@ from ..fluid.initializer import Normal from paddle.common_ops_import import * -__all__ = [ - 'yolo_loss', 'yolo_box', 'deform_conv2d', 'DeformConv2D', 'read_file', +__all__ = [ #noqa + 'yolo_loss', + 'yolo_box', + 'deform_conv2d', + 'DeformConv2D', + 'read_file', 'decode_jpeg' ] @@ -247,7 +251,9 @@ def yolo_box(x, downsample_ratio, clip_bbox=True, name=None, - scale_x_y=1.): + scale_x_y=1., + iou_aware=False, + iou_aware_factor=0.5): r""" This operator generates YOLO detection boxes from output of YOLOv3 network. @@ -256,7 +262,8 @@ def yolo_box(x, should be the same, H and W specify the grid size, each grid point predict given number boxes, this given number, which following will be represented as S, is specified by the number of anchors. In the second dimension(the channel - dimension), C should be equal to S * (5 + class_num), class_num is the object + dimension), C should be equal to S * (5 + class_num) if :attr:`iou_aware` is false, + otherwise C should be equal to S * (6 + class_num). class_num is the object category number of source dataset(such as 80 in coco dataset), so the second(channel) dimension, apart from 4 box location coordinates x, y, w, h, also includes confidence score of the box and class one-hot key of each anchor @@ -292,6 +299,15 @@ def yolo_box(x, score_{pred} = score_{conf} * score_{class} $$ + where the confidence scores follow the formula bellow + + .. 
math::
+
+        score_{conf} = \begin{cases}
+            obj, & \text{if } iou\_aware == false \\
+            obj^{1 - iou\_aware\_factor} * iou^{iou\_aware\_factor}, & \text{otherwise}
+        \end{cases}
+
     Args:
         x (Tensor): The input tensor of YoloBox operator is a 4-D tensor with
             shape of [N, C, H, W]. The second dimension(C) stores box
@@ -313,13 +329,14 @@ def yolo_box(x,
             should be set for the first, second, and thrid :attr:`yolo_box`
             layer.
         clip_bbox (bool): Whether clip output bonding box in :attr:`img_size`
-            boundary. Default true."
-            "
+            boundary. Default true.
         scale_x_y (float): Scale the center point of decoded bounding box.
             Default 1.0
         name (string): The default value is None. Normally there is no need for
             user to set this property. For more information, please refer
             to :ref:`api_guide_Name`
+        iou_aware (bool): Whether to use iou aware. Default false
+        iou_aware_factor (float): iou aware factor. Default 0.5
 
     Returns:
         Tensor: A 3-D tensor with shape [N, M, 4], the coordinates of boxes,
@@ -358,7 +375,8 @@ def yolo_box(x,
         boxes, scores = core.ops.yolo_box(
             x, img_size, 'anchors', anchors, 'class_num', class_num,
             'conf_thresh', conf_thresh, 'downsample_ratio', downsample_ratio,
-            'clip_bbox', clip_bbox, 'scale_x_y', scale_x_y)
+            'clip_bbox', clip_bbox, 'scale_x_y', scale_x_y, 'iou_aware',
+            iou_aware, 'iou_aware_factor', iou_aware_factor)
         return boxes, scores
 
     helper = LayerHelper('yolo_box', **locals())
@@ -378,6 +396,8 @@ def yolo_box(x,
         "downsample_ratio": downsample_ratio,
         "clip_bbox": clip_bbox,
         "scale_x_y": scale_x_y,
+        "iou_aware": iou_aware,
+        "iou_aware_factor": iou_aware_factor
     }
 
     helper.append_op(
diff --git a/python/paddle/vision/transforms/__init__.py b/python/paddle/vision/transforms/__init__.py
index f7c5b63b19ed081ee6887850c1aa3ef918715222..413f09f78699ee995f490e86a94006cd1a48c6a0 100644
--- a/python/paddle/vision/transforms/__init__.py
+++ b/python/paddle/vision/transforms/__init__.py
@@ -12,11 +12,70 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from . import transforms
-from . 
import functional +from .transforms import BaseTransform # noqa: F401 +from .transforms import Compose # noqa: F401 +from .transforms import Resize # noqa: F401 +from .transforms import RandomResizedCrop # noqa: F401 +from .transforms import CenterCrop # noqa: F401 +from .transforms import RandomHorizontalFlip # noqa: F401 +from .transforms import RandomVerticalFlip # noqa: F401 +from .transforms import Transpose # noqa: F401 +from .transforms import Normalize # noqa: F401 +from .transforms import BrightnessTransform # noqa: F401 +from .transforms import SaturationTransform # noqa: F401 +from .transforms import ContrastTransform # noqa: F401 +from .transforms import HueTransform # noqa: F401 +from .transforms import ColorJitter # noqa: F401 +from .transforms import RandomCrop # noqa: F401 +from .transforms import Pad # noqa: F401 +from .transforms import RandomRotation # noqa: F401 +from .transforms import Grayscale # noqa: F401 +from .transforms import ToTensor # noqa: F401 +from .functional import to_tensor # noqa: F401 +from .functional import hflip # noqa: F401 +from .functional import vflip # noqa: F401 +from .functional import resize # noqa: F401 +from .functional import pad # noqa: F401 +from .functional import rotate # noqa: F401 +from .functional import to_grayscale # noqa: F401 +from .functional import crop # noqa: F401 +from .functional import center_crop # noqa: F401 +from .functional import adjust_brightness # noqa: F401 +from .functional import adjust_contrast # noqa: F401 +from .functional import adjust_hue # noqa: F401 +from .functional import normalize # noqa: F401 -from .transforms import * -from .functional import * - -__all__ = transforms.__all__ \ - + functional.__all__ +__all__ = [ #noqa + 'BaseTransform', + 'Compose', + 'Resize', + 'RandomResizedCrop', + 'CenterCrop', + 'RandomHorizontalFlip', + 'RandomVerticalFlip', + 'Transpose', + 'Normalize', + 'BrightnessTransform', + 'SaturationTransform', + 'ContrastTransform', + 'HueTransform', + 'ColorJitter', + 'RandomCrop', + 'Pad', + 'RandomRotation', + 'Grayscale', + 'ToTensor', + 'to_tensor', + 'hflip', + 'vflip', + 'resize', + 'pad', + 'rotate', + 'to_grayscale', + 'crop', + 'center_crop', + 'adjust_brightness', + 'adjust_contrast', + 'adjust_hue', + 'normalize' +] diff --git a/python/paddle/vision/transforms/functional.py b/python/paddle/vision/transforms/functional.py index 18a35915c99da505678a2ab836d21dd0ace56ee6..3087d5c3ed57702e9bd4d8de7a9a2273876101c7 100644 --- a/python/paddle/vision/transforms/functional.py +++ b/python/paddle/vision/transforms/functional.py @@ -29,11 +29,7 @@ from . import functional_pil as F_pil from . import functional_cv2 as F_cv2 from . import functional_tensor as F_t -__all__ = [ - 'to_tensor', 'hflip', 'vflip', 'resize', 'pad', 'rotate', 'to_grayscale', - 'crop', 'center_crop', 'adjust_brightness', 'adjust_contrast', 'adjust_hue', - 'normalize' -] +__all__ = [] def _is_pil_image(img): diff --git a/python/paddle/vision/transforms/functional_cv2.py b/python/paddle/vision/transforms/functional_cv2.py index 99cbfd6dc4f8dd195960b776864bc523bdca2c71..38b50898be606787977c0ac0b32d7e4d6aafa050 100644 --- a/python/paddle/vision/transforms/functional_cv2.py +++ b/python/paddle/vision/transforms/functional_cv2.py @@ -33,6 +33,8 @@ else: Sequence = collections.abc.Sequence Iterable = collections.abc.Iterable +__all__ = [] + def to_tensor(pic, data_format='CHW'): """Converts a ``numpy.ndarray`` to paddle.Tensor. 
@@ -49,7 +51,7 @@ def to_tensor(pic, data_format='CHW'): """ - if not data_format in ['CHW', 'HWC']: + if data_format not in ['CHW', 'HWC']: raise ValueError('data_format should be CHW or HWC. Got {}'.format( data_format)) @@ -392,7 +394,8 @@ def adjust_hue(img, hue_factor): cv2 = try_import('cv2') if not (-0.5 <= hue_factor <= 0.5): - raise ValueError('hue_factor is not in [-0.5, 0.5].'.format(hue_factor)) + raise ValueError('hue_factor:{} is not in [-0.5, 0.5].'.format( + hue_factor)) dtype = img.dtype img = img.astype(np.uint8) diff --git a/python/paddle/vision/transforms/functional_pil.py b/python/paddle/vision/transforms/functional_pil.py index eee60c5452b2de1235c577b2eabb8de1cfdc1467..b3ff37d7ea3bb12a09da7bec9c93c6f2dd5ebd6b 100644 --- a/python/paddle/vision/transforms/functional_pil.py +++ b/python/paddle/vision/transforms/functional_pil.py @@ -41,6 +41,8 @@ _pil_interp_from_str = { 'hamming': Image.HAMMING } +__all__ = [] + def to_tensor(pic, data_format='CHW'): """Converts a ``PIL.Image`` to paddle.Tensor. @@ -57,7 +59,7 @@ def to_tensor(pic, data_format='CHW'): """ - if not data_format in ['CHW', 'HWC']: + if data_format not in ['CHW', 'HWC']: raise ValueError('data_format should be CHW or HWC. Got {}'.format( data_format)) @@ -378,7 +380,8 @@ def adjust_hue(img, hue_factor): """ if not (-0.5 <= hue_factor <= 0.5): - raise ValueError('hue_factor is not in [-0.5, 0.5].'.format(hue_factor)) + raise ValueError('hue_factor:{} is not in [-0.5, 0.5].'.format( + hue_factor)) input_mode = img.mode if input_mode in {'L', '1', 'I', 'F'}: diff --git a/python/paddle/vision/transforms/functional_tensor.py b/python/paddle/vision/transforms/functional_tensor.py index 7f490d57916fbcb67475cd433b09771d13261128..1ec67416998a3d03e391922ad078b827812661bf 100644 --- a/python/paddle/vision/transforms/functional_tensor.py +++ b/python/paddle/vision/transforms/functional_tensor.py @@ -23,6 +23,8 @@ import paddle.nn.functional as F import sys import collections +__all__ = [] + def _assert_image_tensor(img, data_format): if not isinstance( diff --git a/python/paddle/vision/transforms/transforms.py b/python/paddle/vision/transforms/transforms.py index 00e12689c4d9fe41e67798309fee42ce63d0f7a5..27eca19c28be6ccfada7aeaa56a5615d1b6aa2f7 100644 --- a/python/paddle/vision/transforms/transforms.py +++ b/python/paddle/vision/transforms/transforms.py @@ -35,13 +35,7 @@ else: Sequence = collections.abc.Sequence Iterable = collections.abc.Iterable -__all__ = [ - "BaseTransform", "Compose", "Resize", "RandomResizedCrop", "CenterCrop", - "RandomHorizontalFlip", "RandomVerticalFlip", "Transpose", "Normalize", - "BrightnessTransform", "SaturationTransform", "ContrastTransform", - "HueTransform", "ColorJitter", "RandomCrop", "Pad", "RandomRotation", - "Grayscale", "ToTensor" -] +__all__ = [] def _get_image_size(img): @@ -854,13 +848,13 @@ class ColorJitter(BaseTransform): """Randomly change the brightness, contrast, saturation and hue of an image. Args: - brightness: How much to jitter brightness. + brightness (float): How much to jitter brightness. Chosen uniformly from [max(0, 1 - brightness), 1 + brightness]. Should be non negative numbers. - contrast: How much to jitter contrast. + contrast (float): How much to jitter contrast. Chosen uniformly from [max(0, 1 - contrast), 1 + contrast]. Should be non negative numbers. - saturation: How much to jitter saturation. + saturation (float): How much to jitter saturation. Chosen uniformly from [max(0, 1 - saturation), 1 + saturation]. Should be non negative numbers. 
- hue: How much to jitter hue. + hue (float): How much to jitter hue. Chosen uniformly from [-hue, hue]. Should have 0<= hue <= 0.5. keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None. diff --git a/python/setup.py.in b/python/setup.py.in index 98d05c367f162330fd4f19d046c311bf01480399..866c2b400d5ca62559a24e7eb50fc2370c8bbe40 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -213,6 +213,7 @@ packages=['paddle', 'paddle.nn', 'paddle.nn.functional', 'paddle.nn.layer', + 'paddle.nn.quant', 'paddle.nn.initializer', 'paddle.nn.utils', 'paddle.metric', diff --git a/tools/get_pr_ut.py b/tools/get_pr_ut.py index 78d9978c4bc45e0917c6de71a8220ca62959f028..93337978393498af1cba17d638e6076fa7fa7b84 100644 --- a/tools/get_pr_ut.py +++ b/tools/get_pr_ut.py @@ -112,7 +112,7 @@ class PRChecker(object): print(e) print( 'PREC download {} error, retry {} time(s) after {} secs.[proxy_option={}]'. - format(url, ix, ix * 10, proxy)) + format(url, ix, ix * 10, cur_proxy)) continue else: return True @@ -179,7 +179,7 @@ class PRChecker(object): def get_comment_of_file(self, f): #content = self.repo.get_contents(f.replace(PADDLE_ROOT, ''), 'pull/').decoded_content #todo: get file from github - with open(f) as fd: + with open(f, encoding="utf-8") as fd: lines = fd.readlines() lineno = 1 inputs = '' diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py index 70d7fb98cb5387d228805119e2c6a07347e5d45d..fbc0b767eff44fcfebf5cf248eee50ba8c0374ad 100644 --- a/tools/parallel_UT_rule.py +++ b/tools/parallel_UT_rule.py @@ -634,9 +634,6 @@ TETRAD_PARALLEL_JOB = [ 'test_analyzer_bert', 'test_analyzer_googlenet', 'test_fleet_base', - 'test_imperative_container_layerdict', - 'test_set_value_op', - 'test_view_op_reuse_allocation', 'test_sequential', 'test_sequential', 'test_imperative_layers', diff --git a/tools/print_signatures.py b/tools/print_signatures.py index 6de9d84379fea595aa7497f9d22ce66f0e9f0c6f..3fa9e9b782c1ae07994bd867ad124bca95780913 100644 --- a/tools/print_signatures.py +++ b/tools/print_signatures.py @@ -27,11 +27,25 @@ import pydoc import hashlib import platform import functools +import pkgutil +import logging +import paddle member_dict = collections.OrderedDict() visited_modules = set() +logger = logging.getLogger() +if logger.handlers: + # we assume the first handler is the one we want to configure + console = logger.handlers[0] +else: + console = logging.StreamHandler(sys.stderr) + logger.addHandler(console) +console.setFormatter( + logging.Formatter( + "%(asctime)s - %(funcName)s:%(lineno)d - %(levelname)s - %(message)s")) + def md5(doc): try: @@ -199,11 +213,124 @@ def visit_all_module(mod): visit_member(mod.__name__, instance) +# all from gen_doc.py +api_info_dict = {} # used by get_all_api + + +# step 1: walkthrough the paddle package to collect all the apis in api_set +def get_all_api(root_path='paddle', attr="__all__"): + """ + walk through the paddle package to collect all the apis. 
+ """ + global api_info_dict + api_counter = 0 + for filefinder, name, ispkg in pkgutil.walk_packages( + path=paddle.__path__, prefix=paddle.__name__ + '.'): + try: + if name in sys.modules: + m = sys.modules[name] + else: + # importlib.import_module(name) + m = eval(name) + continue + except AttributeError: + logger.warning("AttributeError occurred when `eval(%s)`", name) + pass + else: + api_counter += process_module(m, attr) + + api_counter += process_module(paddle, attr) + + logger.info('%s: collected %d apis, %d distinct apis.', attr, api_counter, + len(api_info_dict)) + + return [api_info['all_names'][0] for api_info in api_info_dict.values()] + + +def insert_api_into_dict(full_name, gen_doc_anno=None): + """ + insert add api into the api_info_dict + Return: + api_info object or None + """ + try: + obj = eval(full_name) + fc_id = id(obj) + except AttributeError: + logger.warning("AttributeError occurred when `id(eval(%s))`", full_name) + return None + except: + logger.warning("Exception occurred when `id(eval(%s))`", full_name) + return None + else: + logger.debug("adding %s to api_info_dict.", full_name) + if fc_id in api_info_dict: + api_info_dict[fc_id]["all_names"].add(full_name) + else: + api_info_dict[fc_id] = { + "all_names": set([full_name]), + "id": fc_id, + "object": obj, + "type": type(obj).__name__, + } + docstr = inspect.getdoc(obj) + if docstr: + api_info_dict[fc_id]["docstring"] = inspect.cleandoc(docstr) + if gen_doc_anno: + api_info_dict[fc_id]["gen_doc_anno"] = gen_doc_anno + return api_info_dict[fc_id] + + +# step 1 fill field : `id` & `all_names`, type, docstring +def process_module(m, attr="__all__"): + api_counter = 0 + if hasattr(m, attr): + # may have duplication of api + for api in set(getattr(m, attr)): + if api[0] == '_': continue + # Exception occurred when `id(eval(paddle.dataset.conll05.test, get_dict))` + if ',' in api: continue + + # api's fullname + full_name = m.__name__ + "." + api + api_info = insert_api_into_dict(full_name) + if api_info is not None: + api_counter += 1 + if inspect.isclass(api_info['object']): + for name, value in inspect.getmembers(api_info['object']): + if (not name.startswith("_")) and hasattr(value, + '__name__'): + method_full_name = full_name + '.' 
+ name # value.__name__ + method_api_info = insert_api_into_dict( + method_full_name, 'class_method') + if method_api_info is not None: + api_counter += 1 + return api_counter + + +def get_all_api_from_modulelist(): + modulelist = [ + paddle, paddle.amp, paddle.nn, paddle.nn.functional, + paddle.nn.initializer, paddle.nn.utils, paddle.static, paddle.static.nn, + paddle.io, paddle.jit, paddle.metric, paddle.distribution, + paddle.optimizer, paddle.optimizer.lr, paddle.regularizer, paddle.text, + paddle.utils, paddle.utils.download, paddle.utils.profiler, + paddle.utils.cpp_extension, paddle.sysconfig, paddle.vision, + paddle.distributed, paddle.distributed.fleet, + paddle.distributed.fleet.utils, paddle.distributed.parallel, + paddle.distributed.utils, paddle.callbacks, paddle.hub, paddle.autograd + ] + for m in modulelist: + visit_all_module(m) + + return member_dict + + if __name__ == '__main__': - import paddle - modules = sys.argv[1].split(",") - for m in modules: - visit_all_module(importlib.import_module(m)) + # modules = sys.argv[1].split(",") + # for m in modules: + # visit_all_module(importlib.import_module(m)) + get_all_api_from_modulelist() for name in member_dict: print(name, member_dict[name]) diff --git a/tools/sampcd_processor.py b/tools/sampcd_processor.py index a1658e3c2edf790ba817ae3a098a5d660a94a050..0ac6c929c5d758eaa3775c2032dada30e909869d 100644 --- a/tools/sampcd_processor.py +++ b/tools/sampcd_processor.py @@ -39,14 +39,13 @@ if logger.handlers: console = logger.handlers[ 0] # we assume the first handler is the one we want to configure else: - console = logging.StreamHandler() + console = logging.StreamHandler(stream=sys.stderr) logger.addHandler(console) console.setFormatter(logging.Formatter("%(message)s")) RUN_ON_DEVICE = 'cpu' SAMPLE_CODE_TEST_CAPACITY = set() GPU_ID = 0 -methods = [] whl_error = [] API_DEV_SPEC_FN = 'paddle/fluid/API_DEV.spec' API_PR_SPEC_FN = 'paddle/fluid/API_PR.spec' @@ -247,13 +246,15 @@ def is_required_match(requirestr, cbtitle='not-specified'): False - not match None - skipped # trick """ - global SAMPLE_CODE_TEST_CAPACITY # readonly + global SAMPLE_CODE_TEST_CAPACITY, RUN_ON_DEVICE # readonly requires = set(['cpu']) if requirestr: for r in requirestr.split(','): rr = r.strip().lower() if rr: requires.add(rr) + else: + requires.add(RUN_ON_DEVICE) if 'skip' in requires or 'skiptest' in requires: logger.info('%s: skipped', cbtitle) return None @@ -283,8 +284,8 @@ def insert_codes_into_codeblock(codeblock, apiname='not-specified'): cpu_str = '\nimport os\nos.environ["CUDA_VISIBLE_DEVICES"] = ""\n' gpu_str = '\nimport os\nos.environ["CUDA_VISIBLE_DEVICES"] = "{}"\n'.format( GPU_ID) - if 'required' in codeblock: - if codeblock['required'] is None or codeblock['required'] == 'cpu': + if 'required' in codeblock and codeblock['required']: + if codeblock['required'] == 'cpu': inserted_codes_f = cpu_str elif codeblock['required'] == 'gpu': inserted_codes_f = gpu_str @@ -426,20 +427,25 @@ stdout: %s return result, tfname, msg, end_time - start_time -def get_filenames(): +def get_filenames(full_test=False): ''' this function will get the sample code files that pending for check. + Args: + full_test: the full apis or the increment + Returns: dict: the sample code files pending for check . 
''' - global methods # write global whl_error import paddle whl_error = [] - get_incrementapi() + if full_test: + get_full_api() + else: + get_incrementapi() all_sample_code_filenames = {} with open(API_DIFF_SPEC_FN) as f: for line in f.readlines(): @@ -472,8 +478,9 @@ def get_api_md5(path): api_md5(dict): key is the api's real fullname, value is the md5sum. """ api_md5 = {} - API_spec = '%s/%s' % (os.path.abspath(os.path.join(os.getcwd(), "..")), - path) + API_spec = os.path.abspath(os.path.join(os.getcwd(), "..", path)) + if not os.path.isfile(API_spec): + return api_md5 pat = re.compile(r'\((paddle[^,]+)\W*document\W*([0-9a-z]{32})') patArgSpec = re.compile( r'^(paddle[^,]+)\s+\(ArgSpec.*document\W*([0-9a-z]{32})') @@ -487,6 +494,28 @@ def get_api_md5(path): return api_md5 +def get_full_api(): + """ + get all the apis + """ + global API_DIFF_SPEC_FN ## readonly + from print_signatures import get_all_api_from_modulelist + member_dict = get_all_api_from_modulelist() + with open(API_DIFF_SPEC_FN, 'w') as f: + f.write("\n".join(member_dict.keys())) + + +def get_full_api_by_walk(): + """ + get all the apis + """ + global API_DIFF_SPEC_FN ## readonly + from print_signatures import get_all_api + apilist = get_all_api() + with open(API_DIFF_SPEC_FN, 'w') as f: + f.write("\n".join(apilist)) + + def get_incrementapi(): ''' this function will get the apis that difference between API_DEV.spec and API_PR.spec. @@ -526,6 +555,7 @@ def parse_args(): # help='Use CPU mode (overrides --gpu)') # parser.add_argument('--gpu', dest='gpu_mode', action="store_true") parser.add_argument('--debug', dest='debug', action="store_true") + parser.add_argument('--full-test', dest='full_test', action="store_true") parser.add_argument('mode', type=str, help='run on device', default='cpu') for item in arguments: parser.add_argument( @@ -545,6 +575,8 @@ if __name__ == '__main__': args = parse_args() if args.debug: logger.setLevel(logging.DEBUG) + else: + logger.setLevel(logging.INFO) if args.logf: logfHandler = logging.FileHandler(args.logf) logfHandler.setFormatter( @@ -573,7 +605,7 @@ if __name__ == '__main__': else: os.mkdir(SAMPLECODE_TEMPDIR) - filenames = get_filenames() + filenames = get_filenames(args.full_test) if len(filenames) == 0 and len(whl_error) == 0: logger.info("-----API_PR.spec is the same as API_DEV.spec-----") exit(0) @@ -593,6 +625,8 @@ if __name__ == '__main__': if not args.debug: shutil.rmtree(SAMPLECODE_TEMPDIR) + stdout_handler = logging.StreamHandler(stream=sys.stdout) + logger.addHandler(stdout_handler) logger.info("----------------End of the Check--------------------") if len(whl_error) != 0: logger.info("%s is not in whl.", whl_error) diff --git a/tools/windows/run_unittests.sh b/tools/windows/run_unittests.sh index 68d7ef336edba6ee805eb83753d8cd0d0ac383f6..4dbacbaa59a5da3f8025f7dd8ace1dfd46519c04 100644 --- a/tools/windows/run_unittests.sh +++ b/tools/windows/run_unittests.sh @@ -195,7 +195,7 @@ if [ ${WITH_GPU:-OFF} == "ON" ];then num=$(ctest -N | awk -F ': ' '{print $2}' | sed '/^$/d' | sed '$d' | wc -l) echo "Windows 1 card TestCases count is $num" if [ ${PRECISION_TEST:-OFF} == "ON" ]; then - python ${PADDLE_ROOT}/tools/get_pr_ut.py + python ${PADDLE_ROOT}/tools/get_pr_ut.py || echo "Failed to obtain ut_list !" if [[ -f "ut_list" ]]; then echo "PREC length: "`wc -l ut_list` precision_cases=`cat ut_list`
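# ---------------------------------------------------------------------------
# Illustrative sketch (annotation, not part of the patch): how the new
# full-test path added to tools/sampcd_processor.py builds on the helpers
# added to tools/print_signatures.py. Requires an installed paddle wheel;
# everything beyond the function names shown in the hunks above is an
# assumption of this example.
# ---------------------------------------------------------------------------
from print_signatures import get_all_api, get_all_api_from_modulelist

# walk the installed paddle package and collect names exported via __all__
apis = get_all_api(attr='__all__')
print('pkgutil walk collected %d distinct public APIs' % len(apis))

# or scan the curated module list that get_full_api() relies on
members = get_all_api_from_modulelist()
print('module-list scan recorded %d members' % len(members))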