diff --git a/CMakeLists.txt b/CMakeLists.txt
index b39e48505c653e3163be2f49810dc3dc6ffb2c2f..bbdba79d2f2ded208aa70c5d99a51b4d364992b7 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -79,7 +79,6 @@ find_package(Threads REQUIRED)
 include(simd)
 
 ################################ Exposed Configurations #######################################
-option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" ON)
 option(WITH_AVX         "Compile PaddlePaddle with AVX intrinsics"      ${AVX_FOUND})
 option(WITH_PYTHON      "Compile PaddlePaddle with python interpreter"  ON)
 option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        OFF)
@@ -107,6 +106,7 @@ option(SANITIZER_TYPE "Choose the type of sanitizer, options are: Address, Leak,
 option(WITH_LITE   "Compile Paddle Fluid with Lite Engine" OFF)
 option(WITH_NCCL   "Compile PaddlePaddle with NCCL support"             ON)
 option(WITH_CRYPTO   "Compile PaddlePaddle with crypto support"         ON)
+option(WITH_ARM   "Compile PaddlePaddle with arm support"         OFF)
 
 # PY_VERSION
 if(NOT PY_VERSION)
@@ -168,6 +168,9 @@ if(WITH_BRPC_RDMA)
     endif()
 endif()
 
+# lite subgraph compilation depends on CUDNN_ROOT,
+# so include(cudnn) needs to be in front of include(third_party/lite)
+include(cudnn)              # set cudnn libraries, must before configure
 include(third_party)        # download, build, install third_party
 
 if(WITH_DISTRIBUTE)
@@ -187,7 +190,6 @@ if(NOT WIN32)
 endif()
 
 include(flags)              # set paddle compile flags
-include(cudnn)              # set cudnn libraries, must before configure
 
 if(WITH_GPU)
     include(cuda)
@@ -213,6 +215,12 @@ if(WITH_AMD_GPU)
     include(hip)
 endif(WITH_AMD_GPU)
 
+if(WITH_ARM)
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
+    add_definitions(-DPADDLE_WITH_ARM)
+endif()
+
 set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")
 
 set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index 1814656d244088bbf06611109d18caea73ad6a45..bb57b42dcc74114312a400a0f6cc95df307de6bb 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -16,10 +16,6 @@ if(NOT WITH_PYTHON)
     add_definitions(-DPADDLE_NO_PYTHON)
 endif(NOT WITH_PYTHON)
 
-if(WITH_DSO)
-    add_definitions(-DPADDLE_USE_DSO)
-endif(WITH_DSO)
-
 if(WITH_TESTING)
     add_definitions(-DPADDLE_WITH_TESTING)
 endif(WITH_TESTING)
@@ -70,10 +66,6 @@ endif()
 if(WITH_GPU)
     add_definitions(-DPADDLE_WITH_CUDA)
     add_definitions(-DEIGEN_USE_GPU)
-    # The compiler fully support const expressions since c++14,
-    # but Eigen use some const expressions such as std::max and std::min, which are not supported in c++11
-    # use following definition to set EIGEN_HAS_CONSTEXPR=0 to avoid compilation error in c++11
-    add_definitions(-DEIGEN_MAX_CPP_VER=11)
 
     FIND_PACKAGE(CUDA REQUIRED)
 
diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
index 6dcdb0f853f609fe61f6bc28055f53e9071d9075..1688d9d98b7a079b32322d03c46cd6d89b717881 100644
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -188,12 +188,6 @@ endif()
 
 add_definitions("-DPADDLE_CUDA_BINVER=\"${CUDA_VERSION_MAJOR}${CUDA_VERSION_MINOR}\"")
 
-if(NOT WITH_DSO)
-    if(WIN32)
-      set_property(GLOBAL PROPERTY CUDA_MODULES ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${CUDA_cusolver_LIBRARY})
-    endif(WIN32)
-endif(NOT WITH_DSO)
-
 # setting nvcc arch flags
 select_nvcc_arch_flags(NVCC_FLAGS_EXTRA)
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${NVCC_FLAGS_EXTRA}")
diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake
index 30e74a3c241dcb48aa3bb9356dcc42a46283045c..631803da31d5a0fad1534d266f70f4708882f7e8 100644
--- a/cmake/external/eigen.cmake
+++ b/cmake/external/eigen.cmake
@@ -49,9 +49,14 @@ elseif(LINUX)
     # refer to: https://gitlab.com/libeigen/eigen/-/blob/4da2c6b1974827b1999bab652a3d4703e1992d26/Eigen/src/Core/arch/SSE/PacketMath.h#L33-60
     # add -fabi-version=4 could avoid above error, but will cause "double free corruption" when compile with gcc8
     # so use following patch to solve compilation error with different version of gcc.
-    file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Geometry_SSE.h native_src)
-    file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Geometry/arch/Geometry_SSE.h native_dst)
-    set(EIGEN_PATCH_COMMAND cp ${native_src} ${native_dst})
+    file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Geometry_SSE.h native_src1)
+    file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Geometry/arch/Geometry_SSE.h native_dst1)
+    # The compiler fully support const expressions since c++14,
+    # but Eigen use some const expressions such as std::max and std::min, which are not supported in c++11
+    # add patch to avoid compilation error in c++11
+    file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/MathFunctions.h native_src2)
+    file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/MathFunctions.h native_dst2)
+    set(EIGEN_PATCH_COMMAND cp ${native_src1} ${native_dst1} && cp ${native_src2} ${native_dst2})
 endif()
 
 set(EIGEN_INCLUDE_DIR ${EIGEN_SOURCE_DIR})
diff --git a/cmake/external/lite.cmake b/cmake/external/lite.cmake
index 451c621eac1394743e189cedc36b28a713444d5e..9d7c21108601491744e81b15b48ec0cd31d9bf1d 100644
--- a/cmake/external/lite.cmake
+++ b/cmake/external/lite.cmake
@@ -25,7 +25,7 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR)
   set(LITE_INSTALL_DIR ${THIRD_PARTY_PATH}/install/lite)
 
   if(NOT LITE_GIT_TAG)
-    set(LITE_GIT_TAG 34c29406c27ee00cef033a98887403443eb2565f)
+    set(LITE_GIT_TAG ab8af5c4b4dc5b40217633e0aa436315912d7b53)
   endif()
 
   if(NOT CUDA_ARCH_NAME)
@@ -93,6 +93,7 @@ function(external_lite_static_libs alias path)
 endfunction()
 
 external_lite_static_libs(lite_full_static ${LITE_BINARY_DIR}/inference_lite_lib/cxx/lib/libpaddle_full_api_shared.so)
+set(LITE_SHARED_LIB ${LITE_BINARY_DIR}/inference_lite_lib/cxx/lib/libpaddle_full_api_shared.so)
 
 add_definitions(-DPADDLE_WITH_LITE)
 add_definitions(-DLITE_WITH_LOG)
diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake
index 98bbf59cefbd623df26a0620a24d400399cf31a4..9f3606138defa04f979d8bea348e7bfda181af68 100644
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -36,28 +36,12 @@ SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/${LIBDIR
 
 INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR}) # For MKLDNN code to include internal headers.
 
-IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
-    SET(MKLDNN_DEPENDS   ${MKLML_PROJECT})
-    MESSAGE(STATUS "Build MKLDNN with MKLML ${MKLML_ROOT}")
-ELSE()
-    MESSAGE(STATUS "Build MKLDNN without MKLML")
-ENDIF()
 
 IF(NOT WIN32)
     SET(MKLDNN_FLAG "-Wno-error=strict-overflow -Wno-error=unused-result -Wno-error=array-bounds")
     SET(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value")
     SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}")
     SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}")
-
-    IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
-    # Force libmkldnn.so to link libiomp5.so (provided by intel mkl) instead of libgomp.so (provided by gcc),
-    # since core_avx.so links libiomp5.so 
-        set(MKLDNN_SHARED_LINKER_FLAG "${CMAKE_SHARED_LINKER_FLAGS} -Wl,--no-as-needed -L${MKLML_LIB_DIR} -liomp5")
-        set(FORBID "-fopenmp")
-    ELSE()
-        set(MKLDNN_SHARED_LINKER_FLAG "${CMAKE_SHARED_LINKER_FLAGS}")
-        set(FORBID "")        
-    ENDIF()
 ELSE()
     SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} /EHsc")
 ENDIF(NOT WIN32)
@@ -91,8 +75,6 @@ ExternalProject_Add(
                         -DCMAKE_C_FLAGS=${MKLDNN_CFLAG}
                         -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG}
                         -DDNNL_BUILD_TESTS=OFF -DDNNL_BUILD_EXAMPLES=OFF
-                        -DCMAKE_SHARED_LINKER_FLAGS=${MKLDNN_SHARED_LINKER_FLAG}
-                        -DCMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS=${FORBID}
     CMAKE_CACHE_ARGS    -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR}
 )
 if(WIN32)
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index 162ea532faca8477e15f899a163caaf154a88098..5e47f268a36699b7e2310c5f5b2c20bcf6f18f1b 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -19,6 +19,9 @@ SET(CBLAS_SOURCE_DIR  ${THIRD_PARTY_PATH}/openblas/src/extern_openblas)
 SET(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas)
 SET(CBLAS_REPOSITORY  https://github.com/xianyi/OpenBLAS.git)
 SET(CBLAS_TAG         v0.3.7)
+IF(WITH_ARM)
+    SET(CBLAS_TAG v0.2.18)
+ENDIF()
 cache_third_party(extern_openblas
     REPOSITORY    ${CBLAS_REPOSITORY}
     TAG           ${CBLAS_TAG}
diff --git a/cmake/flags.cmake b/cmake/flags.cmake
index 0c12846efe5dcf8cc7f433c1ce2d3cfa5458d881..e6a77c38ab5c0f5178669d9a4d18c571b638fb21 100644
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -187,7 +187,7 @@ set(GPU_COMMON_FLAGS
     -Wno-error=unused-function  # Warnings in Numpy Header.
     -Wno-error=array-bounds # Warnings in Eigen::array
 )
-if (NOT WITH_NV_JETSON)
+if (NOT WITH_NV_JETSON AND NOT WITH_ARM)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64")
 endif()
 endif(NOT WIN32)
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index a47ffa71b6c674e019e9fe895d2bda113f765079..69f4ccae88471dfd5caf1ef2410c5aeefab7db3c 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -89,6 +89,8 @@
 
 # including binary directory for generated headers.
 include_directories(${CMAKE_CURRENT_BINARY_DIR})
+# including io directory for inference lib paddle_api.h
+include_directories("${PADDLE_SOURCE_DIR}/paddle/fluid/framework/io")
 
 if(NOT APPLE)
   find_package(Threads REQUIRED)
diff --git a/cmake/hip.cmake b/cmake/hip.cmake
index c3a748db502037f926dc241e4c3bc26a83ad3468..27ecd50e886b722efeb8c8e92b6dc3d854602650 100644
--- a/cmake/hip.cmake
+++ b/cmake/hip.cmake
@@ -13,10 +13,6 @@ include_directories("/opt/rocm/thrust")
 
 set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -std=c++11" )
 
-if(WITH_DSO)
-  set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_USE_DSO")
-endif(WITH_DSO)
-
 if(WITH_TESTING)
   set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_TESTING")
 endif(WITH_TESTING)
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index 8c2549cc0516f88093fa432848516e39a38f0959..6fc81f2387b78cce10f9c099a022b2372993c4f9 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -107,6 +107,11 @@ function(copy_part_of_thrid_party TARGET DST)
             SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES}
             DSTS ${dst_dir} ${dst_dir}/lib)
 
+        set(dst_dir "${DST}/third_party/install/cryptopp")
+        copy(${TARGET}
+        SRCS ${CRYPTOPP_INCLUDE_DIR} ${CRYPTOPP_LIBRARIES}
+        DSTS ${dst_dir} ${dst_dir}/lib)
+
     set(dst_dir "${DST}/third_party/install/xxhash")
     copy(${TARGET}
         SRCS ${XXHASH_INCLUDE_DIR} ${XXHASH_LIBRARIES}
@@ -178,7 +183,10 @@ endif()
 copy(inference_lib_dist
         SRCS  ${CMAKE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
         DSTS  ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include/internal)
-
+copy(inference_lib_dist
+        SRCS  ${CMAKE_BINARY_DIR}/../paddle/fluid/framework/io/crypto/cipher.h
+        DSTS  ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include/crypto/)
+include_directories(${CMAKE_BINARY_DIR}/../paddle/fluid/framework/io)
 # CAPI inference library for only inference
 set(FLUID_INFERENCE_C_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_inference_c_install_dir" CACHE STRING
 "A path setting CAPI fluid inference shared")
diff --git a/cmake/tensorrt.cmake b/cmake/tensorrt.cmake
index b209c0d0b6fe56b9b212d78f43b20e9bff350493..d715dfd0dbe6d1efba395fc375d124d8900aeae5 100644
--- a/cmake/tensorrt.cmake
+++ b/cmake/tensorrt.cmake
@@ -37,16 +37,10 @@ find_library(TENSORRT_LIBRARY NAMES ${TR_INFER_LIB} ${TR_INFER_RT}
     DOC "Path to TensorRT library.")
 
 if(TENSORRT_INCLUDE_DIR AND TENSORRT_LIBRARY)
-    if(WITH_DSO)
-        set(TENSORRT_FOUND ON)
-    endif(WITH_DSO)
+    set(TENSORRT_FOUND ON)
 else()
     set(TENSORRT_FOUND OFF)
-    if(WITH_DSO)
-        message(WARNING "TensorRT is NOT found when WITH_DSO is ON.")
-    else(WITH_DSO)
-        message(STATUS "TensorRT is disabled because WITH_DSO is OFF.")
-    endif(WITH_DSO)
+    message(STATUS "TensorRT is disabled.")
 endif()
 
 if(TENSORRT_FOUND)
diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
new file mode 100644
index 0000000000000000000000000000000000000000..fc5aa1114835944468ee1c2c67072551c2425e28
--- /dev/null
+++ b/paddle/fluid/API.spec
@@ -0,0 +1,3 @@
+paddle.fluid.optimizer.PipelineOptimizer (paddle.fluid.optimizer.PipelineOptimizer, ('document', '2e55a29dbeb874934f7a1a1af3a22b8c'))
+paddle.fluid.optimizer.PipelineOptimizer.__init__ (ArgSpec(args=['self', 'optimizer', 'num_microbatches', 'start_cpu_core_id'], varargs=None, keywords=None, defaults=(1, 0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.optimizer.PipelineOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index cfc1c2a5f5ffb218dc5bd801046ab8495bcff09d..e1cb683e1ecf12d507a954003a8fae6312b85324 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -155,22 +155,31 @@ nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
 if(WITH_PYTHON)
   py_proto_compile(framework_py_proto SRCS framework.proto data_feed.proto)
   py_proto_compile(trainer_py_proto SRCS trainer_desc.proto data_feed.proto)
+  py_proto_compile(distributed_strategy_py_proto SRCS distributed_strategy.proto)
 #Generate an empty \
     #__init__.py to make framework_py_proto as a valid python module.
   add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
-  add_dependencies(framework_py_proto framework_py_proto_init trainer_py_proto)
+  add_dependencies(framework_py_proto framework_py_proto_init trainer_py_proto distributed_strategy_py_proto)
   if (NOT WIN32)
     add_custom_command(TARGET framework_py_proto POST_BUILD
       COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto
+      COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fleet/proto
+      COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/fleet/proto/__init__.py
       COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/
+      COMMAND cp distributed_strategy_*.py ${PADDLE_BINARY_DIR}/python/paddle/fleet/proto
       COMMENT "Copy generated python proto into directory paddle/fluid/proto."
       WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
   else(NOT WIN32)
     string(REPLACE "/" "\\" proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/")
+    string(REPLACE "/" "\\" fleet_proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/fleet/proto/")
     add_custom_command(TARGET framework_py_proto POST_BUILD
           COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto
+	  COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fleet/proto
+	  COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/fleet/proto/__init__.py
           COMMAND copy /Y *.py ${proto_dstpath}
+	  COMMAND copy /Y distributed_strategy_*.py ${fleet_proto_dstpath}
           COMMENT "Copy generated python proto into directory paddle/fluid/proto."
+	  COMMENT "Copy generated python proto into directory paddle/fleet/proto."
           WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
   endif(NOT WIN32)
 endif()
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index b9535ee493892667c99d3da35c8ab8462c4e589e..4d8bd101258664f6cafd71784ae070e0cb8b9215 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -101,6 +101,8 @@ cc_library(fast_threaded_ssa_graph_executor SRCS fast_threaded_ssa_graph_executo
         DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context)
 cc_test(fused_broadcast_op_test SRCS fused_broadcast_op_handle_test.cc DEPS fused_broadcast_op_handle)
 
+cc_test(exception_holder_test SRCS exception_holder_test.cc )
+
 set(IR_PASS_DEPS graph_viz_pass multi_devices_graph_pass
     multi_devices_graph_print_pass multi_devices_graph_check_pass
     fuse_elewise_add_act_pass fuse_bn_act_pass 
diff --git a/paddle/fluid/framework/details/exception_holder.h b/paddle/fluid/framework/details/exception_holder.h
index 6bb5a2954b17beba1703f9eacd4bf36bf58faa8c..f378566b60ec6b25bac0f6ef01b36d4964e4e9a0 100644
--- a/paddle/fluid/framework/details/exception_holder.h
+++ b/paddle/fluid/framework/details/exception_holder.h
@@ -15,9 +15,11 @@
 #pragma once
 
 #include <memory>
+#include <mutex>  // NOLINT
 #include <string>
 
 #include "glog/logging.h"
+#include "paddle/fluid/memory/allocation/allocator.h"
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
@@ -29,15 +31,16 @@ class ExceptionHolder {
   void Catch(std::exception_ptr eptr) {
     try {
       std::rethrow_exception(eptr);
+    } catch (memory::allocation::BadAlloc& exp) {
+      Catch(exp);
     } catch (platform::EOFException& exp) {
       Catch(exp);
     } catch (platform::EnforceNotMet& exp) {
       Catch(exp);
     } catch (std::exception& ex) {
-      PADDLE_THROW(platform::errors::Fatal(
-          "Unknown std::exception caught:\n%s.", ex.what()));
+      Catch(ex);
     } catch (...) {
-      PADDLE_THROW(platform::errors::Fatal("Unknown exception caught."));
+      LOG(FATAL) << "Unknown exception caught.";
     }
   }
 
@@ -59,6 +62,15 @@ class ExceptionHolder {
         auto e = *static_cast<platform::EOFException*>(exception_.get());
         throw e;
       }
+      case kBadAlloc: {
+        auto e = *static_cast<paddle::memory::allocation::BadAlloc*>(
+            exception_.get());
+        throw e;
+      }
+      case kBaseException: {
+        auto e = *static_cast<std::exception*>(exception_.get());
+        throw e;
+      }
     }
     ClearImpl();
   }
@@ -79,6 +91,12 @@ class ExceptionHolder {
       case kEOF: {
         return "EOF";
       }
+      case kBadAlloc: {
+        return "BadAlloc";
+      }
+      case kBaseException: {
+        return "BaseException";
+      }
     }
     return "unknown";
   }
@@ -89,10 +107,31 @@ class ExceptionHolder {
     type_ = kNone;
   }
 
+  // NOTE: currently in PE, multiple exceptions may occured  in multiple
+  // threads, and the exception that occur later will overwrite that
+  // occur earlier, but what we want should be the first triggered exception.
+  // However, EOF exception is lower priority exception and can be overwritten,
+  // but other exceptions should not be prioritized.
   void Catch(const platform::EnforceNotMet& exp) {
     std::lock_guard<std::mutex> lock(mu_);
-    exception_.reset(new platform::EnforceNotMet(exp));
-    type_ = kEnforceNotMet;
+    if (exception_.get() == nullptr || type_ == kEOF) {
+      exception_.reset(new platform::EnforceNotMet(exp));
+      type_ = kEnforceNotMet;
+    } else {
+      VLOG(2) << "Non-first exception is discarded, the error message is"
+              << exception_->what();
+    }
+  }
+
+  void Catch(const memory::allocation::BadAlloc& exp) {
+    std::lock_guard<std::mutex> lock(mu_);
+    if (exception_.get() == nullptr || type_ == kEOF) {
+      exception_.reset(new paddle::memory::allocation::BadAlloc(exp));
+      type_ = kBadAlloc;
+    } else {
+      VLOG(2) << "Non-first exception is discarded, the error message is"
+              << exception_->what();
+    }
   }
 
   void Catch(const platform::EOFException& exp) {
@@ -101,10 +140,24 @@ class ExceptionHolder {
     if (exception_.get() == nullptr) {
       exception_.reset(new platform::EOFException(exp));
       type_ = kEOF;
+    } else {
+      VLOG(2) << "EOFException is skip, the error message of EOFException is "
+              << exception_->what();
+    }
+  }
+
+  void Catch(const std::exception& exp) {
+    std::lock_guard<std::mutex> lock(mu_);
+    if (exception_.get() == nullptr || type_ == kEOF) {
+      exception_.reset(new std::exception(exp));
+      type_ = kBaseException;
+    } else {
+      VLOG(2) << "Non-first exception is discarded, the error message is"
+              << exception_->what();
     }
   }
 
-  enum ExceptionType { kNone, kEnforceNotMet, kEOF };
+  enum ExceptionType { kNone, kEnforceNotMet, kEOF, kBadAlloc, kBaseException };
   ExceptionType type_{kNone};
 
   std::unique_ptr<std::exception> exception_;
diff --git a/paddle/fluid/framework/details/exception_holder_test.cc b/paddle/fluid/framework/details/exception_holder_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c20563a08605086d6fd65506b5d0176bb8dce8bb
--- /dev/null
+++ b/paddle/fluid/framework/details/exception_holder_test.cc
@@ -0,0 +1,151 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/exception_holder.h"
+#include <memory>
+#include <unordered_map>
+#include "gtest/gtest.h"
+#include "paddle/fluid/memory/allocation/allocator.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+namespace f = paddle::framework;
+namespace p = paddle::platform;
+
+TEST(ExceptionHolderTester, TestEnforceNotMetCatch) {
+  ExceptionHolder exception_holder;
+
+  try {
+    throw platform::EnforceNotMet("enforce not met test", "test_file", 0);
+  } catch (...) {
+    exception_holder.Catch(std::current_exception());
+  }
+  ASSERT_TRUE(exception_holder.IsCaught());
+  ASSERT_EQ(exception_holder.Type(), "EnforceNotMet");
+
+  bool catch_enforce_not_met = false;
+  try {
+    exception_holder.ReThrow();
+  } catch (platform::EnforceNotMet& ex) {
+    catch_enforce_not_met = true;
+  } catch (...) {
+    catch_enforce_not_met = false;
+  }
+
+  ASSERT_TRUE(catch_enforce_not_met);
+}
+
+TEST(ExceptionHolderTester, TestBadAllocCatch) {
+  ExceptionHolder exception_holder;
+
+  try {
+    throw memory::allocation::BadAlloc("bad alloc test", "test_file", 0);
+  } catch (...) {
+    exception_holder.Catch(std::current_exception());
+  }
+  ASSERT_TRUE(exception_holder.IsCaught());
+  ASSERT_EQ(exception_holder.Type(), "BadAlloc");
+
+  bool catch_bad_alloc = false;
+  try {
+    exception_holder.ReThrow();
+  } catch (memory::allocation::BadAlloc& ex) {
+    catch_bad_alloc = true;
+  } catch (...) {
+    catch_bad_alloc = false;
+  }
+
+  ASSERT_TRUE(catch_bad_alloc);
+}
+
+TEST(ExceptionHolderTester, TestBaseExpceptionCatch) {
+  ExceptionHolder exception_holder;
+
+  try {
+    throw std::exception();
+  } catch (...) {
+    exception_holder.Catch(std::current_exception());
+  }
+  ASSERT_TRUE(exception_holder.IsCaught());
+  ASSERT_EQ(exception_holder.Type(), "BaseException");
+
+  bool catch_base_exception = false;
+  try {
+    exception_holder.ReThrow();
+  } catch (std::exception& ex) {
+    catch_base_exception = true;
+  } catch (...) {
+    catch_base_exception = false;
+  }
+
+  ASSERT_TRUE(catch_base_exception);
+}
+
+TEST(ExceptionHolderTester, TestExceptionReplace) {
+  ExceptionHolder exception_holder;
+
+  try {
+    throw platform::EnforceNotMet("enforce not met test", "test_file", 0);
+  } catch (...) {
+    exception_holder.Catch(std::current_exception());
+  }
+  ASSERT_TRUE(exception_holder.IsCaught());
+  ASSERT_EQ(exception_holder.Type(), "EnforceNotMet");
+
+  try {
+    throw std::exception();
+  } catch (...) {
+    exception_holder.Catch(std::current_exception());
+  }
+  ASSERT_TRUE(exception_holder.IsCaught());
+  ASSERT_EQ(exception_holder.Type(), "EnforceNotMet");
+
+  try {
+    throw memory::allocation::BadAlloc("bad alloc test", "test_file", 0);
+  } catch (...) {
+    exception_holder.Catch(std::current_exception());
+  }
+  ASSERT_TRUE(exception_holder.IsCaught());
+  ASSERT_EQ(exception_holder.Type(), "EnforceNotMet");
+
+  try {
+    throw platform::EOFException("eof test", "test_file", 0);
+  } catch (...) {
+    exception_holder.Catch(std::current_exception());
+  }
+  ASSERT_EQ(exception_holder.Type(), "EnforceNotMet");
+
+  exception_holder.Clear();
+
+  try {
+    throw memory::allocation::BadAlloc("bad alloc test", "test_file", 0);
+  } catch (...) {
+    exception_holder.Catch(std::current_exception());
+  }
+  ASSERT_TRUE(exception_holder.IsCaught());
+  ASSERT_EQ(exception_holder.Type(), "BadAlloc");
+
+  try {
+    throw platform::EnforceNotMet("enforce not met test", "test_file", 0);
+  } catch (...) {
+    exception_holder.Catch(std::current_exception());
+  }
+  ASSERT_TRUE(exception_holder.IsCaught());
+  ASSERT_EQ(exception_holder.Type(), "BadAlloc");
+}
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
index 9d1395c0356bd38d91c7d7378888921dcf85ee5b..f5ec78f44b5ebb780cc569c24ccdca6336195961 100644
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
@@ -269,7 +269,14 @@ void FastThreadedSSAGraphExecutor::RecordOps(OpHandleBase *op) {
 void FastThreadedSSAGraphExecutor::ExecutionFinal(
     std::vector<OpHandleBase *> *fetch_ops) {
   VLOG(3) << "caught exception " << exception_.Type() << ", rethrow it";
-  ClearFetchOp(graph_, fetch_ops);
+  // NOTE: If a new exception occurs in this ClearFetchOp operation, it will
+  // cause the loss of exception triggered firstly not thrown.
+  // Instead, the cleanup operation should only be performed when an EOF
+  // exception is caught. If other exceptions are triggered, the ClearFetchOp
+  // should not be continued.
+  if (exception_.Type() == "EOF") {
+    ClearFetchOp(graph_, fetch_ops);
+  }
   exception_.ReThrow();
 }
 
diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc
index 39303447d2fbf9eca5942f032feca155a2e4000f..35fe5d631fbaad61ce64ccf70d58d176aa3d3a20 100644
--- a/paddle/fluid/framework/details/op_handle_base.cc
+++ b/paddle/fluid/framework/details/op_handle_base.cc
@@ -36,7 +36,7 @@ OpHandleBase::~OpHandleBase() PADDLE_MAY_THROW {
 #ifdef PADDLE_WITH_CUDA
   for (auto &ev : events_) {
     if (ev.second) {
-      PADDLE_ENFORCE(cudaEventDestroy(ev.second));
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(ev.second));
     }
   }
 #endif
diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h
index 2b4751691bbdd33d204c2b41a4c37a24b6aef37c..d6d53a8858030734812587f6bbd03a108c5cf8ce 100644
--- a/paddle/fluid/framework/device_worker.h
+++ b/paddle/fluid/framework/device_worker.h
@@ -51,10 +51,6 @@ bool CheckValidOutput(LoDTensor* tensor, size_t batch_size);
 
 class FleetWrapper;
 
-#define SEC_LOG                                                              \
-  VLOG(3) << "[s" << section_id_ << "p" << pipeline_id_ << "t" << thread_id_ \
-          << "]: "
-
 class PullDenseWorker {
  public:
   virtual ~PullDenseWorker() {}
@@ -311,40 +307,9 @@ class DownpourWorkerOpt : public DownpourWorker {
 };
 
 #if defined(PADDLE_WITH_NCCL)
-using ScopeQueue = operators::reader::BlockingQueue<Scope*>;
-
-class SyncFunctor {
- public:
-  SyncFunctor(int rank_id, int rank_num, int sync_steps);
-  virtual ~SyncFunctor() {}
-
-  void SetSyncParam(const std::vector<std::string>& sync_param) {
-    sync_param_ = &sync_param;
-  }
-  void SetNcclCtxMap(platform::NCCLContextMap* nccl_ctx_map) {
-    nccl_ctx_map_ = nccl_ctx_map;
-  }
-
-  int operator()(Scope* scope);
-  static std::vector<Scope*> pipeline_scopes_;
-  static uint64_t sync_flag_;
-
- protected:
-  const int rank_id_;
-  const int rank_num_;
-  const std::vector<std::string>* sync_param_ = nullptr;
-  platform::NCCLContextMap* nccl_ctx_map_ = nullptr;
-
-  uint64_t sync_signal_;
-  const int sync_steps_;
-  int counter_;
-
-  void Synchronize();
-};
-
 class SectionWorker : public DeviceWorker {
  public:
-  SectionWorker() {}
+  SectionWorker() { local_batch_id_ = 0; }
   ~SectionWorker() override {}
 
   void Initialize(const TrainerDesc& desc) override;
@@ -360,50 +325,39 @@ class SectionWorker : public DeviceWorker {
   const platform::Place& place() const { return place_; }
 
   void SetSectionIndex(int section_id) { section_id_ = section_id; }
-  void SetDeviceIndex(int tid) override { pipeline_id_ = tid; }
+  void SetDeviceIndex(int tid) override {}
   void SetThreadIndex(int thread_id) { thread_id_ = thread_id; }
-  void SetVarNames(const std::vector<std::string>& in_var_names,
-                   const std::vector<std::string>& out_var_names) {
-    in_var_names_ = &in_var_names;
-    out_var_names_ = &out_var_names;
-  }
-  void SetScopeQueue(ScopeQueue* in_scope_queue, ScopeQueue* out_scope_queue) {
-    in_scope_queue_ = in_scope_queue;
-    out_scope_queue_ = out_scope_queue;
+  void SetMicrobatchNum(int num) { num_microbatches_ = num; }
+  void SetMicrobatchScopes(const std::vector<Scope*>& scope) {
+    microbatch_scopes_ = scope;
   }
-  void SetCountMutex(std::mutex* mutex) { worker_count_mutex_ = mutex; }
-  void SetWorkerCount(int* worker_count) { worker_count_ = worker_count; }
-  void SetSectionNum(int section_num) { section_num_ = section_num; }
-  void SetPipelineNum(int pipeline_num) { pipeline_num_ = pipeline_num; }
-  void SetNextSectionPlace(const paddle::platform::Place& place) {
-    next_section_place_ = place;
+  void SetMinibatchScope(const Scope* scope) { minibatch_scope_ = scope; }
+  void SetSkipVars(const std::vector<std::string>& skip_vars) {
+    skip_vars_ = skip_vars;
   }
-  SyncFunctor* sync_func_ = nullptr;
-  void SetSyncFunctor(SyncFunctor* sync_func) { sync_func_ = sync_func; }
 
   static std::atomic<int> cpu_id_;
 
  protected:
   void AutoSetCPUAffinity(bool reuse);
   int section_id_;
-  int pipeline_id_;
-  int section_num_;
-  int pipeline_num_;
   int thread_id_;
-  // This worker will consume scope from in_scope_queue_
-  // and produce scope to out_scope_queue_
-  ScopeQueue* in_scope_queue_ = nullptr;
-  ScopeQueue* out_scope_queue_ = nullptr;
-  const std::vector<std::string>* in_var_names_ = nullptr;
-  const std::vector<std::string>* out_var_names_ = nullptr;
-  std::mutex* worker_count_mutex_ = nullptr;
-  int* worker_count_ = nullptr;
-  paddle::platform::Place next_section_place_;
+  int num_microbatches_;
+  std::vector<Scope*> microbatch_scopes_;
+  std::vector<std::string> skip_vars_;
+  const Scope* minibatch_scope_;
 
   std::vector<std::unique_ptr<OperatorBase>> ops_;
+  static std::mutex thread_mutex;
+  static std::condition_variable thread_condition;
+  static bool threads_completed;
+  std::shared_ptr<framework::ProgramDesc> program_;
+  static uint64_t batch_id_;
+  uint64_t local_batch_id_;
 
   platform::DeviceContext* dev_ctx_ = nullptr;
 };
 #endif
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto
new file mode 100644
index 0000000000000000000000000000000000000000..9bcd79cd34f07cb38ea28e1068bb6045cb82d27a
--- /dev/null
+++ b/paddle/fluid/framework/distributed_strategy.proto
@@ -0,0 +1,87 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+syntax = "proto2";
+package paddle.fleet;
+
+enum Mode {
+  COLLECTIVE = 1;
+  PS = 2;
+  PIPELINE = 3;
+  HETER = 4; // support XPU and GPU computing server
+}
+
+message DistributedStrategy {
+  optional Mode mode = 1 [ default = COLLECTIVE ]; // just for serialization
+  // collective training strategy
+  optional bool amp = 2 [ default = false ];
+  optional int32 amp_loss_scaling = 3 [ default = 32768 ];
+  optional bool recompute = 4 [ default = false ];
+  repeated string recompute_checkpoints = 5;
+  optional bool localsgd = 6 [ default = false ];
+  optional int32 localsgd_k_step = 7 [ default = 4 ];
+  optional bool dgc = 8 [ default = false ];
+  optional bool hierachical_allreduce = 9 [ default = false ];
+  optional int32 nccl_comm_num = 10 [ default = 1 ];
+  optional bool gradient_merge = 11 [ default = false ];
+  optional int32 gradient_merge_k_step = 12 [ default = 1 ];
+  optional bool sequential_execution = 13 [ default = false ];
+  optional bool enable_backward_optimizer_op_deps = 14 [ default = true ];
+  optional bool lars = 15 [ default = false ];
+  optional bool lamb = 16 [ default = false ];
+  optional bool fuse_elewise_add_act_ops = 17 [ default = false ];
+  optional bool fuse_bn_act_ops = 18 [ default = false ];
+  optional bool enable_auto_fusion = 19 [ default = false ];
+  optional bool fuse_relu_depthwise_conv = 20 [ default = false ];
+  optional bool enable_inplace = 21 [ default = false ];
+  optional bool fuse_all_reduce_ops = 22 [ default = false ];
+  optional int32 num_iteration_per_drop_scope = 23 [ default = 1 ];
+  optional bool sync_batch_norm = 24 [ default = false ];
+  optional bool fuse_all_optimizer_ops = 25 [ default = false ];
+
+  // pipeline training
+  optional bool pipeline = 101 [ default = false ];
+  optional int32 pipeline_micro_batch = 102;
+
+  // parameter server training
+  optional bool sync = 201 [ default = false ];
+  optional bool async = 202 [ default = true ];
+  optional int32 async_k_step = 203 [ default = -1 ];
+  optional int32 max_merge_var_num = 204 [ default = 1 ];
+  optional int32 send_queue_size = 205 [ default = 16 ];
+  optional bool independent_recv_thread = 206 [ default = false ];
+  optional int32 min_send_grad_num_before_recv = 207 [ default = 1 ];
+  optional int32 thread_pool_size = 208 [ default = 1 ];
+  optional int32 send_wait_times = 209 [ default = 1 ];
+  optional bool runtime_split_send_recv = 210 [ default = false ];
+  optional bool use_thread_barrier = 211 [ default = false ];
+
+  // elastic deep learning strategies
+  optional bool elastic = 301 [ default = false ];
+
+  // auto parallel
+  optional bool auto = 401 [ default = false ];
+}
+
+message DistributedJobInfo {
+  optional int32 worker_num = 1;
+  optional int32 server_num = 2;
+  repeated string worker_ips = 3;
+  repeated string server_endpoints = 4;
+  optional string origin_startup = 5;
+  optional string origin_main = 6; // without backpropagation and optimization
+  optional string distributed_main = 7; // with backpropagation and optimization
+  optional string optimizer_name = 8;   // optimizer name
+  optional DistributedStrategy strategy = 101;
+}
diff --git a/paddle/fluid/framework/fleet/box_wrapper.cu b/paddle/fluid/framework/fleet/box_wrapper.cu
index c315abd737c9bd42106f27b0ba11fece8163820d..31809532a69760c7398e19572694c03b8a1ae67e 100644
--- a/paddle/fluid/framework/fleet/box_wrapper.cu
+++ b/paddle/fluid/framework/fleet/box_wrapper.cu
@@ -23,9 +23,6 @@
 
 namespace paddle {
 namespace framework {
-#define CUDA_KERNEL_LOOP(i, n)                                 \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
 
 template <size_t EMBEDX_DIM, size_t EXPAND_EMBED_DIM>
 __global__ void PullCopy(
diff --git a/paddle/fluid/framework/io/crypto/aes_cipher_test.cc b/paddle/fluid/framework/io/crypto/aes_cipher_test.cc
index 3ba55f4f45561a80d712ae064f976c270be340dd..393c1bffdd0d5778fe5c33b2bc98a9d8a78596b0 100644
--- a/paddle/fluid/framework/io/crypto/aes_cipher_test.cc
+++ b/paddle/fluid/framework/io/crypto/aes_cipher_test.cc
@@ -18,9 +18,9 @@ limitations under the License. */
 #include <glog/logging.h>
 #include <gtest/gtest.h>
 
+#include <fstream>
 #include <string>
 #include <vector>
-
 #include "paddle/fluid/framework/io/crypto/cipher.h"
 #include "paddle/fluid/framework/io/crypto/cipher_utils.h"
 
diff --git a/paddle/fluid/framework/io/crypto/cipher.cc b/paddle/fluid/framework/io/crypto/cipher.cc
index eca175c020cb6f85eac2970aa9734c0a6850ebef..c258028e25066d14820017edaaa103b39c57158d 100644
--- a/paddle/fluid/framework/io/crypto/cipher.cc
+++ b/paddle/fluid/framework/io/crypto/cipher.cc
@@ -16,7 +16,9 @@
 #include "paddle/fluid/framework/io/crypto/aes_cipher.h"
 #include "paddle/fluid/framework/io/crypto/cipher_utils.h"
 #include "paddle/fluid/platform/enforce.h"
-
+#ifdef ON_INFER
+#include "paddle/fluid/inference/api/paddle_api.h"
+#endif
 namespace paddle {
 namespace framework {
 
@@ -57,4 +59,9 @@ std::shared_ptr<Cipher> CipherFactory::CreateCipher(
 }
 
 }  // namespace framework
+#ifdef ON_INFER
+std::shared_ptr<framework::Cipher> MakeCipher(const std::string& config_file) {
+  return framework::CipherFactory::CreateCipher(config_file);
+}
+#endif
 }  // namespace paddle
diff --git a/paddle/fluid/framework/io/crypto/cipher.h b/paddle/fluid/framework/io/crypto/cipher.h
index 9072cb1180d5c579b9cca9ebee3dbe810ebea2cf..fc31653c2402eaafc8f28697bcec805d694a2a07 100644
--- a/paddle/fluid/framework/io/crypto/cipher.h
+++ b/paddle/fluid/framework/io/crypto/cipher.h
@@ -46,6 +46,5 @@ class CipherFactory {
   CipherFactory() = default;
   static std::shared_ptr<Cipher> CreateCipher(const std::string& config_file);
 };
-
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/io/crypto/cipher_utils.h b/paddle/fluid/framework/io/crypto/cipher_utils.h
index 0533275798f3bf0870fdb0989c96852fffc07628..936f62f6ba65cfe26be56ed2dd828f1e42e8d1b2 100644
--- a/paddle/fluid/framework/io/crypto/cipher_utils.h
+++ b/paddle/fluid/framework/io/crypto/cipher_utils.h
@@ -14,11 +14,9 @@
 
 #pragma once
 
+#include <sstream>
 #include <string>
 #include <unordered_map>
-
-#include "paddle/fluid/platform/enforce.h"
-
 namespace paddle {
 namespace framework {
 
diff --git a/paddle/fluid/framework/io/shell.cc b/paddle/fluid/framework/io/shell.cc
index 995d19387ba6b367628198a13930b10c7385f318..937f053bf848cc29261fbac4708e636653803eb4 100644
--- a/paddle/fluid/framework/io/shell.cc
+++ b/paddle/fluid/framework/io/shell.cc
@@ -21,7 +21,7 @@ namespace framework {
 
 std::shared_ptr<FILE> shell_fopen(const std::string& path,
                                   const std::string& mode) {
-#if defined _WIN32 || defined __APPLE__ || defined PADDLE_ARM
+#if defined(_WIN32) || defined(__APPLE__) || defined(PADDLE_ARM)
   return nullptr;
 #else
   if (shell_verbose()) {
@@ -48,7 +48,7 @@ std::shared_ptr<FILE> shell_fopen(const std::string& path,
 // The implementation is async signal safe
 // Mostly copy from CPython code
 static int close_open_fds_internal() {
-#if defined _WIN32 || defined __APPLE__ || defined PADDLE_ARM
+#if defined(_WIN32) || defined(__APPLE__) || defined(PADDLE_ARM)
   return 0;
 #else
   struct linux_dirent {
@@ -103,8 +103,9 @@ static int close_open_fds_internal() {
 }
 
 static int shell_popen_fork_internal(const char* real_cmd, bool do_read,
-                                     int parent_end, int child_end) {
-#if defined _WIN32 || defined __APPLE__
+                                     int parent_end, int child_end,
+                                     bool redirect_stderr = false) {
+#if defined(_WIN32) || defined(__APPLE__) || defined(PADDLE_ARM)
   return 0;
 #else
   int child_pid = -1;
@@ -125,18 +126,41 @@ static int shell_popen_fork_internal(const char* real_cmd, bool do_read,
 
   if (child_end != child_std_end) {
     PCHECK(dup2(child_end, child_std_end) == child_std_end);
+    if (redirect_stderr && do_read) {
+      PCHECK(dup2(child_end, 2) == 2);
+    }
     close(child_end);
   }
 
   close_open_fds_internal();
   PCHECK(execl("/bin/bash", "bash", "-c", real_cmd, NULL) >= 0);
-  exit(127);
+  // Note: just for compilation. the child don't run this line.
+  _exit(0);
 #endif
 }
 
+static int read_from_pipe(FILE* fp, std::string* output) {
+  char buf[4096];
+  while (1) {
+    int n = fread(buf, 1, 4096, fp);
+    if (n <= 0) {
+      break;
+    }
+
+    output->append(buf, n);
+  }
+
+  if (!feof(fp)) {
+    return -1;
+  }
+
+  return 0;
+}
+
 std::shared_ptr<FILE> shell_popen(const std::string& cmd,
-                                  const std::string& mode, int* err_no) {
-#if defined _WIN32 || defined __APPLE__
+                                  const std::string& mode, int* err_no,
+                                  int* status, bool redirect_stderr) {
+#if defined(_WIN32) || defined(__APPLE__) || defined(PADDLE_ARM)
   return nullptr;
 #else
   bool do_read = mode == "r";
@@ -146,9 +170,7 @@ std::shared_ptr<FILE> shell_popen(const std::string& cmd,
     return NULL;
   }
 
-  if (shell_verbose()) {
-    LOG(INFO) << "Opening pipe[" << cmd << "] with mode[" << mode << "]";
-  }
+  VLOG(3) << "Opening pipe[" << cmd << "] with mode[" << mode << "]";
 
   std::string real_cmd = "set -o pipefail; " + cmd;
 
@@ -168,43 +190,54 @@ std::shared_ptr<FILE> shell_popen(const std::string& cmd,
     child_end = pipe_fds[0];
   }
 
-  int child_pid = shell_popen_fork_internal(real_cmd.c_str(), do_read,
-                                            parent_end, child_end);
-  close(child_end);
+  sighandler_t old_handler;
+  old_handler = signal(SIGCHLD, SIG_DFL);
+
   fcntl(parent_end, F_SETFD, FD_CLOEXEC);
-  FILE* fp;
+
+  int child_pid = shell_popen_fork_internal(
+      real_cmd.c_str(), do_read, parent_end, child_end, redirect_stderr);
+
+  close(child_end);
+
+  FILE* fp = NULL;
   if ((fp = fdopen(parent_end, mode.c_str())) == NULL) {
     *err_no = -1;
+    signal(SIGCHLD, old_handler);
     return NULL;
   }
-  return {fp, [child_pid, cmd, err_no](FILE* fp) {
-            if (shell_verbose()) {
-              LOG(INFO) << "Closing pipe[" << cmd << "]";
-            }
 
-            if (fclose(fp) != 0) {
+  return {fp, [cmd, child_pid, old_handler, err_no, status](FILE* fp) {
+            VLOG(3) << "Closing pipe[" << cmd << "]";
+            if (fclose(fp)) {
               *err_no = -1;
             }
+
             int wstatus = -1;
+            // don't do this before parent read data from child pipe
+            // or when get the large data, it will hang!
             waitpid(child_pid, &wstatus, 0);
-            if (wstatus == 0 || wstatus == (128 + SIGPIPE) * 256 ||
-                (wstatus == -1 && errno == ECHILD)) {
+
+            if (status) {
+              *status = wstatus;
+            }
+
+            if (WIFEXITED(wstatus) || wstatus == (128 + SIGPIPE) * 256) {
             } else {
+              PADDLE_ENFORCE_NE(
+                  errno, ECHILD,
+                  platform::errors::Fatal("Must not be ECHILD errno here!"));
               *err_no = -1;
-              LOG(WARNING) << "status[" << wstatus << "], cmd[" << cmd << "]"
-                           << ", err_no[" << *err_no << "]";
-            }
-            if (wstatus == -1 && errno == ECHILD) {
-              // temporarily remove this warning
-              // LOG(WARNING) << "errno is ECHILD";
             }
+
+            signal(SIGCHLD, old_handler);
           }};
 #endif
 }
 
 static int shell_p2open_fork_internal(const char* real_cmd, int pipein_fds[2],
                                       int pipeout_fds[2]) {
-#if defined _WIN32 || defined __APPLE__
+#if defined(_WIN32) || defined(__APPLE__) || defined(PADDLE_ARM)
   return 0;
 #else
   int child_pid = -1;
@@ -243,7 +276,7 @@ static int shell_p2open_fork_internal(const char* real_cmd, int pipein_fds[2],
 
 std::pair<std::shared_ptr<FILE>, std::shared_ptr<FILE>> shell_p2open(
     const std::string& cmd) {
-#if defined _WIN32 || defined __APPLE__
+#if defined(_WIN32) || defined(__APPLE__) || defined(PADDLE_ARM)
   return {};
 #else
   if (shell_verbose()) {
@@ -301,51 +334,102 @@ std::pair<std::shared_ptr<FILE>, std::shared_ptr<FILE>> shell_p2open(
 #endif
 }
 
-std::string shell_get_command_output(const std::string& cmd, int time_out,
-                                     int sleep_inter, bool print_cmd) {
-#if defined _WIN32 || defined __APPLE__
+#if defined(_WIN32) || defined(__APPLE__) || defined(PADDLE_ARM)
+#else
+static int _get_err_no(int err_no, int status) {
+  if (err_no == 0) {
+    if (WIFEXITED(status)) {
+      return WEXITSTATUS(status);
+    }
+    return -1;
+  }
+
+  return err_no;
+}
+#endif
+
+static int _shell_execute_cmd(const std::string& cmd, std::string* output,
+                              int time_out, int sleep_inter,
+                              bool redirect_stderr = false) {
+#if defined(_WIN32) || defined(__APPLE__) || defined(PADDLE_ARM)
   PADDLE_THROW(platform::errors::Unimplemented(
       "This function(shell_get_command_output) is not implemented under _WIN32 "
       "or __APPLE__."));
 #else
   int err_no = 0;
+  int status = 0;
+  int cmd_status = 0;
   platform::Timer timer;
   do {
-    if (print_cmd) {
-      LOG(INFO) << "exec cmd:[" << cmd << "]";
-    }
+    VLOG(3) << "exec cmd:[" << cmd << "]";
+
     err_no = 0;
-    std::shared_ptr<FILE> pipe = shell_popen(cmd, "r", &err_no);
-    string::LineFileReader reader;
+    status = 0;
+    *output = "";
+    auto pipe = shell_popen(cmd, "r", &err_no, &status, redirect_stderr);
 
-    char* buf = reader.getdelim(&*pipe, 0);
     if (err_no == 0) {
-      if (buf) {
-        return reader.get();
+      // read file
+      err_no = read_from_pipe(&*pipe, output);
+      if (err_no) {
+        LOG(WARNING) << "status[" << status << "], cmd[" << cmd << "]"
+                     << ", err_no[" << err_no << "]";
       }
-      return "";
     }
 
-    if (sleep_inter > 0) {
-      usleep(sleep_inter);
+    // close file and etc.
+    pipe = nullptr;
+    if (err_no) {
+      LOG(WARNING) << "status[" << status << "], cmd[" << cmd << "]"
+                   << ", err_no[" << err_no << "]";
+    }
+
+    cmd_status = _get_err_no(err_no, status);
+    // cmd run ok!
+    if (cmd_status == 0) {
+      return cmd_status;
     }
 
+    // time out
     timer.Pause();
-    if (time_out > 0 && timer.ElapsedMS() >= time_out) {
-      PADDLE_THROW(paddle::platform::errors::ExecutionTimeout(
-          "shell_get_command_output execute  error errno:%d and try until "
-          "timeout.",
-          errno));
-      return "";
+    if ((time_out > 0 && timer.ElapsedMS() >= time_out) || time_out == 0) {
+      break;
     }
     timer.Resume();
 
-    pipe = nullptr;
-  } while (err_no);
+    if (sleep_inter > 0) {
+      usleep(sleep_inter * 1000);
+    }
+  } while (cmd_status);
+
+  // log when check timeout!
+  if (time_out != 0) {
+    *output += string::Sprintf(
+        " _shell_execute_cmd execute cmd:%s ElapsedMS:%d, err_no:%d status:%d",
+        cmd, timer.ElapsedMS(), err_no, cmd_status);
+    LOG(WARNING) << *output;
+  }
+
+  return cmd_status;
 
-  return "";
 #endif
 }
 
+std::string shell_get_command_output(const std::string& cmd, int time_out,
+                                     int sleep_inter) {
+  std::string output;
+  _shell_execute_cmd(cmd, &output, time_out, sleep_inter);
+  return output;
+}
+
+std::vector<std::string> shell_execute_cmd(const std::string& cmd, int time_out,
+                                           int sleep_inter,
+                                           bool redirect_stderr) {
+  std::string output;
+  int ret =
+      _shell_execute_cmd(cmd, &output, time_out, sleep_inter, redirect_stderr);
+  return std::vector<std::string>({string::Sprintf("%d", ret), output});
+}
+
 }  // end namespace framework
 }  // end namespace paddle
diff --git a/paddle/fluid/framework/io/shell.h b/paddle/fluid/framework/io/shell.h
index 194b1c0edafc328a43fe6f733af1b76986cea38c..5b3e9a4df1d11b957d656181844f17a06574556f 100644
--- a/paddle/fluid/framework/io/shell.h
+++ b/paddle/fluid/framework/io/shell.h
@@ -28,6 +28,7 @@
 #include <memory>
 #include <string>
 #include <utility>
+#include <vector>
 #include "paddle/fluid/platform/port.h"
 #include "paddle/fluid/string/string_helper.h"
 
@@ -51,8 +52,10 @@ inline void shell_set_verbose(bool x) { shell_verbose_internal() = x; }
 extern std::shared_ptr<FILE> shell_fopen(const std::string& path,
                                          const std::string& mode);
 
-extern std::shared_ptr<FILE> shell_popen(const std::string& cmd,
-                                         const std::string& mode, int* err_no);
+std::shared_ptr<FILE> shell_popen(const std::string& cmd,
+                                  const std::string& mode, int* err_no,
+                                  int* status = NULL,
+                                  bool redirect_stderr = false);
 
 extern std::pair<std::shared_ptr<FILE>, std::shared_ptr<FILE>> shell_p2open(
     const std::string& cmd);
@@ -65,12 +68,17 @@ inline void shell_execute(const std::string& cmd) {
   } while (err_no == -1);
 }
 
-// timeout:ms, default -1 means forever.
+// time_out:ms, default value:-1 means forever.
 // sleep_inter:ms, default -1 means not sleep.
 extern std::string shell_get_command_output(const std::string& cmd,
-                                            int time_out = -1,
-                                            int sleep_inter = -1,
-                                            bool print_cmd = false);
+                                            int time_out = 10 * 60 * 1000,
+                                            int sleep_inter = 1000);
+// time_out:ms, default -1 means forever.
+// sleep_inter:ms, default -1 means not sleep.
+extern std::vector<std::string> shell_execute_cmd(const std::string& cmd,
+                                                  int time_out = 0,
+                                                  int sleep_inter = 0,
+                                                  bool redirect_stderr = false);
 
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
index a56fcd1a523391ce801bb2b8c3e9dfa424abdd54..a4b43086785b3fbc7acc82ac8b6952cae2bc7c11 100644
--- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
@@ -135,7 +135,9 @@ void PrepareLSTMBias(const LoDTensor& B_forget, const LoDTensor& B_input,
 
 void PrepareParameters(Graph* graph, const Param& param, ir::Node* lstm_op) {
   // Check parameters
-  PADDLE_ENFORCE(graph->Has(kParamScopeAttr));
+  PADDLE_ENFORCE_EQ(graph->Has(kParamScopeAttr), true,
+                    platform::errors::InvalidArgument(
+                        "Graph have no attribute: kParamScopeAttr."));
   auto& scope = graph->Get<Scope>(kParamScopeAttr);
 
   // Create new parameters.
@@ -193,7 +195,10 @@ void PrepareParameters(Graph* graph, const Param& param, ir::Node* lstm_op) {
   // reshape attention_bias
   auto* attention_bias_t =
       scope.FindVar(param.AttentionBias)->GetMutable<LoDTensor>();
-  PADDLE_ENFORCE_EQ(attention_bias_t->dims().size(), 1);
+  PADDLE_ENFORCE_EQ(attention_bias_t->dims().size(), 1,
+                    platform::errors::InvalidArgument(
+                        "Tensor attention bias dimension size(%d) must be 1.",
+                        attention_bias_t->dims().size()));
   attention_bias_t->Resize(make_ddim({1, attention_bias_t->dims()[0]}));
 
   auto* attention_scalar_bias_t =
@@ -252,7 +257,10 @@ void PrepareLSTMBias(const LoDTensor& B_forget, const LoDTensor& B_input,
       B_forget.data<float>(), B_input.data<float>(), B_output.data<float>(),
       B_cell.data<float>()};
 
-  PADDLE_ENFORCE_EQ(B_forget.dims().size(), 1);
+  PADDLE_ENFORCE_EQ(B_forget.dims().size(), 1,
+                    platform::errors::InvalidArgument(
+                        "Tensor B forget dimension size(%d) must be 1.",
+                        B_forget.dims().size()));
   int D = B_forget.dims()[0];
   out->Resize(make_ddim({1, 4 * D}));
   auto* out_data = out->mutable_data<float>(platform::CPUPlace());
diff --git a/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc b/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc
index d7faf2ee648336982a6d0f3711298527a780f0b2..f3634f90e6c6984f494d0f571d0b11ecc713696d 100644
--- a/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc
+++ b/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc
@@ -119,9 +119,11 @@ class CoalesceGradTensorPass : public ir::Pass {
       p_g_dense_grad.insert(p_g_dense_grad.end(), group_p_g.begin(),
                             group_p_g.end());
     }
-    PADDLE_ENFORCE_EQ(
-        p_g_dense_grad.size(), num_of_p_g_dense_grad,
-        "The number of p_g_dense_grad is not consistent with before.");
+    PADDLE_ENFORCE_EQ(p_g_dense_grad.size(), num_of_p_g_dense_grad,
+                      platform::errors::InvalidArgument(
+                          "The number of dense grads is not consistent with "
+                          "previous. Previous(%d), now(%d).",
+                          p_g_dense_grad.size(), num_of_p_g_dense_grad));
 
     auto &pinned_var_set =
         graph->GetOrInit<details::PinnedVars>(details::kPinnedVars);
@@ -131,8 +133,11 @@ class CoalesceGradTensorPass : public ir::Pass {
     } else {
       for (auto &sub_param_grad : group_params_grads) {
         RecordGradients(p_g_dense_grad, vars_info, &pinned_var_set);
-        PADDLE_ENFORCE_EQ(IsUnifiedDtype(sub_param_grad, vars_info), true,
-                          "The data type of the same group is not consistent.");
+        PADDLE_ENFORCE_EQ(
+            IsUnifiedDtype(sub_param_grad, vars_info), true,
+            platform::errors::InvalidArgument("All gradient variable in "
+                                              "kGroupParamsAndDenseGrads, must "
+                                              "have same type."));
         CoalesceTensors(vars_info, sub_param_grad, &result);
       }
     }
@@ -145,15 +150,25 @@ class CoalesceGradTensorPass : public ir::Pass {
     // The Gradients should not be reused during memory optimization.
     for (auto &p_g : sub_param_grad) {
       auto iter = vars_info.find(p_g.second);
-      PADDLE_ENFORCE_EQ(iter != vars_info.end(), true, "%s is not found.",
-                        p_g.second);
-      PADDLE_ENFORCE_EQ(!iter->second.empty(), true);
+      PADDLE_ENFORCE_EQ(iter != vars_info.end(), true,
+                        platform::errors::NotFound(
+                            "Parameter@Grad %s is not found.", p_g.second));
+      PADDLE_ENFORCE_EQ(
+          !iter->second.empty(), true,
+          platform::errors::InvalidArgument(
+              "Parameter@Grad %s's var node is empty.", p_g.second));
       for (auto it : iter->second) {
-        PADDLE_ENFORCE_NOT_NULL(it->Var());
+        PADDLE_ENFORCE_NOT_NULL(
+            it->Var(),
+            platform::errors::InvalidArgument(
+                "A node of Parameter@Grad %s does not hold variable.",
+                p_g.second));
         pinned_var_set->insert(it->Var()->Name());
       }
       PADDLE_ENFORCE_EQ(IsLoDTensorType(GetTypeOfVar(vars_info, p_g.second)),
-                        true);
+                        true,
+                        platform::errors::InvalidArgument(
+                            "Parameter@Grad %s is not LoDTensor.", p_g.second));
     }
   }
 
@@ -192,8 +207,10 @@ class CoalesceGradTensorPass : public ir::Pass {
     auto fused_grad_var_name = std::string(details::kFusedVarNamePrefix) +
                                "@GRAD@" + params_grads.begin()->second;
     auto &fused_var_set = result->Get<details::FusedVars>(details::kFusedVars);
-    PADDLE_ENFORCE_EQ(fused_var_set.count(fused_grad_var_name), 0,
-                      "%s is duplicate in FusedVars.", fused_grad_var_name);
+    PADDLE_ENFORCE_EQ(
+        fused_var_set.count(fused_grad_var_name), 0,
+        platform::errors::AlreadyExists("Var(%s) is duplicate in FusedVars.",
+                                        fused_grad_var_name));
     fused_var_set.insert(fused_grad_var_name);
     result->Get<details::FusedGrads>(details::kFusedGrads)
         .emplace_back(fused_grad_var_name);
@@ -420,11 +437,16 @@ class CoalesceGradTensorPass : public ir::Pass {
       const std::unordered_map<std::string, std::vector<Node *>> &vars_info,
       const std::string &var_name) const {
     auto grad_iter = vars_info.find(var_name);
-    PADDLE_ENFORCE_EQ(grad_iter != vars_info.end(), true, "%s is not found.",
-                      var_name);
-    PADDLE_ENFORCE_EQ(!grad_iter->second.empty(), true, "%s is not found.",
-                      var_name);
-    PADDLE_ENFORCE_NOT_NULL(grad_iter->second.front()->Var());
+    PADDLE_ENFORCE_EQ(
+        grad_iter != vars_info.end(), true,
+        platform::errors::NotFound("Variable %s is not found.", var_name));
+    PADDLE_ENFORCE_EQ(!grad_iter->second.empty(), true,
+                      platform::errors::InvalidArgument(
+                          "Variable %s's node is empty.", var_name));
+    PADDLE_ENFORCE_NOT_NULL(
+        grad_iter->second.front()->Var(),
+        platform::errors::InvalidArgument(
+            "A node of %s does not hold variable.", var_name));
     return grad_iter->second.front()->Var();
   }
 
@@ -464,7 +486,12 @@ class CoalesceGradTensorPass : public ir::Pass {
       params_name.emplace_back(p_g.first);
       grads_name.emplace_back(p_g.second);
       auto next_dtype = GetDtypeOfVar(vars_info, p_g.second);
-      PADDLE_ENFORCE_EQ(next_dtype, dtype);
+      PADDLE_ENFORCE_EQ(
+          next_dtype, dtype,
+          platform::errors::InvalidArgument(
+              "All Parameter@Grad should have same dtype, but "
+              "there are two different type: %s, %s.",
+              DataTypeToString(next_dtype), DataTypeToString(dtype)));
     }
 
     result->Get<details::ProgramDescs>(details::kProgramDescs).emplace_back();
diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc
index fecc159adef1992a90b6ee88b3b7ffceea116243..079fb1479861ca0840b47470339f2f7a5b6bffa8 100644
--- a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc
@@ -50,7 +50,12 @@ void recompute_bias_and_weights(const Scope* scope, ir::Node* conv_weight,
       Eigen::Array<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
 
   // Re-compute bias of conv2d from AffineChannel
-  PADDLE_ENFORCE_EQ(eltwise_y_in_tensor->dims(), ac_bias_tensor.dims());
+  PADDLE_ENFORCE_EQ(
+      eltwise_y_in_tensor->dims(), ac_bias_tensor.dims(),
+      platform::errors::InvalidArgument(
+          "Tensor elementwise y(%d) and activation bias(%d) must have same "
+          "dimension.",
+          eltwise_y_in_tensor->dims().size(), ac_bias_tensor.dims().size()));
 
   auto* scale_tensor = scope->FindVar(ac_scale.Name())->GetMutable<LoDTensor>();
 
@@ -78,11 +83,13 @@ void recompute_bias_and_weights(const Scope* scope, ir::Node* conv_weight,
 }
 
 void ConvAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const {
-  PADDLE_ENFORCE(graph);
+  PADDLE_ENFORCE_NOT_NULL(
+      graph, platform::errors::InvalidArgument("Graph cannot be nullptr."));
   FusePassBase::Init(name_scope_, graph);
 
   auto* scope = param_scope();
-  PADDLE_ENFORCE(scope);
+  PADDLE_ENFORCE_NOT_NULL(
+      scope, platform::errors::InvalidArgument("Scope cannot be nullptr."));
 
   GraphPatternDetector gpd;
   auto* conv_input =
@@ -152,11 +159,13 @@ void ConvAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const {
 }
 
 void ConvEltwiseAddAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const {
-  PADDLE_ENFORCE(graph);
+  PADDLE_ENFORCE_NOT_NULL(
+      graph, platform::errors::InvalidArgument("Graph cannot be nullptr."));
   FusePassBase::Init(name_scope_, graph);
 
   auto* scope = param_scope();
-  PADDLE_ENFORCE(scope);
+  PADDLE_ENFORCE_NOT_NULL(
+      scope, platform::errors::InvalidArgument("Scope cannot be nullptr."));
 
   GraphPatternDetector gpd;
   auto* conv_input =
diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
index 7313ef2cc35dd7c386c11252def211db34d665ad..60e4ac8cbcfd8cc8f1d14363538fe1e118b953cd 100644
--- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc
@@ -61,7 +61,12 @@ void recompute_bias_and_weights(const Scope* scope,
       Eigen::Array<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
 
   // Re-compute bias of conv2d from BN
-  PADDLE_ENFORCE_EQ(eltwise_y_in_tensor->dims(), bn_bias_tensor.dims());
+  PADDLE_ENFORCE_EQ(
+      eltwise_y_in_tensor->dims(), bn_bias_tensor.dims(),
+      platform::errors::InvalidArgument("Tensor elementwise y(%d) and batch "
+                                        "norm bias(%d) must have same dims.",
+                                        eltwise_y_in_tensor->dims().size(),
+                                        bn_bias_tensor.dims().size()));
 
   auto* scale_tensor = scope->FindVar(bn_scale.Name())->GetMutable<LoDTensor>();
   auto* variance_tensor =
@@ -116,11 +121,13 @@ void recompute_bias_and_weights(const Scope* scope,
 }
 
 void ConvBNFusePass::ApplyImpl(ir::Graph* graph) const {
-  PADDLE_ENFORCE(graph);
+  PADDLE_ENFORCE_NOT_NULL(
+      graph, platform::errors::InvalidArgument("Graph cannot be nullptr."));
   FusePassBase::Init(name_scope_, graph);
 
   auto* scope = param_scope();
-  PADDLE_ENFORCE(scope);
+  PADDLE_ENFORCE_NOT_NULL(
+      scope, platform::errors::InvalidArgument("Scope cannot be nullptr."));
 
   GraphPatternDetector gpd;
   auto* conv_input =
@@ -186,11 +193,18 @@ void ConvBNFusePass::ApplyImpl(ir::Graph* graph) const {
       if (has_bias && conv->Op()->Input("Bias").size() > 0) {
         // reuse existing conv bias node
         auto conv_bias_names = conv->Op()->Input("Bias");
-        PADDLE_ENFORCE_EQ(conv_bias_names.size(), 1UL);
+        PADDLE_ENFORCE_EQ(
+            conv_bias_names.size(), 1UL,
+            platform::errors::InvalidArgument("Find input var Bais error."));
         auto* conv_bias_var = scope->FindVar(conv_bias_names[0]);
         auto* conv_bias_tensor = conv_bias_var->GetMutable<LoDTensor>();
-        PADDLE_ENFORCE_EQ(conv_bias_tensor->dims(),
-                          eltwise_y_in_tensor->dims());
+        PADDLE_ENFORCE_EQ(
+            conv_bias_tensor->dims(), eltwise_y_in_tensor->dims(),
+            platform::errors::InvalidArgument(
+                "Tensor convolution bias(%d) and elementwise y(%d) "
+                "must have same dims.",
+                conv_bias_tensor->dims().size(),
+                eltwise_y_in_tensor->dims().size()));
 
         auto eigen_conv_bias = EigenVector<float>::From(*conv_bias_tensor);
         eigen_conv_bias += EigenVector<float>::From(*eltwise_y_in_tensor);
@@ -236,11 +250,13 @@ void ConvBNFusePass::ApplyImpl(ir::Graph* graph) const {
 }
 
 void ConvEltwiseAddBNFusePass::ApplyImpl(ir::Graph* graph) const {
-  PADDLE_ENFORCE(graph);
+  PADDLE_ENFORCE_NOT_NULL(
+      graph, platform::errors::InvalidArgument("Graph cannot be nullptr."));
   FusePassBase::Init(name_scope_, graph);
 
   auto* scope = param_scope();
-  PADDLE_ENFORCE(scope);
+  PADDLE_ENFORCE_NOT_NULL(
+      scope, platform::errors::InvalidArgument("Scope cannot be nullptr."));
 
   GraphPatternDetector gpd;
   auto* conv_input =
diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/conv_bn_fuse_pass_tester.cc
index 168d0afb26d98626296bd6df9e151e6ad5aaa5dd..74dd6a7cdc5a64087e57b21bf175c983bea77a9d 100644
--- a/paddle/fluid/framework/ir/conv_bn_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass_tester.cc
@@ -71,8 +71,16 @@ void TestMain(const std::string& conv_type) {
   int num_bn_nodes_after = GetNumOpNodes(graph, "batch_norm");
   VLOG(3) << DebugString(graph);
 
-  PADDLE_ENFORCE_EQ(num_bn_nodes_before, 1);
-  PADDLE_ENFORCE_EQ(num_bn_nodes_after, 0);
+  PADDLE_ENFORCE_EQ(
+      num_bn_nodes_before, 1,
+      platform::errors::InvalidArgument(
+          "Before conv_bn_fuse_pass, number of batch norm op(%d) must be 1.",
+          num_bn_nodes_before));
+  PADDLE_ENFORCE_EQ(
+      num_bn_nodes_after, 0,
+      platform::errors::InvalidArgument(
+          "After conv_bn_fuse_pass, number of batch norm op(%d) must be 0.",
+          num_bn_nodes_after));
 }
 
 TEST(ConvBNFusePass, conv2d) { TestMain("conv"); }
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc
index b00be79a2a7da9c71084df5a9cacd8b7b7034950..2627da7dc40f19a9df22d2f44a4b1032df5cea01 100644
--- a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc
@@ -91,7 +91,9 @@ void ConvElementwiseAdd2ActFusePass::ApplyImpl(ir::Graph* graph) const {
     auto* new_conv_op = graph->CreateOpNode(&new_op_desc);
 
     // Link inputs and outputs.
-    PADDLE_ENFORCE(subgraph.count(x));
+    PADDLE_ENFORCE_NE(
+        subgraph.count(x), 0,
+        platform::errors::NotFound("Detector did not find input x of conv2d."));
     auto* conv_in_node = subgraph.at(x);
 
     IR_NODE_LINK_TO(conv_in_node, new_conv_op);            // Input
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc
index b15871ef03fbb3834160b0e118ecded6b568e1ca..0b454a0407e48fcf2693975b00c60ee5448786e4 100644
--- a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc
@@ -78,7 +78,9 @@ void ConvElementwiseAddActFusePass::ApplyImpl(ir::Graph* graph) const {
     auto* new_conv_op = graph->CreateOpNode(&new_op_desc);
 
     // Link inputs and outputs.
-    PADDLE_ENFORCE(subgraph.count(x));
+    PADDLE_ENFORCE_NE(
+        subgraph.count(x), 0,
+        platform::errors::NotFound("Detector did not find input x of conv2d."));
     auto* conv_in_node = subgraph.at(x);
 
     IR_NODE_LINK_TO(conv_in_node, new_conv_op);          // Input
diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc
index 8c491d4f58b4d3a1d93fe075fd0d118feeb6f8c2..007770cf57d278d155650c00996413e3bc8e7b53 100644
--- a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc
@@ -66,7 +66,9 @@ void ConvElementwiseAddFusePass::ApplyImpl(ir::Graph* graph) const {
     auto* new_conv_op = graph->CreateOpNode(&new_op_desc);
 
     // Link inputs and outputs.
-    PADDLE_ENFORCE(subgraph.count(x));
+    PADDLE_ENFORCE_NE(
+        subgraph.count(x), 0,
+        platform::errors::NotFound("Detector did not find input x of conv2d."));
     auto* conv_in_node = subgraph.at(x);
 
     IR_NODE_LINK_TO(conv_in_node, new_conv_op);          // Input
diff --git a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc
index 85e2f2bad323f7d3bddaa29b98e9f2dc41cd95a9..c50b7476c6a9616a784646b3ef6a43140ac2d401 100644
--- a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc
@@ -64,17 +64,23 @@ static int BuildFusion(Graph* graph, const std::string& name_scope,
 #undef SET_IN
 
     // Multiply embeddings with Weights
-    PADDLE_ENFORCE(scope);
+    PADDLE_ENFORCE_NOT_NULL(
+        scope, platform::errors::InvalidArgument("Scope cannot be nullptr."));
     const std::string& embeddings = patterns::UniqueKey("Embeddings");
     auto* embeddings_var = scope->Var(embeddings);
-    PADDLE_ENFORCE(embeddings_var);
+    PADDLE_ENFORCE_NOT_NULL(
+        embeddings_var,
+        platform::errors::InvalidArgument(
+            "Embeddings variable's pointer cannot be nullptr."));
     auto* embeddings_tensor =
         embeddings_var->GetMutable<framework::LoDTensor>();
     // Get WeightX size: [single_embedding, fc_size]
     // and embedding size: [dict_size, single_embedding]
     // and create new size of embeddings eg. [dict_size , hidden_size]
     auto* embedding_var = scope->FindVar(W->Name());
-    PADDLE_ENFORCE(embedding_var);
+    PADDLE_ENFORCE_NOT_NULL(
+        embedding_var, platform::errors::InvalidArgument(
+                           "Embedding variable's pointer cannot be nullptr."));
     const auto& embedding_tensor = embedding_var->Get<framework::LoDTensor>();
 
     const auto& weightx_tensor =
@@ -90,7 +96,9 @@ static int BuildFusion(Graph* graph, const std::string& name_scope,
 
     // Adding biases to GEMM result to be
     auto* lstm_bias_var = scope->FindVar(bias->Name());
-    PADDLE_ENFORCE(lstm_bias_var);
+    PADDLE_ENFORCE_NOT_NULL(lstm_bias_var,
+                            platform::errors::InvalidArgument(
+                                "Lstm bias var ptr cannot be nullptr."));
     const auto& lstm_bias_tensor = lstm_bias_var->Get<framework::LoDTensor>();
 
     auto alpha = 1.0f;
diff --git a/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass_tester.cc b/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass_tester.cc
index c1f822d7ca5cdc0a1bba1dbb5c646c61be244810..51e9545bf92e8310794898faaf45099237808e43 100644
--- a/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass_tester.cc
@@ -56,8 +56,17 @@ TEST(FCElementwiseLayerNormFusePass, basic) {
       GetNumOpNodes(graph, "fused_fc_elementwise_layernorm");
   VLOG(3) << DebugString(graph);
 
-  PADDLE_ENFORCE_EQ(num_nodes_before, num_nodes_after + 6);
-  PADDLE_ENFORCE_EQ(num_fused_nodes_after, 1);
+  PADDLE_ENFORCE_EQ(
+      num_nodes_before, num_nodes_after + 6,
+      platform::errors::InvalidArgument(
+          "After pass, the number of nodes should be reduced by 6, but the "
+          "number before pass is %d, after pass is %d.",
+          num_nodes_before, num_nodes_after));
+  PADDLE_ENFORCE_EQ(num_fused_nodes_after, 1,
+                    platform::errors::InvalidArgument(
+                        "After pass, the number of nodes of type "
+                        "'fused_fc_elementwise_layernorm' should be 1, not %d.",
+                        num_fused_nodes_after));
 }
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc
index 6a9c64e3a7f24d7d8f1848a959a0be8ab7544e5e..066a8fb975740ad5e45b4840a7404160d086b6f0 100644
--- a/paddle/fluid/framework/ir/fc_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc
@@ -25,7 +25,8 @@ namespace framework {
 namespace ir {
 
 void FCFusePass::ApplyImpl(ir::Graph* graph) const {
-  PADDLE_ENFORCE_NOT_NULL(graph);
+  PADDLE_ENFORCE_NOT_NULL(
+      graph, platform::errors::InvalidArgument("Graph cannot be nullptr."));
   FusePassBase::Init("fc_fuse", graph);
 
   int found_fc_count = 0;
diff --git a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc
index dfae572d4634e43fb288f5cc21bf53efc3834f5e..cf35c1ac772da079159cb4ced2edc234d7325b1e 100644
--- a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc
@@ -79,9 +79,17 @@ TEST(FCFusePass, basic) {
   int num_fc_nodes_after = GetNumOpNodes(graph, "fc");
   VLOG(3) << DebugString(graph);
 
-  PADDLE_ENFORCE_EQ(num_nodes_before, num_nodes_after + 6);
-  PADDLE_ENFORCE_EQ(num_fc_nodes_after, 2);
-  PADDLE_ENFORCE_EQ(num_mul_nodes_before, num_fc_nodes_after);
+  PADDLE_ENFORCE_EQ(num_nodes_before, num_nodes_after + 6,
+                    platform::errors::InvalidArgument(
+                        "num_nodes_before=%d, num_nodes_after=%d.",
+                        num_nodes_before, num_nodes_after));
+  PADDLE_ENFORCE_EQ(num_fc_nodes_after, 2,
+                    platform::errors::InvalidArgument("num_fc_nodes_after=%d.",
+                                                      num_fc_nodes_after));
+  PADDLE_ENFORCE_EQ(num_mul_nodes_before, num_fc_nodes_after,
+                    platform::errors::InvalidArgument(
+                        "num_mul_nodes_before=%d, num_fc_nodes_after=%d.",
+                        num_mul_nodes_before, num_fc_nodes_after));
 }
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc b/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc
index d26998e6fc99d67e305f315d6994a6bc1133b2ef..08dd0302b4b49e6b434beb0141abd974d2c7888d 100644
--- a/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc
@@ -68,18 +68,27 @@ static int BuildFusion(Graph* graph, const std::string& name_scope,
 #undef SET_IMTERMEDIATE_OUT
 
     auto* op = graph->CreateOpNode(&op_desc);
-    PADDLE_ENFORCE(graph->Has(kParamScopeAttr));
+    PADDLE_ENFORCE_EQ(graph->Has(kParamScopeAttr), true,
+                      platform::errors::InvalidArgument(
+                          "Graph have no attr kParamScopeAttr."));
     auto& scope = graph->Get<Scope>(kParamScopeAttr);
     if (with_fc_bias) {
       // Fusion GRU bias = fcbias + grubias
       auto* fusion_bias_var = scope.Var(NEW_NAME(bias) + bias->Name());
       auto* out_bias_tensor =
           fusion_bias_var->GetMutable<framework::LoDTensor>();
-      PADDLE_ENFORCE(fusion_bias_var);
+      PADDLE_ENFORCE_NOT_NULL(
+          fusion_bias_var,
+          platform::errors::InvalidArgument(
+              "Fusion bias variable's pointer cannot be nullptr."));
       auto* gru_bias_var = scope.FindVar(bias->Name());
       auto* fc_bias_var = scope.FindVar(fc_bias->Name());
-      PADDLE_ENFORCE(gru_bias_var);
-      PADDLE_ENFORCE(fc_bias_var);
+      PADDLE_ENFORCE_NOT_NULL(gru_bias_var,
+                              platform::errors::InvalidArgument(
+                                  "Gru bias var ptr cannot be nullptr."));
+      PADDLE_ENFORCE_NOT_NULL(fc_bias_var,
+                              platform::errors::InvalidArgument(
+                                  "Fc bias var ptr cannot be nullptr."));
       const auto& gru_bias_tenosr = gru_bias_var->Get<framework::LoDTensor>();
       const auto& fc_bias_tensor = fc_bias_var->Get<framework::LoDTensor>();
       // new bias = fc bias + gru bias
diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
index 44306a729544dcbe19a8949d1b32242c39c9ceb9..12c7fc051e23a946ec9049e061499056f009bfa3 100644
--- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
@@ -52,13 +52,17 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
 #undef SET_IN
     if (with_fc_bias) {
       // Add FC-bias with LSTM-bias and create a new weight
-      PADDLE_ENFORCE(scope);
+      PADDLE_ENFORCE_NOT_NULL(
+          scope, platform::errors::InvalidArgument("Scope cannot be nullptr."));
       const std::string& new_bias_var = patterns::UniqueKey("NewBias");
       auto* bias_var = scope->Var(new_bias_var);
-      PADDLE_ENFORCE(bias_var);
+      PADDLE_ENFORCE_NOT_NULL(bias_var, platform::errors::InvalidArgument(
+                                            "Bias var ptr cannot be nullptr."));
       auto* bias_tensor = bias_var->GetMutable<framework::LoDTensor>();
       auto* lstm_bias_var = scope->FindVar(bias->Name());
-      PADDLE_ENFORCE(lstm_bias_var);
+      PADDLE_ENFORCE_NOT_NULL(lstm_bias_var,
+                              platform::errors::InvalidArgument(
+                                  "Lstm bias var ptr cannot be nullptr."));
       const auto& lstm_bias_tensor = lstm_bias_var->Get<framework::LoDTensor>();
       bias_tensor->Resize(lstm_bias_tensor.dims());
 
diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_adam_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_adam_op_pass.cc
index 482d8cf3d2f19a02f760661e5779be6386271345..c284c1f4587cd6dd5c8eacc43968f45e4fbef699 100644
--- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_adam_op_pass.cc
+++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_adam_op_pass.cc
@@ -50,18 +50,25 @@ class FuseAdamOpPass : public FuseOptimizerOpPass {
                                  fused_scale2->inputs.end());
     for (auto &out_node : fused_scale1->outputs) {
       if (fused_scale2_in_nodes.count(out_node)) {
-        PADDLE_ENFORCE(out_node->IsCtrlVar(),
-                       "The dependency var only should be ctrl var.");
+        PADDLE_ENFORCE_EQ(out_node->IsCtrlVar(), true,
+                          platform::errors::PreconditionNotMet(
+                              "In adam op pass, the dependency var(%s) only "
+                              "should be ctrl var.",
+                              out_node->Name()));
         not_need_ctrl_var_nodes.insert(out_node);
       }
     }
 
     for (auto &node : not_need_ctrl_var_nodes) {
       // remove this node from the input op node.
-      PADDLE_ENFORCE(!node->inputs.empty(),
-                     "The input should not be empty here.");
+      PADDLE_ENFORCE_EQ(
+          node->inputs.empty(), false,
+          platform::errors::PreconditionNotMet(
+              "Node(%s)'s input should not be empty here.", node->Name()));
       auto op_node = node->inputs.front();
-      PADDLE_ENFORCE(op_node->IsOp());
+      PADDLE_ENFORCE_EQ(op_node->IsOp(), true,
+                        platform::errors::PreconditionNotMet(
+                            "Node(%s) should be an OP node.", op_node->Name()));
       op_node->outputs.erase(
           remove_if(
               op_node->outputs.begin(), op_node->outputs.end(),
@@ -85,7 +92,9 @@ class FuseAdamOpPass : public FuseOptimizerOpPass {
       const std::unordered_map<std::string, std::vector<std::string>> &vars_set,
       const std::unordered_map<std::string, std::string> &fused_vars_name,
       const std::vector<ir::Node *> &adam_ops, ir::Graph *graph) const {
-    PADDLE_ENFORCE_GT(adam_ops.size(), static_cast<size_t>(0));
+    PADDLE_ENFORCE_GT(
+        adam_ops.size(), static_cast<size_t>(0),
+        platform::errors::InvalidArgument("No adam op in the graph."));
 
     // Check attributions
     // NOTE: If new attribution is added, the following code maybe need change.
@@ -102,22 +111,58 @@ class FuseAdamOpPass : public FuseOptimizerOpPass {
         int64_t, adam_ops[0]->Op()->GetAttr("min_row_size_to_use_multithread"));
     for (auto &adam_op : adam_ops) {
       PADDLE_ENFORCE_EQ(
-          beta1, BOOST_GET_CONST(float, adam_op->Op()->GetAttr("beta1")));
+          beta1, BOOST_GET_CONST(float, adam_op->Op()->GetAttr("beta1")),
+          platform::errors::PreconditionNotMet(
+              "All adam Op's attr(beta1) must be same, but there are two "
+              "different "
+              "value: %f, %f.",
+              beta1, BOOST_GET_CONST(float, adam_op->Op()->GetAttr("beta1"))));
       PADDLE_ENFORCE_EQ(
-          beta2, BOOST_GET_CONST(float, adam_op->Op()->GetAttr("beta2")));
+          beta2, BOOST_GET_CONST(float, adam_op->Op()->GetAttr("beta2")),
+          platform::errors::PreconditionNotMet(
+              "All adam Op's attr(beta2) must be same, but there are two "
+              "different "
+              "value: %f, %f.",
+              beta2, BOOST_GET_CONST(float, adam_op->Op()->GetAttr("beta2"))));
       PADDLE_ENFORCE_EQ(
-          epsilon, BOOST_GET_CONST(float, adam_op->Op()->GetAttr("epsilon")));
+          epsilon, BOOST_GET_CONST(float, adam_op->Op()->GetAttr("epsilon")),
+          platform::errors::PreconditionNotMet(
+              "All adam Op's attr(epsilon) must be same, but there are two "
+              "different "
+              "value: %f, %f.",
+              epsilon,
+              BOOST_GET_CONST(float, adam_op->Op()->GetAttr("epsilon"))));
       PADDLE_ENFORCE_EQ(
-          lazy_mode,
-          BOOST_GET_CONST(bool, adam_op->Op()->GetAttr("lazy_mode")));
+          lazy_mode, BOOST_GET_CONST(bool, adam_op->Op()->GetAttr("lazy_mode")),
+          platform::errors::PreconditionNotMet(
+              "All adam Op's attr(lazy_mode) must be same, but there are two "
+              "different "
+              "value: %d, %d.",
+              lazy_mode,
+              BOOST_GET_CONST(bool, adam_op->Op()->GetAttr("lazy_mode"))));
       PADDLE_ENFORCE_EQ(
           min_row_size_to_use_multithread,
           BOOST_GET_CONST(int64_t, adam_op->Op()->GetAttr(
-                                       "min_row_size_to_use_multithread")));
+                                       "min_row_size_to_use_multithread")),
+          platform::errors::PreconditionNotMet(
+              "All adam Op's attr(min_row_size_to_use_multithread) must be "
+              "same, but there are two different value: %I64, %I64.",
+              min_row_size_to_use_multithread,
+              BOOST_GET_CONST(
+                  int64_t,
+                  adam_op->Op()->GetAttr("min_row_size_to_use_multithread"))));
       PADDLE_ENFORCE_EQ(
           op_role,
           BOOST_GET_CONST(int, adam_op->Op()->GetAttr(
-                                   OpProtoAndCheckerMaker::OpRoleAttrName())));
+                                   OpProtoAndCheckerMaker::OpRoleAttrName())),
+          platform::errors::PreconditionNotMet(
+              "All adam Op's attr(op_role) must be same, but there are two "
+              "different "
+              "value: %d, %d.",
+              op_role,
+              BOOST_GET_CONST(int,
+                              adam_op->Op()->GetAttr(
+                                  OpProtoAndCheckerMaker::OpRoleAttrName()))));
     }
 
     // NOTE: fused_var is only exist in scope, so the graph doesn't have
@@ -154,7 +199,10 @@ class FuseAdamOpPass : public FuseOptimizerOpPass {
                          const std::string &fused_var_name,
                          const std::vector<ir::Node *> &adam_ops,
                          ir::Graph *graph) const {
-    PADDLE_ENFORCE_EQ(beta_name.size(), adam_ops.size());
+    PADDLE_ENFORCE_EQ(beta_name.size(), adam_ops.size(),
+                      platform::errors::InvalidArgument(
+                          "Beta name size(%d) must equal to adam op size(%d).",
+                          beta_name.size(), adam_ops.size()));
     const std::string scale_op_name = "scale";
 
     // Get the scale_ops of dealing the adam's beta var.
@@ -168,7 +216,9 @@ class FuseAdamOpPass : public FuseOptimizerOpPass {
             return var_node->Var() &&
                    var_node->Var()->Name() == beta_1_pow_name;
           });
-      PADDLE_ENFORCE(beta_pow_iter != adam_ops[i]->inputs.end());
+      PADDLE_ENFORCE_NE(beta_pow_iter, adam_ops[i]->inputs.end(),
+                        platform::errors::NotFound(
+                            "Can not find %s in adam ops.", beta_1_pow_name));
 
       auto beta_pow_node = *beta_pow_iter;
       auto scale_op_iter = std::find_if(
@@ -176,11 +226,18 @@ class FuseAdamOpPass : public FuseOptimizerOpPass {
           [&scale_op_name](ir::Node *op_node) -> bool {
             return op_node->Op() && op_node->Op()->Type() == scale_op_name;
           });
-      PADDLE_ENFORCE(scale_op_iter != beta_pow_node->outputs.end());
+      PADDLE_ENFORCE_NE(
+          scale_op_iter, beta_pow_node->outputs.end(),
+          platform::errors::NotFound("Can not find %s in beta pow node.",
+                                     scale_op_name));
 
       scale_ops.emplace_back(*scale_op_iter);
     }
-    PADDLE_ENFORCE_EQ(scale_ops.size(), beta_name.size());
+    PADDLE_ENFORCE_EQ(
+        scale_ops.size(), beta_name.size(),
+        platform::errors::PreconditionNotMet(
+            "Beta name size(%d) must equal to scale ops size(%d).",
+            beta_name.size(), scale_ops.size()));
     VLOG(6) << "The number of scale op is " << scale_ops.size() << ".";
     // Check attributions
     // NOTE: If new attribution is added, the following code maybe need change.
@@ -193,16 +250,40 @@ class FuseAdamOpPass : public FuseOptimizerOpPass {
         BOOST_GET_CONST(bool, scale_ops[0]->Op()->GetAttr("bias_after_scale"));
     for (auto &scale_op : scale_ops) {
       PADDLE_ENFORCE_EQ(
-          scale, BOOST_GET_CONST(float, scale_op->Op()->GetAttr("scale")));
+          scale, BOOST_GET_CONST(float, scale_op->Op()->GetAttr("scale")),
+          platform::errors::PreconditionNotMet(
+              "All scale Op's attr(scale) must be same, but there are two "
+              "different "
+              "value: %f, %f.",
+              scale, BOOST_GET_CONST(float, scale_op->Op()->GetAttr("scale"))));
       PADDLE_ENFORCE_EQ(
-          bias, BOOST_GET_CONST(float, scale_op->Op()->GetAttr("bias")));
+          bias, BOOST_GET_CONST(float, scale_op->Op()->GetAttr("bias")),
+          platform::errors::PreconditionNotMet(
+              "All scale Op's attr(bias) must be same, but there are two "
+              "different "
+              "value: %f, %f.",
+              bias, BOOST_GET_CONST(float, scale_op->Op()->GetAttr("bias"))));
       PADDLE_ENFORCE_EQ(
           bias_after_scale,
-          BOOST_GET_CONST(bool, scale_op->Op()->GetAttr("bias_after_scale")));
+          BOOST_GET_CONST(bool, scale_op->Op()->GetAttr("bias_after_scale")),
+          platform::errors::PreconditionNotMet(
+              "All scale Op's attr(bias_after_scale) must be same, but there "
+              "are two different value: %d, %d.",
+              bias_after_scale,
+              BOOST_GET_CONST(bool,
+                              scale_op->Op()->GetAttr("bias_after_scale"))));
       PADDLE_ENFORCE_EQ(
           op_role,
           BOOST_GET_CONST(int, scale_op->Op()->GetAttr(
-                                   OpProtoAndCheckerMaker::OpRoleAttrName())));
+                                   OpProtoAndCheckerMaker::OpRoleAttrName())),
+          platform::errors::PreconditionNotMet(
+              "All scale Op's attr(op_role) must be same, but there are two "
+              "different "
+              "value: %d, %d.",
+              op_role,
+              BOOST_GET_CONST(int,
+                              scale_op->Op()->GetAttr(
+                                  OpProtoAndCheckerMaker::OpRoleAttrName()))));
     }
 
     // NOTE: fused_var is only exist in scope, so the graph doesn't have
diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_momentum_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_momentum_op_pass.cc
index f70745be1bd6097007d07152d3cce1707350ca14..43ec8bff5edc10cbfc48c06a2e35a5a46ed7043c 100644
--- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_momentum_op_pass.cc
+++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_momentum_op_pass.cc
@@ -37,7 +37,9 @@ class FuseMomentumOpPass : public FuseOptimizerOpPass {
       const std::unordered_map<std::string, std::vector<std::string>> &vars_set,
       const std::unordered_map<std::string, std::string> &fused_vars_name,
       const std::vector<ir::Node *> &momentum_ops, ir::Graph *graph) const {
-    PADDLE_ENFORCE_GT(momentum_ops.size(), static_cast<size_t>(0));
+    PADDLE_ENFORCE_GT(
+        momentum_ops.size(), static_cast<size_t>(0),
+        platform::errors::InvalidArgument("Momentum ops must not be empyt."));
 
     // Check attributions
     // NOTE: If new attribution is added, the following code maybe need change.
@@ -50,14 +52,32 @@ class FuseMomentumOpPass : public FuseOptimizerOpPass {
 
     for (auto &momentum_op : momentum_ops) {
       PADDLE_ENFORCE_EQ(
-          mu, BOOST_GET_CONST(float, momentum_op->Op()->GetAttr("mu")));
+          mu, BOOST_GET_CONST(float, momentum_op->Op()->GetAttr("mu")),
+          platform::errors::InvalidArgument(
+              "All momentum Op's attr(mu) must be same, but there are two "
+              "different "
+              "value: %f, %f.",
+              mu, BOOST_GET_CONST(float, momentum_op->Op()->GetAttr("mu"))));
       PADDLE_ENFORCE_EQ(
           use_nesterov,
-          BOOST_GET_CONST(bool, momentum_op->Op()->GetAttr("use_nesterov")));
+          BOOST_GET_CONST(bool, momentum_op->Op()->GetAttr("use_nesterov")),
+          platform::errors::InvalidArgument(
+              "All momentum Op's attr(use_nesterov) must be same, but there "
+              "are two different value: %d, %d.",
+              use_nesterov, BOOST_GET_CONST(bool, momentum_op->Op()->GetAttr(
+                                                      "use_nesterov"))));
       PADDLE_ENFORCE_EQ(
           op_role,
           BOOST_GET_CONST(int, momentum_op->Op()->GetAttr(
-                                   OpProtoAndCheckerMaker::OpRoleAttrName())));
+                                   OpProtoAndCheckerMaker::OpRoleAttrName())),
+          platform::errors::InvalidArgument(
+              "All momentum Op's attr(op_role) must be same, but there are two "
+              "different "
+              "value: %d, %d.",
+              op_role,
+              BOOST_GET_CONST(int,
+                              momentum_op->Op()->GetAttr(
+                                  OpProtoAndCheckerMaker::OpRoleAttrName()))));
     }
 
     // NOTE: fused_var is only exist in scope, so the graph doesn't have
diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc
index 35bdfde96bc3c8a0a9247378849730d9ef4f54aa..fa86db891f88108f96d42ca3f1640a5b878d16aa 100644
--- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc
+++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc
@@ -41,10 +41,12 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
   for (auto &node : topo_nodes) {
     if (node->Op()->Type() == fuse_op_type) {
       auto grad_name = node->Op()->Input(kGrad);
-      PADDLE_ENFORCE_EQ(grad_name.size(), static_cast<size_t>(1),
-                        "The %s operator has multiple gradient input. Expected "
-                        "it to only have one gradient input.",
-                        fuse_op_type);
+      PADDLE_ENFORCE_EQ(
+          grad_name.size(), static_cast<size_t>(1),
+          platform::errors::InvalidArgument(
+              "The %s operator has multiple gradient input. Expected "
+              "it to only have one gradient input.",
+              fuse_op_type));
       if (IsLoDTensorType(GetTypeOfVar(vars_info, grad_name[0]))) {
         opt_nodes.emplace_back(node);
       }
@@ -96,7 +98,8 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
     VLOG(6) << var_name << ": " << fused_var_name;
     PADDLE_ENFORCE_EQ(
         fused_var_set.count(fused_var_name), 0,
-        platform::errors::AlreadyExists("The fused variable already exists."));
+        platform::errors::AlreadyExists(
+            "The fused variable(%s) already exists.", fused_var_name));
     fused_var_set.insert(fused_var_name);
     fused_vars_name.emplace(var_name, fused_var_name);
   }
@@ -110,7 +113,10 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
         result.Get<details::ParamsAndGrads>(details::kParamsAndDenseGrads);
     PADDLE_ENFORCE_LE(
         params_and_dense_grads.size(), aux_var_map.at(kGrad).size(),
-        "The number of dense gradients should be little than optimizer ops.");
+        platform::errors::InvalidArgument(
+            "The number of dense gradients(%d) should be "
+            "little than optimizer ops(%d).",
+            params_and_dense_grads.size(), aux_var_map.at(kGrad).size()));
 
     std::unordered_set<std::string> opt_grad_set(aux_var_map.at(kGrad).size());
     for (auto &p_g : params_and_dense_grads) {
@@ -130,13 +136,14 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
     // some gradient's name maybe changed.
     if (new_grad_idx.size() == 0) {
       if (!result.Has(details::kFusedGrads)) {
-        PADDLE_THROW(
+        PADDLE_THROW(platform::errors::PreconditionNotMet(
             "The coalesce_grad_tensor_pass should "
-            "be called before this pass.");
+            "be called before this pass."));
       }
       auto &fused_grad = result.Get<details::FusedGrads>(details::kFusedGrads);
       PADDLE_ENFORCE_NE(fused_grad.size(), 0,
-                        "The fused gradient should not be empty.");
+                        platform::errors::NotFound(
+                            "The fused gradient should not be empty."));
       if (fused_grad.size() > 1) {
         // Note(chenweihang): Because the dtype of those gradients is not
         //   unified,so the number of fused gradients is more than one,
@@ -146,8 +153,9 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
       auto &fused_vars = result.Get<details::FusedVars>(details::kFusedVars);
       auto iter =
           std::find(fused_vars.begin(), fused_vars.end(), fused_grad.front());
-      PADDLE_ENFORCE_EQ(iter != fused_vars.end(), true,
-                        "Not found the fused gradient variable.");
+      PADDLE_ENFORCE_EQ(
+          iter != fused_vars.end(), true,
+          platform::errors::NotFound("Not found the fused gradient variable."));
       fused_vars_name[kGrad] = fused_grad.front();
 
       // Sort the parameters and auxiliary variables according
@@ -334,16 +342,24 @@ void FuseOptimizerOpPass::FuseGradientsToContinuousSpace(
   // The Gradients should not be reused during memory optimization.
   for (auto &grad_var_name : grads) {
     auto iter = vars_info.find(grad_var_name);
-    PADDLE_ENFORCE_EQ(iter != vars_info.end(), true,
-                      "The gradient variable %s is not found.", grad_var_name);
-    PADDLE_ENFORCE_EQ(!iter->second.empty(), true,
-                      "The gradient var node %s is not found.", grad_var_name);
-    PADDLE_ENFORCE_NOT_NULL(iter->second.front()->Var(),
-                            "The gradient var node is null.");
+    PADDLE_ENFORCE_EQ(
+        iter != vars_info.end(), true,
+        platform::errors::NotFound("The gradient variable %s is not found.",
+                                   grad_var_name));
+    PADDLE_ENFORCE_EQ(
+        !iter->second.empty(), true,
+        platform::errors::NotFound("The gradient var node %s is not found.",
+                                   grad_var_name));
+    PADDLE_ENFORCE_NOT_NULL(
+        iter->second.front()->Var(),
+        platform::errors::InvalidArgument("The gradient var(%s) node is null.",
+                                          grad_var_name));
     PADDLE_ENFORCE_EQ(
         IsLoDTensorType(iter->second.front()->Var()->GetType()), true,
-        "Currently the gradient type only should be LoDTensor when "
-        "fusing optimizer ops.");
+        platform::errors::InvalidArgument(
+            "Currently the gradient(%s) type only should be LoDTensor when "
+            "fusing optimizer ops.",
+            grad_var_name));
     for (auto var : iter->second) {
       pinned_var_set.insert(var->Var()->Name());
     }
@@ -382,11 +398,14 @@ const VarDesc *FuseOptimizerOpPass::GetVarDescFromVarsInfo(
     const std::string &var_name) const {
   auto grad_iter = vars_info.find(var_name);
   PADDLE_ENFORCE_EQ(grad_iter != vars_info.end(), true,
-                    "The gradient variable %s is not found.", var_name);
+                    platform::errors::NotFound(
+                        "The gradient variable %s is not found.", var_name));
   PADDLE_ENFORCE_EQ(!grad_iter->second.empty(), true,
-                    "The gradient var node %s is not found.", var_name);
+                    platform::errors::NotFound(
+                        "The gradient var node %s is not found.", var_name));
   PADDLE_ENFORCE_NOT_NULL(grad_iter->second.front()->Var(),
-                          "The gradient var node is null.");
+                          platform::errors::InvalidArgument(
+                              "The gradient var(%s) node is null.", var_name));
   return grad_iter->second.front()->Var();
 }
 
@@ -428,8 +447,9 @@ void FuseOptimizerOpPass::SortParametersAndAuxVars(
     const std::vector<std::pair<std::string, std::string>> &params_grads,
     std::unordered_map<std::string, std::vector<std::string>> *aux_var_map,
     std::vector<ir::Node *> *ops) const {
-  PADDLE_ENFORCE_NE(aux_var_map->count(kGrad), static_cast<size_t>(0),
-                    "The gradient variable doesn‘t exist.");
+  PADDLE_ENFORCE_NE(
+      aux_var_map->count(kGrad), static_cast<size_t>(0),
+      platform::errors::NotFound("The gradient variable doesn‘t exist."));
   auto &grad_vec = aux_var_map->at(kGrad);
 
   std::vector<size_t> grad_sort_idx;
@@ -437,8 +457,10 @@ void FuseOptimizerOpPass::SortParametersAndAuxVars(
 
   for (auto &p_g : params_grads) {
     auto iter = std::find(grad_vec.begin(), grad_vec.end(), p_g.second);
-    PADDLE_ENFORCE_EQ(iter != grad_vec.end(), true,
-                      "%s is not found in gradient vector", p_g.second);
+    PADDLE_ENFORCE_EQ(
+        iter != grad_vec.end(), true,
+        platform::errors::NotFound(
+            "Parameter@Grad(%s) is not found in gradient vector.", p_g.second));
     auto idx = std::distance(grad_vec.begin(), iter);
     grad_sort_idx.emplace_back(idx);
   }
@@ -477,9 +499,10 @@ void FuseOptimizerOpPass::GetFusingVarNamesMap(
     for (auto &var_n : aux_vars_name) {
       auto arg_names = node->Op()->Input(var_n);
       PADDLE_ENFORCE_EQ(arg_names.size(), static_cast<size_t>(1),
-                        "The input variable of optimizer to be fused is "
-                        "invalid. Excepted %s only has one %s input.",
-                        node->Op()->Type(), var_n);
+                        platform::errors::InvalidArgument(
+                            "The input variable of optimizer to be fused is "
+                            "invalid. Excepted %s only has one %s input.",
+                            node->Op()->Type(), var_n));
       (*aux_args_name)[var_n].emplace_back(arg_names[0]);
     }
   }
@@ -525,10 +548,14 @@ void FuseOptimizerOpPass::InsertInputAndOutputForFusedOpNode(
   auto deal_with_ctrl_vars = [&out_dep_vars, &not_useful_vars,
                               &fused_opt_node](ir::Node *ctr_var_node) {
     PADDLE_ENFORCE_EQ(ctr_var_node->inputs.size(), 1,
-                      "The control var node has nultiple inputs.");
+                      platform::errors::InvalidArgument(
+                          "The control var(%s) node has multiple inputs.",
+                          ctr_var_node->Name()));
     if (ctr_var_node->inputs.front() == fused_opt_node) {
-      PADDLE_ENFORCE_GT(ctr_var_node->outputs.size(), 0,
-                        "The control var node has no output.");
+      PADDLE_ENFORCE_GT(
+          ctr_var_node->outputs.size(), 0,
+          platform::errors::InvalidArgument(
+              "The control var(%s) node has no output.", ctr_var_node->Name()));
       auto output_ops = ctr_var_node->outputs;
       output_ops.erase(std::remove_if(output_ops.begin(), output_ops.end(),
                                       [&fused_opt_node](const ir::Node *node) {
diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_sgd_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_sgd_op_pass.cc
index 1504f00b27cd6a416761a4227f6c504bb38278bb..70d4d2b865230078889115b809d8617b4415cc99 100644
--- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_sgd_op_pass.cc
+++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_sgd_op_pass.cc
@@ -35,7 +35,9 @@ class FuseSgdOpPass : public FuseOptimizerOpPass {
       const std::unordered_map<std::string, std::vector<std::string>> &vars_set,
       const std::unordered_map<std::string, std::string> &fused_vars_name,
       const std::vector<ir::Node *> &sgd_ops, ir::Graph *graph) const {
-    PADDLE_ENFORCE_GT(sgd_ops.size(), static_cast<size_t>(0));
+    PADDLE_ENFORCE_GT(
+        sgd_ops.size(), static_cast<size_t>(0),
+        platform::errors::InvalidArgument("SGD ops must not be empyt."));
 
     // NOTE: fused_var is only exist in scope, so the graph doesn't have
     // fused_var node.
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index f58e6c8bff93da0b27cd147108bc57a452269188..ff6dffa704eeceeabfc5eb1d6786f40b2e523e98 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -140,7 +140,7 @@ void GraphPatternDetector::ValidateByNodeRole(
           subgraphs->begin(), subgraphs->end(),
           [](const GraphPatternDetector::subgraph_t &subgraph) -> bool {
             // Collect the inputs and outputs.
-            std::unordered_set<Node *> ios;
+            std::set<Node *> ios;
             for (auto &item : subgraph) {
               if (!item.first->IsIntermediate()) {
                 ios.insert(item.second);
@@ -166,7 +166,7 @@ void GraphPatternDetector::ValidateByNodeRole(
 }
 
 struct HitGroup {
-  std::unordered_map<PDNode *, Node *> roles;
+  std::map<PDNode *, Node *> roles;
 
   bool Match(Node *node, PDNode *pat) {
     if (nodes_.count(node)) {
@@ -184,7 +184,7 @@ struct HitGroup {
   }
 
  private:
-  std::unordered_set<Node *> nodes_;
+  std::set<Node *> nodes_;
 };
 
 // Tell whether Node a links to b.
@@ -283,7 +283,7 @@ void GraphPatternDetector::UniquePatterns(
   if (subgraphs->empty()) return;
   std::vector<GraphPatternDetector::subgraph_t> result;
 
-  std::unordered_set<size_t> set;
+  std::set<size_t> set;
   std::hash<std::string> hasher;
   for (auto &g : *subgraphs) {
     // Sort the items in the sub-graph, and transform to a string key.
@@ -305,7 +305,7 @@ void GraphPatternDetector::UniquePatterns(
 void GraphPatternDetector::RemoveOverlappedMatch(
     std::vector<subgraph_t> *subgraphs) {
   std::vector<subgraph_t> result;
-  std::unordered_set<Node *> node_set;
+  std::set<Node *> node_set;
 
   for (const auto &subgraph : *subgraphs) {
     bool valid = true;
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h
index 422ad1ef47a84ff21a2568a2773c899733f34dc7..e1cce7848dd54b02a540b144ca1088f62eeb52cb 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -231,7 +231,7 @@ class PDPattern {
 
   std::vector<std::unique_ptr<PDNode>> nodes_;
   std::vector<edge_t> edges_;
-  std::unordered_map<std::string, PDNode*> node_map_;
+  std::map<std::string, PDNode*> node_map_;
   static size_t id_;
 };
 
@@ -263,7 +263,7 @@ class PDPattern {
  */
 class GraphPatternDetector {
  public:
-  using subgraph_t = std::unordered_map<PDNode*, Node*>;
+  using subgraph_t = std::map<PDNode*, Node*>;
 
   // Operate on the detected pattern.
   using handle_t =
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_cross_op_memory_reuse_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_cross_op_memory_reuse_pass.cc
index 6ce14203629e0af20701fee1e589c898992d6cda..b1afa47910fadfaf3560d15cb0bbe88ae0da7371 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_cross_op_memory_reuse_pass.cc
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_cross_op_memory_reuse_pass.cc
@@ -116,7 +116,10 @@ std::vector<OpHandleBase *> BufferSharedCrossOpMemoryReusePass::SortOp(
   graph_view.BreadthFirstVisit(
       [&](OpHandleBase *cur_op) { sorted_ops.emplace_back(cur_op); });
   PADDLE_ENFORCE_EQ(sorted_ops.size(), graph_view.OpNumber(),
-                    "There are unvisited ops");
+                    platform::errors::InvalidArgument(
+                        "Sorted ops size(%d) not equal to graph op size(%d). "
+                        "There are unvisited ops.",
+                        sorted_ops.size(), graph_view.OpNumber()));
   return sorted_ops;
 }
 
@@ -181,7 +184,9 @@ void BufferSharedCrossOpMemoryReusePass::RunOnScopeIdx(size_t idx) const {
       auto *out_node = *(out_nodes.begin());
       auto *out_var =
           dynamic_cast<VarHandle *>(&(out_node->Wrapper<VarHandleBase>()));
-      PADDLE_ENFORCE_NOT_NULL(out_var);
+      PADDLE_ENFORCE_NOT_NULL(
+          out_var, platform::errors::NotFound(
+                       "Can not find a valid Var Node for Var %s.", out_arg));
 
       // If out_arg is not reusable, skip it
       if (!IsOutVarReusable(*out_var)) {
@@ -269,7 +274,8 @@ size_t BufferSharedCrossOpMemoryReusePass::ResolveDependencyBetween(
     auto op_dep = GetOpDep(prev_op, op);
     if (op_dep == NodeDependency::kBefore) continue;
     PADDLE_ENFORCE_EQ(op_dep, NodeDependency::kNoDep,
-                      "The graph has circle, this may be a bug");
+                      platform::errors::InvalidArgument(
+                          "The graph has circle, this may be a bug."));
 
     auto iter =
         std::find_if(prev_op->Outputs().begin(), prev_op->Outputs().end(),
@@ -316,9 +322,13 @@ size_t BufferSharedCrossOpMemoryReusePass::ResolveDependencyBetween(
 }
 
 void BufferSharedCrossOpMemoryReusePass::BuildOpDependencyMap() const {
-  PADDLE_ENFORCE(ops_.empty(), "ops_ must be initialized here");
-  PADDLE_ENFORCE(op_to_idx_.empty(), "op_to_idx_ must be initialized here");
-  PADDLE_ENFORCE(deps_.empty(), "deps_ must be initialized here");
+  PADDLE_ENFORCE_EQ(ops_.empty(), true, platform::errors::InvalidArgument(
+                                            "Ops must be initialized here."));
+  PADDLE_ENFORCE_EQ(
+      op_to_idx_.empty(), true,
+      platform::errors::InvalidArgument("Op to idx must be initialized here."));
+  PADDLE_ENFORCE_EQ(deps_.empty(), true, platform::errors::InvalidArgument(
+                                             "Deps must be initialized here."));
 
   // Toposort ops
   OpGraphView graph_view(ir::FilterByNodeWrapper<OpHandleBase>(*graph_));
@@ -344,7 +354,10 @@ void BufferSharedCrossOpMemoryReusePass::BuildOpDependencyMap() const {
                                          prev_preceding_ops.end());
     }
   });
-  PADDLE_ENFORCE_EQ(preceding_ops.size(), op_num);
+  PADDLE_ENFORCE_EQ(preceding_ops.size(), op_num,
+                    platform::errors::InvalidArgument(
+                        "Preceding ops size(%d) must equal to op num(%d).",
+                        preceding_ops.size(), op_num));
 
   // Find out ComputationOpHandles only
   ops_.resize(scope_num);
@@ -384,28 +397,43 @@ void BufferSharedCrossOpMemoryReusePass::BuildOpDependencyMap() const {
 size_t BufferSharedCrossOpMemoryReusePass::OpIndex(
     const ComputationOpHandle *op) const {
   auto iter = op_to_idx_[op->GetScopeIdx()].find(op);
-  PADDLE_ENFORCE(iter != op_to_idx_[op->GetScopeIdx()].end());
+  PADDLE_ENFORCE_NE(iter, op_to_idx_[op->GetScopeIdx()].end(),
+                    platform::errors::NotFound(
+                        "Can not find op(%s) in op_to_idx_.", op->Name()));
   return iter->second;
 }
 
 NodeDependency BufferSharedCrossOpMemoryReusePass::GetOpDep(
     const ComputationOpHandle *op1, const ComputationOpHandle *op2) const {
-  PADDLE_ENFORCE_EQ(op1->GetScopeIdx(), op2->GetScopeIdx());
+  PADDLE_ENFORCE_EQ(op1->GetScopeIdx(), op2->GetScopeIdx(),
+                    platform::errors::InvalidArgument(
+                        "Op(%s) and op(%s) must in the same scope.",
+                        op1->Name(), op2->Name()));
   return deps_[op1->GetScopeIdx()][OpIndex(op1)][OpIndex(op2)];
 }
 
 void BufferSharedCrossOpMemoryReusePass::SetOpDep(
     const ComputationOpHandle *op1, const ComputationOpHandle *op2,
     NodeDependency dep) const {
-  PADDLE_ENFORCE_EQ(op1->GetScopeIdx(), op2->GetScopeIdx());
+  PADDLE_ENFORCE_EQ(op1->GetScopeIdx(), op2->GetScopeIdx(),
+                    platform::errors::InvalidArgument(
+                        "Op(%s) and op(%s) must in the same scope.",
+                        op1->Name(), op2->Name()));
   if (op1 == op2) {
-    PADDLE_ENFORCE(dep == NodeDependency::kSame);
+    PADDLE_ENFORCE_EQ(
+        dep, NodeDependency::kSame,
+        platform::errors::InvalidArgument(
+            "Set Same Op(%s) Dep, dep must be kSame type.", op1->Name()));
     auto idx = OpIndex(op1);
     deps_[op1->GetScopeIdx()][idx][idx] = NodeDependency::kSame;
   } else {
     auto idx1 = OpIndex(op1);
     auto idx2 = OpIndex(op2);
-    PADDLE_ENFORCE(dep != NodeDependency::kSame && idx1 != idx2);
+    PADDLE_ENFORCE_EQ((dep != NodeDependency::kSame && idx1 != idx2), true,
+                      platform::errors::InvalidArgument(
+                          "Op(%s) and Op(%s) should not have same "
+                          "index(%d), and dep should not kSame type.",
+                          op1->Name(), op2->Name(), idx1));
     deps_[op1->GetScopeIdx()][idx1][idx2] = dep;
     deps_[op1->GetScopeIdx()][idx2][idx1] = ReverseNodeDependency(dep);
   }
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc
index 338a608b4ae3dc9e3bd10793e0882f5618471eef..0b42f2ebd5555a5c73527d9819ff254411a399d4 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc
@@ -57,7 +57,9 @@ void BufferSharedInplaceOpPass::Run(Graph *graph) const {
       auto *op = *(pair.second.ops().begin());
       const std::string &op_type = op->GetOp()->Type();
       const framework::OpDesc *op_desc = op->Node()->Op();
-      PADDLE_ENFORCE_NOT_NULL(op_desc);
+      PADDLE_ENFORCE_NOT_NULL(
+          op_desc, platform::errors::NotFound("Op(%s) can not find opdesc.",
+                                              op->Name()));
 
       auto &infer_inplace = OpInfoMap::Instance().Get(op_type).infer_inplace_;
       if (!infer_inplace) {
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/eager_deletion_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/eager_deletion_pass.cc
index 9a322bdc1dce1ba72763ed5face10f3e0fddd35c..7b9b5aa62307443789214b4cca2c6b367dc2a287 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/eager_deletion_pass.cc
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/eager_deletion_pass.cc
@@ -58,8 +58,12 @@ static int64_t GetMemorySize(
         &vars,
     const std::string &var_name) {
   auto *var_desc = TryGetLatestVarDesc(vars.at(var_name));
-  PADDLE_ENFORCE_NOT_NULL(var_desc);
-  PADDLE_ENFORCE(IsLoDTensor(var_desc));
+  PADDLE_ENFORCE_NOT_NULL(
+      var_desc,
+      platform::errors::NotFound("Var(%s) can not find VarDesc.", var_name));
+  PADDLE_ENFORCE_EQ(IsLoDTensor(var_desc), true,
+                    platform::errors::InvalidArgument(
+                        "Var(%s) must be LoDTensor.", var_name));
   auto dims = var_desc->GetShape();
   return SizeOfType(var_desc->GetDataType()) *
          std::accumulate(dims.begin(), dims.end(), static_cast<int64_t>(1),
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h b/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h
index 4f6bacecab4aac39b6f4cb01138560ca8378c13a..94842485440bdce17f47d3b2fc7000e57a37c3c8 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h
@@ -42,8 +42,10 @@ class MemOptVarInfo {
   }
 
   void SetRefCnt(size_t ref_cnt) {
-    PADDLE_ENFORCE_GE(ref_cnt, 1,
-                      "Reference count must be larger than or equal to 1");
+    PADDLE_ENFORCE_GE(
+        ref_cnt, 1,
+        platform::errors::InvalidArgument(
+            "Reference count(%d) must be larger than or equal to 1.", ref_cnt));
     ref_cnt_ = ref_cnt;
     runtime_ref_cnt_ = ref_cnt;
   }
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc
index 20c7968d6ac56054e31c4f6f51e72e7ae02bea57..221b0a76e7ef5b01d87c63fb466a9b980f1e69b4 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc
@@ -66,7 +66,11 @@ bool MemoryReusePass::TryReuseVar(details::VarHandle *in_var,
                                   details::VarHandle *out_var) const {
   auto *op =
       dynamic_cast<details::ComputationOpHandle *>(out_var->GeneratedOp());
-  PADDLE_ENFORCE_NOT_NULL(op);
+  PADDLE_ENFORCE_NOT_NULL(
+      op,
+      platform::errors::InvalidArgument(
+          "Var(%s) have no GeneratedOp, or it's op is not ComputationOpHandle.",
+          out_var->Name()));
   if (IsVarPairReusable(*in_var, *out_var)) {
     AddReuseVar(op, in_var, out_var);
     return true;
@@ -91,10 +95,13 @@ VarDesc *MemoryReusePass::GetVarDesc(const details::VarHandle &var) const {
   size_t scope_idx = var.scope_idx();
   auto iter = var_descs_[scope_idx].find(var_name);
   if (iter == var_descs_[scope_idx].end()) {
-    PADDLE_ENFORCE((*all_vars_)[scope_idx].count(var_name),
-                   "Variable %s not found", var_name);
+    PADDLE_ENFORCE_NE(
+        (*all_vars_)[scope_idx].count(var_name), 0,
+        platform::errors::NotFound("Variable %s not found.", var_name));
     auto *desc = TryGetLatestVarDesc((*all_vars_)[scope_idx].at(var_name));
-    PADDLE_ENFORCE_NOT_NULL(desc);
+    PADDLE_ENFORCE_NOT_NULL(
+        desc,
+        platform::errors::NotFound("Var(%s) can not find VarDesc.", var_name));
     var_descs_[scope_idx].emplace(var_name, desc);
     return desc;
   } else {
@@ -119,7 +126,9 @@ void MemoryReusePass::CollectShareTensorBufferOpHandles() const {
     if (share_buffer_op != nullptr) {
       auto *compute_op =
           details::GetUniquePendingComputationOpHandle(share_buffer_op);
-      PADDLE_ENFORCE(ops_.count(compute_op) == 0);
+      PADDLE_ENFORCE_EQ(
+          ops_.count(compute_op), 0,
+          platform::errors::AlreadyExists("Compute op already exists."));
       ops_.emplace(compute_op, share_buffer_op);
     }
   }
@@ -227,8 +236,11 @@ bool MemoryReusePass::IsInVarReusable(const details::VarHandle &in_var) const {
  */
 bool MemoryReusePass::IsOutVarReusable(
     const details::VarHandle &out_var) const {
-  PADDLE_ENFORCE_NOT_NULL(dynamic_cast<const details::ComputationOpHandle *>(
-      out_var.GeneratedOp()));
+  PADDLE_ENFORCE_NOT_NULL(
+      dynamic_cast<const details::ComputationOpHandle *>(out_var.GeneratedOp()),
+      platform::errors::InvalidArgument(
+          "Var(%s) have no GeneratedOp, or it's op is not ComputationOpHandle.",
+          out_var.Name()));
   const auto out_name = out_var.Name();
   if (out_name == kEmptyVarName) {
     return false;
@@ -236,9 +248,10 @@ bool MemoryReusePass::IsOutVarReusable(
 
   // out_var must be the first version!!!
   auto out_var_iter = (*all_vars_)[out_var.scope_idx()].find(out_name);
-  PADDLE_ENFORCE(out_var_iter != (*all_vars_)[out_var.scope_idx()].end() &&
-                     !out_var_iter->second.empty(),
-                 "Cannot find variable %s", out_name);
+  PADDLE_ENFORCE_EQ(
+      (out_var_iter != (*all_vars_)[out_var.scope_idx()].end() &&
+       !out_var_iter->second.empty()),
+      true, platform::errors::NotFound("Cannot find variable %s.", out_name));
 
   if (out_var_iter->second[0] != &out_var) {
     return false;
@@ -282,7 +295,11 @@ bool MemoryReusePass::IsVarPairReusable(
     const details::VarHandle &in_var, const details::VarHandle &out_var) const {
   auto *op =
       dynamic_cast<const details::ComputationOpHandle *>(out_var.GeneratedOp());
-  PADDLE_ENFORCE_NOT_NULL(op);
+  PADDLE_ENFORCE_NOT_NULL(
+      op,
+      platform::errors::InvalidArgument(
+          "Var(%s) have no GeneratedOp, or it's op is not ComputationOpHandle.",
+          out_var.Name()));
 
   const auto in_name = in_var.Name();
   if (in_name == out_var.Name()) {
@@ -308,8 +325,10 @@ bool MemoryReusePass::IsVarPairReusable(
 void MemoryReusePass::AddReuseVar(details::ComputationOpHandle *op,
                                   details::VarHandle *in_var,
                                   details::VarHandle *out_var) const {
-  PADDLE_ENFORCE((*var_infos_)[op->GetScopeIdx()].count(in_var->Name()) > 0,
-                 "%s does not in mem-opt var infos", in_var->Name());
+  PADDLE_ENFORCE_GT(
+      (*var_infos_)[op->GetScopeIdx()].count(in_var->Name()), 0,
+      platform::errors::NotFound("Var(%s) does not in mem opt var infos.",
+                                 in_var->Name()));
 
   if (ops_.count(op) == 0) {
     InsertShareTensorBufferOpHandleToGraph(op);
@@ -349,7 +368,10 @@ void MemoryReusePass::UpdateLastLiveOpOfVar(details::ComputationOpHandle *op,
   if (out_var_op_iter == (*last_live_ops_of_vars_)[scope_idx].end()) {
     last_live_op_of_in_var = op;
   } else {
-    PADDLE_ENFORCE(!out_var_op_iter->second.ops().empty());
+    PADDLE_ENFORCE_EQ(
+        out_var_op_iter->second.ops().empty(), false,
+        platform::errors::InvalidArgument(
+            "Var(%s)'s last live op should not empty.", out_var->Name()));
     last_live_op_of_in_var = *(out_var_op_iter->second.ops().begin());
   }
 
@@ -359,8 +381,9 @@ void MemoryReusePass::UpdateLastLiveOpOfVar(details::ComputationOpHandle *op,
   last_live_ops_of_in_var->insert(last_live_op_of_in_var);
 
   auto in_var_info_iter = (*var_infos_)[scope_idx].find(in_var->Name());
-  PADDLE_ENFORCE(in_var_info_iter != (*var_infos_)[scope_idx].end(),
-                 "Cannot find variable %s", in_var->Name());
+  PADDLE_ENFORCE_NE(
+      in_var_info_iter, (*var_infos_)[scope_idx].end(),
+      platform::errors::NotFound("Cannot find variable %s.", in_var->Name()));
 
   in_var_info_iter->second->SetRefCnt(1);
 }
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.cc b/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.cc
index d2cc89a2b49d8a6cace230e79ccb2e5f096dc53c..11c2508afb5747b6f0f3bba06c68448fef7d384a 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.cc
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.cc
@@ -39,7 +39,7 @@ void OpGraphView::Build(const std::vector<details::OpHandleBase *> &ops) {
   }
   PADDLE_ENFORCE(
       preceding_ops_.size() == ops.size() && pending_ops_.size() == ops.size(),
-      "There are duplicate ops in graph.");
+      platform::errors::InvalidArgument("There are duplicate ops in graph."));
 }
 
 std::unordered_set<details::OpHandleBase *> OpGraphView::AllOps() const {
@@ -56,8 +56,10 @@ bool OpGraphView::HasOp(details::OpHandleBase *op) const {
 }
 
 void OpGraphView::EnforceHasOp(details::OpHandleBase *op) const {
-  PADDLE_ENFORCE(HasOp(op), "Cannot find op %s in OpGraphView",
-                 op == nullptr ? "nullptr" : op->DebugString());
+  PADDLE_ENFORCE_EQ(HasOp(op), true,
+                    platform::errors::NotFound(
+                        "Cannot find op %s in OpGraphView.",
+                        op == nullptr ? "nullptr" : op->DebugString()));
 }
 
 const std::unordered_set<details::OpHandleBase *> &OpGraphView::PendingOps(
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h b/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h
index 86b25c13959a7934b9838085a0a92a62e4ac821c..5fb2caedba85d2892e18db5e84067c2d2ebada6e 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h
@@ -127,9 +127,13 @@ void OpGraphView::BreadthFirstVisit(Callback &&callback) const {
     }
   }
 
-  PADDLE_ENFORCE_EQ(num_calls, op_num, "There are unvisited ops");
-  PADDLE_ENFORCE_EQ(visited_ops.size(), op_num, "There are unvisited ops");
-  PADDLE_ENFORCE(op_deps.empty(), "There are unvisited ops");
+  PADDLE_ENFORCE_EQ(num_calls, op_num, platform::errors::InvalidArgument(
+                                           "There are unvisited ops."));
+  PADDLE_ENFORCE_EQ(
+      visited_ops.size(), op_num,
+      platform::errors::InvalidArgument("There are unvisited ops."));
+  PADDLE_ENFORCE_EQ(op_deps.empty(), true, platform::errors::InvalidArgument(
+                                               "There are unvisited ops."));
 }
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass.cc
index 4584b3d4e0f07d6cbf8b8afb226f69490bbef09d..88d1b2aa003ce70e16aa3171774a67753fad1896 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass.cc
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass.cc
@@ -77,11 +77,15 @@ class ShrinkDepsOpFunctor {
       const std::vector<details::OpHandleBase *> &ops) const {
     std::unordered_map<details::OpHandleBase *, size_t> op_to_idx;
     for (size_t i = 0; i < ops.size(); ++i) {
-      PADDLE_ENFORCE(graph_.HasOp(ops[i]), "Op does not exist in graph");
+      PADDLE_ENFORCE_EQ(
+          graph_.HasOp(ops[i]), true,
+          platform::errors::InvalidArgument("Op does not exist in graph."));
       op_to_idx[ops[i]] = i;
     }
 
-    PADDLE_ENFORCE(op_to_idx.size() == ops.size(), "Duplicate ops");
+    PADDLE_ENFORCE_EQ(
+        op_to_idx.size(), ops.size(),
+        platform::errors::InvalidArgument("Graph may have duplicate ops."));
 
     std::vector<std::vector<RelationShip>> ret(ops.size());
     for (auto &e : ret) {
@@ -247,9 +251,9 @@ ExtractComputationOpFromLastLivedVar(details::VarHandle *var, size_t scope_idx,
     return {};
   }
 
-  PADDLE_ENFORCE_EQ(
-      computation_ops.empty(), false,
-      platform::errors::InvalidArgument("Computation ops should not be empty"));
+  PADDLE_ENFORCE_EQ(computation_ops.empty(), false,
+                    platform::errors::InvalidArgument(
+                        "Computation ops should not be empty."));
 
   // stage four. Try to shrink computation op if they depend on each other.
   // Get the smallest set of the most ops.
@@ -263,8 +267,9 @@ void ReferenceCountPass::ApplyImpl(ir::Graph *graph) const {
       Get<std::vector<LastLiveOpsOfVars>>(kLastLiveOpsOfVars);
 
   PADDLE_ENFORCE(last_live_ops_of_vars.empty() && var_infos.empty(),
-                 "Last Live Ops and Reference Counts of vars should be "
-                 "initialized at here.");
+                 platform::errors::InvalidArgument(
+                     "Last live ops and reference counts of vars should be "
+                     "initialized at here."));
 
   const auto &vars = graph->Get<details::GraphVars>(details::kGraphVars);
 
@@ -304,11 +309,15 @@ void ReferenceCountPass::ApplyImpl(ir::Graph *graph) const {
       auto &var_name = name_var_pair.first;
       auto &var_handles = name_var_pair.second;
 
-      PADDLE_ENFORCE_EQ(var_desc->Name(), var_name);
-
       PADDLE_ENFORCE_EQ(
-          var_handles.empty(), false,
-          platform::errors::InvalidArgument("Variable %s not found", var_name));
+          var_desc->Name(), var_name,
+          platform::errors::InvalidArgument(
+              "A Var, it's VarName(%s) and DescName(%s) not same.", var_name,
+              var_desc->Name()));
+
+      PADDLE_ENFORCE_EQ(var_handles.empty(), false,
+                        platform::errors::InvalidArgument(
+                            "Variable %s not found.", var_name));
       auto last_ver_var = var_handles.back();
 
       if (last_ver_var->Node()->IsCtrlVar()) {
@@ -327,12 +336,13 @@ void ReferenceCountPass::ApplyImpl(ir::Graph *graph) const {
         continue;
       }
 
+      PADDLE_ENFORCE_EQ(status, LastLiveOpSearchStatus::kSuccess,
+                        platform::errors::InvalidArgument(
+                            "Status(%d) must be success.", status));
       PADDLE_ENFORCE_EQ(
-          status, LastLiveOpSearchStatus::kSuccess,
-          platform::errors::InvalidArgument("status must be success"));
-      PADDLE_ENFORCE_EQ(result.empty(), false,
-                        platform::errors::NotFound(
-                            "Last living ops of %s cannot be empty", var_name));
+          result.empty(), false,
+          platform::errors::NotFound("Last living ops of %s cannot be empty.",
+                                     var_name));
 
       std::string last_live_ops_log_str;
       for (auto &each_ret : result) {
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc
index 119917428997b03ecb0278fac5de677f0017b2bc..45ff275d530857690d1f169bbcf60a99952ae2c2 100644
--- a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc
@@ -22,7 +22,8 @@ namespace framework {
 namespace ir {
 
 void ConvActivationFusePass::ApplyImpl(ir::Graph* graph) const {
-  PADDLE_ENFORCE_NOT_NULL(graph, "graph cannot be nullptr.");
+  PADDLE_ENFORCE_NOT_NULL(
+      graph, platform::errors::InvalidArgument("Graph cannot be nullptr."));
   FusePassBase::Init("conv_activation_mkldnn_fuse", graph);
 
   GraphPatternDetector gpd;
@@ -75,7 +76,8 @@ void ConvActivationFusePass::ApplyImpl(ir::Graph* graph) const {
     GraphSafeRemoveNodes(graph, {activation, conv_out});
 
     PADDLE_ENFORCE_GT(subgraph.count(conv_input), 0UL,
-                      "subgraph has to contain conv_input node.");
+                      platform::errors::InvalidArgument(
+                          "Subgraph has to contain conv input node."));
     IR_NODE_LINK_TO(conv, activation_out);
     found_conv_activation_count++;
   };
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc
index bbfc8c005580bb949b498e4474c4059cd09f56b3..82e0af3c198750296032769f2f3b04658871adb7 100644
--- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc
@@ -26,7 +26,11 @@ namespace ir {
 template <typename BinaryOperation>
 LoDTensor tensor_apply_eltwise(const LoDTensor& vec_a, const LoDTensor& vec_b,
                                BinaryOperation f) {
-  PADDLE_ENFORCE_EQ(vec_a.dims(), vec_b.dims());
+  PADDLE_ENFORCE_EQ(vec_a.dims(), vec_b.dims(),
+                    platform::errors::InvalidArgument(
+                        "Input two tensors must have same shape, but they are "
+                        "different: %s, %s.",
+                        vec_a.dims(), vec_b.dims()));
   LoDTensor vec_y;
   vec_y.Resize(vec_a.dims());
   const float* a = vec_a.data<float>();
@@ -39,11 +43,13 @@ LoDTensor tensor_apply_eltwise(const LoDTensor& vec_a, const LoDTensor& vec_b,
 }
 
 void ConvBiasFusePass::ApplyImpl(ir::Graph* graph) const {
-  PADDLE_ENFORCE(graph);
+  PADDLE_ENFORCE_NOT_NULL(
+      graph, platform::errors::InvalidArgument("Graph cannot be nullptr."));
   FusePassBase::Init(name_scope_, graph);
 
   auto* scope = param_scope();
-  PADDLE_ENFORCE(scope);
+  PADDLE_ENFORCE_NOT_NULL(
+      scope, platform::errors::InvalidArgument("Scope cannot be nullptr."));
 
   GraphPatternDetector gpd;
   auto* conv_input =
@@ -68,7 +74,9 @@ void ConvBiasFusePass::ApplyImpl(ir::Graph* graph) const {
     // elementwise_add op
     GET_IR_NODE_FROM_SUBGRAPH(eltwise, eltwise, conv_bias_pattern);
 
-    PADDLE_ENFORCE(subgraph.count(conv_input));
+    PADDLE_ENFORCE_NE(
+        subgraph.count(conv_input), 0,
+        platform::errors::NotFound("Detector did not find conv input."));
 
     // check if fuse can be done and if MKL-DNN should be used
     FuseOptions fuse_option = FindFuseOption(*conv, *eltwise);
@@ -86,10 +94,16 @@ void ConvBiasFusePass::ApplyImpl(ir::Graph* graph) const {
     if (has_bias && conv->Op()->Input("Bias").size() > 0) {
       auto conv_bias_names = conv->Op()->Input("Bias");
       // add eltwise bias to existing conv bias
-      PADDLE_ENFORCE_EQ(conv_bias_names.size(), 1);
+      PADDLE_ENFORCE_EQ(conv_bias_names.size(), 1,
+                        platform::errors::NotFound("Can not find var Bias."));
       auto* conv_bias_var = scope->FindVar(conv_bias_names[0]);
       auto* conv_bias_tensor = conv_bias_var->GetMutable<LoDTensor>();
-      PADDLE_ENFORCE_EQ(conv_bias_tensor->dims(), eltwise_bias_tensor->dims());
+      PADDLE_ENFORCE_EQ(
+          conv_bias_tensor->dims(), eltwise_bias_tensor->dims(),
+          platform::errors::InvalidArgument(
+              "Conv bias tensor and eltwise bias tensor "
+              "must have same shape, but they are different: %s, %s.",
+              conv_bias_tensor->dims(), eltwise_bias_tensor->dims()));
       *conv_bias_tensor = tensor_apply_eltwise(
           *conv_bias_tensor, *eltwise_bias_tensor, std::plus<float>());
 
diff --git a/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc
index 9e8f0f0c46cee250e4e425cc636467d89171fa84..af64cb22054e9f2ea751bb993a39e8be563ae458 100644
--- a/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc
@@ -39,7 +39,10 @@ void ConvConcatReLUFusePass::FindConcatWithConvs(
 
     for (auto node : concat_inputs) {
       auto prev_op_node = node->inputs;
-      PADDLE_ENFORCE_EQ(prev_op_node.size(), 1);
+      PADDLE_ENFORCE_EQ(prev_op_node.size(), 1,
+                        platform::errors::InvalidArgument(
+                            "Node(%s) input size(%d) must be 1.", node->Name(),
+                            prev_op_node.size()));
       auto* conv_op = prev_op_node[0];
       if (conv_op->Op()->Type() != "conv2d") return;
 
@@ -103,7 +106,8 @@ void ConvConcatReLUFusePass::FuseConvConcatReLU(
 }
 
 void ConvConcatReLUFusePass::ApplyImpl(ir::Graph* graph) const {
-  PADDLE_ENFORCE(graph);
+  PADDLE_ENFORCE_NOT_NULL(
+      graph, platform::errors::InvalidArgument("Graph cannot be nullptr."));
   FusePassBase::Init(name_scope_, graph);
 
   std::unordered_map<const Node*, int> concat_with_convs_counter;
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
index f47ef2162da35f600b795bf276b2aef319ab7cff..23419d5b9e0a20adcb6245a5a5aa4c5c4b5f3a34 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
@@ -37,18 +37,24 @@ void UnlinkNodes(ir::Node* a, ir::Node* b) {
                   b->inputs.end());
 }
 
-void LogCannotQuantizeOp(Node* op) {
+void LogCannotQuantizeOp(Node* op, const char* details = nullptr) {
   std::stringstream msg_ss;
   msg_ss << "Cannot quantize operator " << op->Name()
          << " (type: " << op->Op()->Type() << ", id: " << op->id() << ").";
+  if (details) msg_ss << " " << details;
   PrettyLogDetail(msg_ss.str().c_str());
 }
 
 void LogScaleIsMissingForVar(Node* var) {
+  VLOG(4) << "Quantization scale for the variable " << var->Name()
+          << " is missing.";
+}
+
+void LogQuantizationDisabled(Node* op) {
   std::stringstream msg_ss;
-  msg_ss << "Quantization scale for the variable " << var->Name()
-         << " is missing.";
-  PrettyLogDetail(msg_ss.str().c_str());
+  VLOG(4) << "Qantization skipped for operator " << op->Name()
+          << " (type: " << op->Op()->Type() << ", id: " << op->id()
+          << "). Attribute use_quantizer = false.";
 }
 
 }  // namespace
@@ -62,10 +68,10 @@ void CPUQuantizePass::QuantizeInput(Graph* g, Node* op, Node* input,
   auto inputs = op->Op()->InputNames();
   bool name_found =
       std::find(inputs.begin(), inputs.end(), input_name) != inputs.end();
-  PADDLE_ENFORCE_EQ(
-      name_found, true,
-      platform::errors::InvalidArgument("%s isn't the input of the %s operator",
-                                        input_name, op->Op()->Type()));
+  PADDLE_ENFORCE_EQ(name_found, true,
+                    platform::errors::InvalidArgument(
+                        "Var(%s) isn't the input of the %s operator.",
+                        input_name, op->Op()->Type()));
   unsigned max = is_unsigned ? U8_MAX : S8_MAX;
   float scale = scale_to_one * max;
 
@@ -104,8 +110,14 @@ void CPUQuantizePass::QuantizeInputs(Graph* g, Node* op, std::string input_name,
                                      std::string scale_attr_name) const {
   auto inputs = op->inputs;
   auto output = op->outputs[0];
-  PADDLE_ENFORCE_GE(inputs.size(), 1);
-  PADDLE_ENFORCE_EQ(op->outputs.size(), 1);
+  PADDLE_ENFORCE_GE(inputs.size(), 1,
+                    platform::errors::InvalidArgument(
+                        "OP(%s)'s inputs(%d) must be equal or greater than 1.",
+                        op->Name(), inputs.size()));
+  PADDLE_ENFORCE_EQ(op->outputs.size(), 1,
+                    platform::errors::InvalidArgument(
+                        "OP(%s)'s outputs(%d) must be equal to 1.", op->Name(),
+                        op->outputs.size()));
 
   // create a quantize op desc prototype
   OpDesc q_desc;
@@ -153,8 +165,8 @@ void CPUQuantizePass::DequantizeOutput(Graph* g, Node* op, Node* output,
       std::find(outputs.begin(), outputs.end(), output_name) != outputs.end();
   PADDLE_ENFORCE_EQ(name_found, true,
                     platform::errors::InvalidArgument(
-                        "%s isn't the output of the %s operator", output_name,
-                        op->Op()->Type()));
+                        "Var(%s) isn't the output of the %s operator.",
+                        output_name, op->Op()->Type()));
   unsigned max = is_unsigned ? U8_MAX : S8_MAX;
   float scale = scale_to_one * max;
 
@@ -239,12 +251,23 @@ void CPUQuantizePass::QuantizeConv(Graph* graph,
     auto* conv_op_desc = conv_op->Op();
 
     // skip if should not be quantized
-    if (!conv_op_desc->GetAttrIfExists<bool>("use_quantizer")) return;
+    if (!conv_op_desc->GetAttrIfExists<bool>("use_quantizer")) {
+      LogQuantizationDisabled(conv_op);
+      return;
+    }
 
     GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern);
 
+    auto has_output_scale = AreScalesPresentForNodes(conv_op, {conv_output});
+    if (with_residual_data && !has_output_scale) {
+      LogCannotQuantizeOp(conv_op,
+                          "Conv op with ResidualData input cannot be quantized "
+                          "without output scale.");
+      return;
+    }
+
     if (with_residual_data) {
       GET_IR_NODE_FROM_SUBGRAPH(conv_residual_data, conv_residual_data,
                                 conv_pattern);
@@ -283,7 +306,7 @@ void CPUQuantizePass::QuantizeConv(Graph* graph,
     conv_op->Op()->SetAttr("Scale_weights", filter_scale);
 
     // if quantization scale is missing for output tensor, return fp32 data
-    if (AreScalesPresentForNodes(conv_op, {conv_output})) {
+    if (has_output_scale) {
       bool is_output_unsigned{false};
       auto output_scale =
           GetScaleValueForNode(conv_output, &is_output_unsigned);
@@ -333,9 +356,13 @@ void CPUQuantizePass::QuantizeFc(Graph* graph) const {
     auto* fc_op_desc = fc->Op();
 
     // skip if should not be quantized
-    if (fc_op_desc->GetAttrIfExists<bool>("use_quantizer") != true ||
-        fc_op_desc->GetAttrIfExists<bool>("use_mkldnn") != true)
+    if (!fc_op_desc->GetAttrIfExists<bool>("use_quantizer")) {
+      LogQuantizationDisabled(fc);
+      return;
+    }
+    if (!fc_op_desc->GetAttrIfExists<bool>("use_mkldnn")) {
       return;
+    }
 
     GET_IR_NODE_FROM_SUBGRAPH(weights, weights, fc_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(input, input, fc_pattern);
@@ -396,7 +423,10 @@ void CPUQuantizePass::QuantizePool(Graph* graph) const {
     auto* pool_op_desc = pool_op->Op();
 
     // skip if should not be quantized
-    if (!pool_op_desc->GetAttrIfExists<bool>("use_quantizer")) return;
+    if (!pool_op_desc->GetAttrIfExists<bool>("use_quantizer")) {
+      LogQuantizationDisabled(pool_op);
+      return;
+    }
 
     GET_IR_NODE_FROM_SUBGRAPH(pool_input, pool_input, pool_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(pool_output, pool_output, pool_pattern);
@@ -438,7 +468,10 @@ void CPUQuantizePass::QuantizeConcat(Graph* graph) const {
     auto* concat_op_desc = concat_op->Op();
 
     // skip if should not be quantized
-    if (!concat_op_desc->GetAttrIfExists<bool>("use_quantizer")) return;
+    if (!concat_op_desc->GetAttrIfExists<bool>("use_quantizer")) {
+      LogQuantizationDisabled(concat_op);
+      return;
+    }
 
     GET_IR_NODE_FROM_SUBGRAPH(concat_out, concat_out, concat_pattern);
 
@@ -481,7 +514,10 @@ void CPUQuantizePass::QuantizePriorBox(Graph* graph) const {
     auto* prior_box_op_desc = prior_box_op->Op();
 
     // skip if should not be quantized
-    if (!prior_box_op_desc->GetAttrIfExists<bool>("use_quantizer")) return;
+    if (!prior_box_op_desc->GetAttrIfExists<bool>("use_quantizer")) {
+      LogQuantizationDisabled(prior_box_op);
+      return;
+    }
 
     GET_IR_NODE_FROM_SUBGRAPH(prior_box_input, prior_box_input,
                               prior_box_pattern);
@@ -522,6 +558,7 @@ void CPUQuantizePass::QuantizeTranspose(Graph* graph) const {
 
     // skip if should not be quantized
     if (!transpose_op_desc->GetAttrIfExists<bool>("use_quantizer")) {
+      LogQuantizationDisabled(transpose_op);
       return;
     }
     GET_IR_NODE_FROM_SUBGRAPH(prev_op, prev_op, transpose_pattern);
@@ -576,6 +613,7 @@ void CPUQuantizePass::QuantizeReshape(Graph* graph) const {
 
     // skip if should not be quantized
     if (!reshape_op_desc->GetAttrIfExists<bool>("use_quantizer")) {
+      LogQuantizationDisabled(reshape_op);
       return;
     }
     GET_IR_NODE_FROM_SUBGRAPH(prev_op, prev_op, reshape_pattern);
@@ -628,6 +666,7 @@ void CPUQuantizePass::QuantizeMatmul(Graph* graph) const {
 
     // skip if should not be quantized
     if (!matmul_op_desc->GetAttrIfExists<bool>("use_quantizer")) {
+      LogQuantizationDisabled(matmul_op);
       return;
     }
     GET_IR_NODE_FROM_SUBGRAPH(prev_op_x, prev_op_x, matmul_pattern);
@@ -649,10 +688,12 @@ void CPUQuantizePass::QuantizeMatmul(Graph* graph) const {
     bool is_x_unsigned{false}, is_y_unsigned{false};
     auto input_x_scale = GetScaleValueForNode(matmul_in_x, &is_x_unsigned);
     auto input_y_scale = GetScaleValueForNode(matmul_in_y, &is_y_unsigned);
-    PADDLE_ENFORCE_EQ(
-        is_x_unsigned, is_y_unsigned,
-        platform::errors::InvalidArgument(
-            "Matmul inputs should have the same value of is_unsigned"));
+    PADDLE_ENFORCE_EQ(is_x_unsigned, is_y_unsigned,
+                      platform::errors::InvalidArgument(
+                          "Matmul inputs should have the same "
+                          "attribute of signed/unsigned, but they "
+                          "are different: x(%d), y(%d).",
+                          is_x_unsigned, is_y_unsigned));
     QuantizeInput(g, matmul_op, matmul_in_x, "X", input_x_scale, is_x_unsigned,
                   "Scale_x");
     QuantizeInput(g, matmul_op, matmul_in_y, "Y", input_y_scale, is_y_unsigned,
@@ -676,12 +717,88 @@ void CPUQuantizePass::QuantizeMatmul(Graph* graph) const {
   PrettyLogDetail("---    quantized %d matmul ops", quantize_matmul_count);
 }
 
+void CPUQuantizePass::QuantizeElementwiseAdd(Graph* graph) const {
+  GraphPatternDetector gpd;
+  auto pattern = gpd.mutable_pattern();
+  patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope_};
+
+  elementwise_add_pattern(
+      pattern->NewNode(elementwise_add_pattern.elementwise_add_x_repr()),
+      pattern->NewNode(elementwise_add_pattern.elementwise_add_y_repr()));
+
+  int quantize_elementwise_add_count = 0;
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    VLOG(4) << "Quantize elementwise_add op";
+    GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op,
+                              elementwise_add_pattern);
+    auto* elementwise_add_op_desc = elementwise_add_op->Op();
+
+    // skip if should not be quantized
+    if (!elementwise_add_op_desc->GetAttrIfExists<bool>("use_quantizer")) {
+      LogQuantizationDisabled(elementwise_add_op);
+      return;
+    }
+
+    GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_x, elementwise_add_x,
+                              elementwise_add_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_y, elementwise_add_y,
+                              elementwise_add_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out,
+                              elementwise_add_pattern);
+
+    if (!AreScalesPresentForNodes(elementwise_add_op,
+                                  {elementwise_add_x, elementwise_add_y})) {
+      LogCannotQuantizeOp(elementwise_add_op);
+      return;
+    }
+
+    bool is_x_unsigned{false}, is_y_unsigned{false};
+    auto input_x_scale =
+        GetScaleValueForNode(elementwise_add_x, &is_x_unsigned);
+    auto input_y_scale =
+        GetScaleValueForNode(elementwise_add_y, &is_y_unsigned);
+
+    // TODO(sfraczek): add support for different signness
+    if (is_x_unsigned != is_y_unsigned) {
+      LogCannotQuantizeOp(elementwise_add_op,
+                          "ElementwiseAdd inputs must be of the same type.");
+      return;
+    }
+
+    QuantizeInput(g, elementwise_add_op, elementwise_add_x, "X", input_x_scale,
+                  is_x_unsigned, "Scale_x");
+    QuantizeInput(g, elementwise_add_op, elementwise_add_y, "Y", input_y_scale,
+                  is_y_unsigned, "Scale_y");
+
+    // if quantization scale is missing for output tensor, return fp32 data
+    if (AreScalesPresentForNodes(elementwise_add_op, {elementwise_add_out})) {
+      bool is_output_unsigned{false};
+      auto output_scale =
+          GetScaleValueForNode(elementwise_add_out, &is_output_unsigned);
+      DequantizeOutput(g, elementwise_add_op, elementwise_add_out, "Out",
+                       output_scale, is_output_unsigned, "Scale_out");
+    } else {
+      elementwise_add_op->Op()->SetAttr("force_fp32_output", true);
+    }
+
+    ++quantize_elementwise_add_count;
+  };
+  gpd(graph, handler);
+  AddStatis(quantize_elementwise_add_count);
+
+  PrettyLogDetail("---    quantized %d elementwise_add ops",
+                  quantize_elementwise_add_count);
+}
+
 void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const {
   VLOG(3) << "Quantizing the graph.";
-  PADDLE_ENFORCE(graph);
+  PADDLE_ENFORCE_NOT_NULL(
+      graph, platform::errors::InvalidArgument("Graph cannot be nullptr."));
   FusePassBase::Init(name_scope_, graph);
 
-  PADDLE_ENFORCE(param_scope());
+  PADDLE_ENFORCE_NOT_NULL(param_scope(), platform::errors::InvalidArgument(
+                                             "Scope cannot be nullptr."));
 
   QuantizeConv(graph, false /* with_residual_data */);
   QuantizeConv(graph, true /* with_residual_data */);
@@ -692,6 +809,7 @@ void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const {
   QuantizeFc(graph);
   QuantizeReshape(graph);
   QuantizeMatmul(graph);
+  QuantizeElementwiseAdd(graph);
 }
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h
index cd5c673061b79602f6eceda55fb0107d2a41535c..21219e7dca8c712a09650779f7ef803052a85748 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h
@@ -60,6 +60,8 @@ class CPUQuantizePass : public FusePassBase {
 
   void QuantizeMatmul(Graph* graph) const;
 
+  void QuantizeElementwiseAdd(Graph* graph) const;
+
   void QuantizeInput(Graph* g, Node* op, Node* input, std::string input_name,
                      double scale_to_one, bool is_unsigned,
                      std::string scale_attr_name = "") const;
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc
index 67a9957cb06004b298af025c465ab37134fc6bbe..395b419cac13d6d76b6e30579e52a1957b548bab 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc
@@ -14,6 +14,7 @@
 
 #include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h"
 #include <gtest/gtest.h>
+
 #include "paddle/fluid/framework/naive_executor.h"
 #include "paddle/fluid/imperative/type_defs.h"
 #include "paddle/fluid/platform/place.h"
@@ -82,6 +83,14 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
     op->SetAttr("Scale_x", 1.0f);
     op->SetAttr("Scale_y", 1.0f);
     op->SetAttr("Scale_out", 1.0f);
+  } else if (type == "elementwise_add") {
+    op->SetInput("X", {inputs[0]});
+    if (inputs.size() > 1) op->SetInput("Y", {inputs[1]});
+    op->SetOutput("Out", {outputs[0]});
+    op->SetAttr("use_quantizer", use_quantizer);
+    op->SetAttr("Scale_x", 1.0f);
+    op->SetAttr("Scale_y", 1.0f);
+    op->SetAttr("Scale_out", 1.0f);
   }
 }
 
@@ -95,7 +104,8 @@ void InitTensorHolder(Scope* scope, const paddle::platform::Place& place,
 void PreparePass(std::unique_ptr<ir::Graph>* graph, const ProgramDesc& prog,
                  const std::initializer_list<std::string> variable_names,
                  int* original_nodes_num, int* current_nodes_num,
-                 std::string var_without_scale = "") {
+                 std::string var_without_scale = "",
+                 std::string var_signed = "") {
   auto place = paddle::platform::CPUPlace();
   NaiveExecutor exe{place};
   Scope scope;
@@ -108,8 +118,7 @@ void PreparePass(std::unique_ptr<ir::Graph>* graph, const ProgramDesc& prog,
     tensor.Resize({1});
     auto* ptr = tensor.mutable_data<double>(place);
     ptr[0] = 2.0;
-
-    (*scales)[v] = std::make_pair(false, std::move(tensor));
+    (*scales)[v] = std::make_pair(v == var_signed, std::move(tensor));
   }
 
   (*graph)->SetNotOwned(kParamScopeAttr, &scope);
@@ -387,7 +396,7 @@ static const std::initializer_list<std::string> variable_names_reshape = {
 // c->Dropout->d
 ProgramDesc BuildProgramDescReshape() {
   ProgramDesc prog;
-  for (auto& v : variable_names_transpose) {
+  for (auto& v : variable_names_reshape) {
     prog.MutableBlock(0)->Var(v);
   }
   SetOp(&prog, "dequantize", "Dequantize1", {"a"}, {"b"}, true);
@@ -402,7 +411,7 @@ ProgramDesc BuildProgramDescReshape() {
 // c->Dropout->d
 ProgramDesc BuildProgramDescReshapeBetweenNonQuantizedOp() {
   ProgramDesc prog;
-  for (auto& v : variable_names_transpose) {
+  for (auto& v : variable_names_reshape) {
     prog.MutableBlock(0)->Var(v);
   }
 
@@ -491,7 +500,7 @@ static const std::initializer_list<std::string> variable_names_matmul = {
 
 ProgramDesc BuildProgramDescMatmul() {
   ProgramDesc prog;
-  for (auto& v : variable_names_transpose) {
+  for (auto& v : variable_names_matmul) {
     prog.MutableBlock(0)->Var(v);
   }
   SetOp(&prog, "dequantize", "Dequantize1", {"a"}, {"b"}, true);
@@ -504,7 +513,7 @@ ProgramDesc BuildProgramDescMatmul() {
 
 ProgramDesc BuildProgramDescMatmulNotQuantized() {
   ProgramDesc prog;
-  for (auto& v : variable_names_transpose) {
+  for (auto& v : variable_names_matmul) {
     prog.MutableBlock(0)->Var(v);
   }
   SetOp(&prog, "dropout", "Dropout", {"a"}, {"b"}, false);
@@ -569,6 +578,97 @@ TEST(CpuQuantizePass, matmul_not_quantized) {
   MainTestMatmul(BuildProgramDescMatmulNotQuantized(), matmul_count,
                  quant_count, dequant_count, added_nodes_count, 1.0f);
 }
+
+static const std::initializer_list<std::string> variable_names_elementwise_add =
+    {"a", "b", "c", "d", "e", "f"};
+
+ProgramDesc BuildProgramDescElementwiseAdd() {
+  ProgramDesc prog;
+  for (auto& v : variable_names_elementwise_add) {
+    prog.MutableBlock(0)->Var(v);
+  }
+  SetOp(&prog, "dequantize", "Dequantize1", {"a"}, {"b"}, true);
+  SetOp(&prog, "dequantize", "Dequantize2", {"c"}, {"d"}, true);
+  SetOp(&prog, "elementwise_add", "ElementwiseAdd", {"b", "d"}, {"e"}, true,
+        true);
+  SetOp(&prog, "dropout", "Dropout", {"e"}, {"f"}, true, false);
+
+  return prog;
+}
+
+void MainTestElementwiseAdd(const ProgramDesc& prog, int elementwise_add_count,
+                            int quant_count, int dequant_count,
+                            int added_nodes_count, float scale,
+                            bool output_scale_missing = false,
+                            bool unsigned_and_signed_input = false) {
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+  int original_nodes_num, current_nodes_num;
+  PreparePass(&graph, prog, variable_names_elementwise_add, &original_nodes_num,
+              &current_nodes_num, output_scale_missing ? "e" : "",
+              unsigned_and_signed_input ? "b" : "");
+
+  int quantize_nodes_count = 0;
+  int dequantize_nodes_count = 0;
+  int elementwise_add_nodes_count = 0;
+  for (auto* node : graph->Nodes()) {
+    if (node->IsOp()) {
+      auto* op = node->Op();
+      if (op->Type() == "elementwise_add") {
+        elementwise_add_nodes_count++;
+        if (unsigned_and_signed_input) scale = 1.0f;
+        auto op_name = BOOST_GET_CONST(std::string, op->GetAttr("name"));
+        EXPECT_EQ(BOOST_GET_CONST(float, op->GetAttr("Scale_x")), scale)
+            << "Scale_x for node '" + op_name + "'.";
+        EXPECT_EQ(BOOST_GET_CONST(float, op->GetAttr("Scale_y")), scale)
+            << "Scale_y for node '" + op_name + "'.";
+        if (output_scale_missing) scale = 1.0;
+        EXPECT_EQ(BOOST_GET_CONST(float, op->GetAttr("Scale_out")), scale)
+            << "Scale_out for node '" + op_name + "'.";
+      } else if (op->Type() == "quantize") {
+        quantize_nodes_count++;
+      } else if (op->Type() == "dequantize") {
+        dequantize_nodes_count++;
+      }
+    }
+  }
+  EXPECT_EQ(elementwise_add_nodes_count, elementwise_add_count);
+  EXPECT_EQ(quantize_nodes_count, quant_count);
+  EXPECT_EQ(dequantize_nodes_count, dequant_count);
+  EXPECT_EQ(original_nodes_num + added_nodes_count, current_nodes_num);
+}
+
+TEST(CpuQuantizePass, elementwise_add) {
+  int elementwise_add_count = 1;
+  int quant_count = 2;
+  int dequant_count = 3;
+  // 2 Quant + 2 IN + 1 DeQuant + 1 OUT
+  int added_nodes_count = 6;
+  MainTestElementwiseAdd(BuildProgramDescElementwiseAdd(),
+                         elementwise_add_count, quant_count, dequant_count,
+                         added_nodes_count, 2.0f * 127);
+}
+
+TEST(CpuQuantizePass, elementwise_add_output_scale_missing) {
+  int elementwise_add_count = 1;
+  int quant_count = 2;
+  int dequant_count = 2;
+  // 2 Quant + 2 IN
+  int added_nodes_count = 4;
+  MainTestElementwiseAdd(BuildProgramDescElementwiseAdd(),
+                         elementwise_add_count, quant_count, dequant_count,
+                         added_nodes_count, 2.0f * 127, true);
+}
+
+TEST(CpuQuantizePass, elementwise_add_unsigned_and_signed_input) {
+  int elementwise_add_count = 1;
+  int quant_count = 0;
+  int dequant_count = 2;
+  int added_nodes_count = 0;
+  MainTestElementwiseAdd(BuildProgramDescElementwiseAdd(),
+                         elementwise_add_count, quant_count, dequant_count,
+                         added_nodes_count, 2.0f * 127, false, true);
+}
+
 }  // namespace
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc
index 130ba44ff64c77e9a968200f58719b123b6f4b76..bc24c10d9d0ae545d0dc71160d66e02a9fdbd730 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc
@@ -75,7 +75,7 @@ void CPUQuantizeSquashPass::DequantQuantSquash(
         BOOST_GET_CONST(float, quant_op->Op()->GetAttr("Scale"));
     PADDLE_ENFORCE_NE(
         nodes_keep_counter->find(dequant_out), nodes_keep_counter->end(),
-        platform::errors::NotFound("The dequant output node is not found"));
+        platform::errors::NotFound("The dequant output node is not found."));
 
     // check if dequantize op should be kept or removed, decrease the counter
     bool keep_dequant = (*nodes_keep_counter)[dequant_out]-- > 1;
@@ -153,8 +153,9 @@ void CPUQuantizeSquashPass::OpRequantSquash(Graph* graph) const {
 
       PADDLE_ENFORCE_NE(
           any_op_output_name.empty(), true,
-          platform::errors::NotFound("Operator before requantize operator "
-                                     "should have requantize input as output"));
+          platform::errors::NotFound("Operator before requantize operator(%s) "
+                                     "should have requantize input as output.",
+                                     requant_in->Name()));
 
       float requant_scale_out =
           BOOST_GET_CONST(float, requant_op->Op()->GetAttr("Scale_out"));
@@ -195,10 +196,11 @@ void CPUQuantizeSquashPass::RequantOpSquash(Graph* graph) const {
         for (auto input_name : any_op->Op()->Input(name))
           if (input_name == requant_out->Name()) any_op_input_name = name;
 
-      PADDLE_ENFORCE_NE(
-          any_op_input_name.empty(), true,
-          platform::errors::NotFound("The operator after requantize operator "
-                                     "should have requantize output as input"));
+      PADDLE_ENFORCE_NE(any_op_input_name.empty(), true,
+                        platform::errors::NotFound(
+                            "The operator after requantize operator(%s) "
+                            "should have requantize output as input.",
+                            requant_out->Name()));
       float requant_scale_in =
           boost::get<float>(requant_op->Op()->GetAttr("Scale_in"));
 
@@ -206,11 +208,14 @@ void CPUQuantizeSquashPass::RequantOpSquash(Graph* graph) const {
       if (any_op->Op()->Type() == "matmul")
         scale_name = any_op_input_name == "X" ? "Scale_x" : "Scale_y";
 
-      PADDLE_ENFORCE_EQ(requant_op->Op()->GetAttrIfExists<float>("Scale_out"),
-                        any_op->Op()->GetAttrIfExists<float>(scale_name),
-                        platform::errors::InvalidArgument(
-                            "The operator after requantize should have input "
-                            "scale equal to requantize output scale"));
+      PADDLE_ENFORCE_EQ(
+          requant_op->Op()->GetAttrIfExists<float>("Scale_out"),
+          any_op->Op()->GetAttrIfExists<float>(scale_name),
+          platform::errors::InvalidArgument(
+              "The operator after requantize should have input "
+              "scale(%f) equal to requantize output scale(%f).",
+              any_op->Op()->GetAttrIfExists<float>(scale_name),
+              requant_op->Op()->GetAttrIfExists<float>("Scale_out")));
       any_op->Op()->SetAttr(scale_name, requant_scale_in);
       any_op->Op()->SetInput(any_op_input_name,
                              std::vector<std::string>({requant_in->Name()}));
@@ -286,8 +291,9 @@ void CPUQuantizeSquashPass::MultipleQuantizeSquash(Graph* graph) const {
     auto* first_quant_out = first_quant_op->outputs[0];
     float scale = first_quant_op->Op()->GetAttrIfExists<float>("Scale");
 
-    PADDLE_ENFORCE_NE(scale, 0, platform::errors::InvalidArgument(
-                                    "Quantize scale should not be equal 0"));
+    PADDLE_ENFORCE_NE(scale, 0,
+                      platform::errors::InvalidArgument(
+                          "Quantize scale(%f) should not be equal 0.", scale));
 
     for (int iter = prev_out->outputs.size() - 1; iter >= 0; iter--) {
       auto quant_op = prev_out->outputs[iter];
@@ -304,8 +310,9 @@ void CPUQuantizeSquashPass::MultipleQuantizeSquash(Graph* graph) const {
 
         PADDLE_ENFORCE_NE(
             last_op_input_name.empty(), true,
-            platform::errors::NotFound("Operator after quantize operator "
-                                       "should has quantize output as input"));
+            platform::errors::NotFound("Operator after quantize operator(%s) "
+                                       "should has quantize output as input.",
+                                       quant_out->Name()));
         last_op->Op()->SetInput(
             last_op_input_name,
             std::vector<std::string>({first_quant_out->Name()}));
@@ -345,10 +352,12 @@ void CPUQuantizeSquashPass::DequantScaleSquash(Graph* graph) const {
 
       PADDLE_ENFORCE_GT(dequant_scale, 0.0f,
                         platform::errors::InvalidArgument(
-                            "Dequantize scale should have positive value"));
+                            "Dequantize scale(%f) should have positive value.",
+                            dequant_scale));
       PADDLE_ENFORCE_GT(scale_scale, 0.0f,
                         platform::errors::InvalidArgument(
-                            "Scale of scale op should have positive value"));
+                            "Scale(%f) of scale op should have positive value.",
+                            scale_scale));
 
       dequant_op->Op()->SetAttr("Scale", dequant_scale / scale_scale);
       dequant_op->Op()->SetOutput(
@@ -367,8 +376,8 @@ void CPUQuantizeSquashPass::DequantScaleSquash(Graph* graph) const {
 void CPUQuantizeSquashPass::ApplyImpl(ir::Graph* graph) const {
   PADDLE_ENFORCE_NOT_NULL(
       graph,
-      platform::errors::NotFound(
-          "The graph in function CPUQuantizeSquashPass::ApplyImpl is null"));
+      platform::errors::InvalidArgument(
+          "The graph in function CPUQuantizeSquashPass::ApplyImpl is null."));
   FusePassBase::Init("cpu_quantize_squash_pass", graph);
 
   std::unordered_map<const Node*, int> nodes_keep_counter;
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc
index 9b827fdf6fef1788fafd5595a2705e9df1b2e720..37af0274ea8a2046a7c4376f3ffaa1091f3d4b04 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc
@@ -57,7 +57,7 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
     PADDLE_ENFORCE_EQ(inputs.size(), 2UL,
                       platform::errors::InvalidArgument(
                           "The fc inputs should contain input and weights, but "
-                          "now the size of inputs is %d",
+                          "now the size of inputs is %d.",
                           inputs.size()));
     op->SetInput("W", {inputs[1]});
     op->SetOutput("Out", outputs);
diff --git a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc
index e854559ae7a8765da604c2043e8e4e8cedbbcf88..c5965701a53d4312d89f1e09f17840b09f1bd5f5 100644
--- a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc
@@ -19,14 +19,17 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
-#define GET_NODE(id, pattern)                               \
-  PADDLE_ENFORCE(subgraph.count(pattern.RetrieveNode(#id)), \
-                 "pattern has no Node called %s", #id);     \
-  auto* id = subgraph.at(pattern.RetrieveNode(#id));        \
-  PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", #id);
+#define GET_NODE(id, pattern)                                     \
+  PADDLE_ENFORCE_NE(subgraph.count(pattern.RetrieveNode(#id)), 0, \
+                    platform::errors::InvalidArgument(            \
+                        "Pattern has no Node called %s.", #id));  \
+  auto* id = subgraph.at(pattern.RetrieveNode(#id));              \
+  PADDLE_ENFORCE_NOT_NULL(                                        \
+      id, platform::errors::InvalidArgument("Subgraph has no node %s.", #id));
 
 void DepthwiseConvMKLDNNPass::ApplyImpl(ir::Graph* graph) const {
-  PADDLE_ENFORCE(graph);
+  PADDLE_ENFORCE_NOT_NULL(
+      graph, platform::errors::InvalidArgument("Graph cannot be nullptr."));
   FusePassBase::Init("depthwise_conv_mkldnn_pass", graph);
   GraphPatternDetector gpd;
 
diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.cc
index 9ca08ff777ba7d6032bffb6c358c030b1cd1366c..7bd94bf55ea21f3be00895c494553863b255543b 100644
--- a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.cc
@@ -66,17 +66,17 @@ void MKLDNNInPlacePass::ApplyImpl(ir::Graph* graph) const {
       return;
     }
 
-    VLOG(3) << "DNNL Inplace op(" << current_op->id() << ") "
+    VLOG(3) << "oneDNN Inplace op(" << current_op->id() << ") "
             << "Curr Node In: " << current_op_in->Name()
             << " Curr Node out: " << current_op_out->Name();
 
-    VLOG(3) << "DNNL Inplace next op(" << next_op->id() << ") "
+    VLOG(3) << "oneDNN Inplace next op(" << next_op->id() << ") "
             << " next Node out: " << next_op_out->Name();
 
     auto inputs = current_op->Op()->Inputs();
     auto outputs = current_op->Op()->Outputs();
     auto in_to_outs = infer_inplace(false);  // strictly no CUDA for MKL-DNN
-    VLOG(3) << "DNNL InplaceInferer op(" << current_op->id() << ") "
+    VLOG(3) << "oneDNN InplaceInferer op(" << current_op->id() << ") "
             << in_to_outs.begin()->first << ": "
             << inputs[in_to_outs.begin()->first][0] << " "
             << in_to_outs.begin()->second << ": "
@@ -85,7 +85,7 @@ void MKLDNNInPlacePass::ApplyImpl(ir::Graph* graph) const {
     auto inplace_input_vec = inputs[in_to_outs.begin()->first];
     if (std::find(inplace_input_vec.begin(), inplace_input_vec.end(),
                   current_op_in->Name()) == inplace_input_vec.end()) {
-      VLOG(3) << "DNNL in-place pass SKIP pattern ";
+      VLOG(3) << "oneDNN in-place pass SKIP pattern ";
       return;
     }
 
@@ -93,7 +93,7 @@ void MKLDNNInPlacePass::ApplyImpl(ir::Graph* graph) const {
     // is used anywhere else apart from inplaced op
     auto input_consumers = current_op_in->outputs;
     if (input_consumers.size() > 1) {
-      VLOG(3) << "DNNL in-place pass FAIL: in-place var cannot "
+      VLOG(3) << "oneDNN in-place pass FAIL: in-place var cannot "
                  "be an input to multiple operators";
       return;
     } else {
@@ -106,7 +106,7 @@ void MKLDNNInPlacePass::ApplyImpl(ir::Graph* graph) const {
           if ((n->id() != current_op_in->id()) &&
               (n->id() != next_op_out->id()) &&
               (n->Name() == current_op_in->Name())) {
-            VLOG(3) << "DNNL in-place pass FAIL var used in diffrent part of "
+            VLOG(3) << "oneDNN in-place pass FAIL var used in diffrent part of "
                        "graph ";
             return;
           }
@@ -122,7 +122,7 @@ void MKLDNNInPlacePass::ApplyImpl(ir::Graph* graph) const {
       original_output_names[current_op->Name() + current_op_in->Name()] =
           current_op_out->Name();
     } else {
-      VLOG(3) << "DNNL Inplace: Current op already inplaced! ";
+      VLOG(3) << "oneDNN Inplace: Current op already inplaced! ";
     }
 
     // It may be that next op is reusing some of vars, we need to
@@ -133,7 +133,7 @@ void MKLDNNInPlacePass::ApplyImpl(ir::Graph* graph) const {
       if ((n_op_infer_inplace == nullptr)) {
         for (auto& m : n->outputs) {
           if (m->Name() == current_op_in->Name()) {
-            VLOG(3) << "DNNL in-place pass FAIL: in-place var cannot "
+            VLOG(3) << "oneDNN in-place pass FAIL: in-place var cannot "
                        "be an output to non-inplaced next op";
             return;
           }
@@ -173,7 +173,7 @@ void MKLDNNInPlacePass::ApplyImpl(ir::Graph* graph) const {
           (std::find(next_op_inplace_inputs.begin(),
                      next_op_inplace_inputs.end(),
                      original_name) != next_op_inplace_inputs.end())) {
-        VLOG(3) << "DNNL InPlace: Next Op is in-placed , updating its "
+        VLOG(3) << "oneDNN InPlace: Next Op is in-placed , updating its "
                    "input "
                    "and output var!";
         next_op->Op()->SetOutput(
@@ -190,10 +190,24 @@ void MKLDNNInPlacePass::ApplyImpl(ir::Graph* graph) const {
     next_op->Op()->RenameInput(original_name, current_op_out->Name());
 
     found_inplace_count++;
-    VLOG(3) << "DNNL InPlace applied!";
+    VLOG(3) << "oneDNN InPlace applied!";
   };
 
-  gpd(graph, handler);
+  // TODO(jczaja): inplace pass does not influece ops inside block ops
+  auto should_inplace = [&](Graph* g) {
+    std::unordered_set<std::string> unwanted_ops(
+        {"conditional_block", "While", "while_loop"});
+    for (auto& node : g->Nodes()) {
+      if (node->IsOp() &&
+          unwanted_ops.find(node->Name()) != unwanted_ops.end()) {
+        VLOG(3) << "oneDNN InPlace FAILED: unsupported op: " << node->Name();
+        return false;
+      }
+    }
+    return true;
+  };
+
+  if (should_inplace(graph)) gpd(graph, handler);
 }
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc
index 0d720e828b6d02aba253f5d52e8101ca4e7efb89..6c87e437caa1b159c889a68b4d6f5b1790217ca1 100644
--- a/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc
@@ -46,12 +46,15 @@ void ScaleMatmulFusePass::ApplyImpl(ir::Graph* graph) const {
     if (scale_op->Op()->GetAttrIfExists<float>("bias") == 0.0) {
       auto matmul_alpha = matmul_op->Op()->GetAttrIfExists<float>("alpha");
       auto scale_scale = scale_op->Op()->GetAttrIfExists<float>("scale");
-      PADDLE_ENFORCE_GT(matmul_alpha, 0.0f,
-                        platform::errors::InvalidArgument(
-                            "Alpha of matmul op should have positive value"));
+      PADDLE_ENFORCE_GT(
+          matmul_alpha, 0.0f,
+          platform::errors::InvalidArgument(
+              "Alpha(%f) of matmul op should have positive value.",
+              matmul_alpha));
       PADDLE_ENFORCE_GT(scale_scale, 0.0f,
                         platform::errors::InvalidArgument(
-                            "Scale of scale op should have positive value"));
+                            "Scale(%f) of scale op should have positive value.",
+                            scale_scale));
 
       std::string matmul_op_input_name;
       for (auto name : matmul_op->Op()->InputNames())
@@ -60,8 +63,9 @@ void ScaleMatmulFusePass::ApplyImpl(ir::Graph* graph) const {
 
       PADDLE_ENFORCE_NE(
           matmul_op_input_name.empty(), true,
-          platform::errors::NotFound("Operator after scale operator "
-                                     "should have scale output as input"));
+          platform::errors::NotFound("Operator after scale operator(%s) "
+                                     "should have scale output as input.",
+                                     scale_out->Name()));
       matmul_op->Op()->SetAttr("alpha", matmul_alpha * scale_scale);
       matmul_op->Op()->SetInput(matmul_op_input_name,
                                 std::vector<std::string>({scale_in->Name()}));
diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc
index 8923dfc3232fb59692d34f843bd6dde6b2442734..6d5e4ac27bf8a95186ec16c9eeac5f4cba4dd989 100644
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc
+++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc
@@ -45,7 +45,9 @@ class AllReduceDepsPass : public ir::Pass {
     for (size_t i = 0; i < all_reduce_op_handles.size(); ++i) {
       auto op_handle =
           dynamic_cast<details::NCCLOpHandleBase*>(all_reduce_op_handles[i]);
-      PADDLE_ENFORCE(op_handle, "op_handle must be NCCLOpHandleBase");
+      PADDLE_ENFORCE_NOT_NULL(op_handle,
+                              platform::errors::InvalidArgument(
+                                  "Op handle must be NCCLOpHandleBase."));
       op_handle->SetRunEnv(i, use_hierarchical_allreduce);
     }
 #endif
@@ -95,7 +97,9 @@ class AllReduceDepsPass : public ir::Pass {
         }
       }
 
-      PADDLE_ENFORCE_NE(next_ready_ops.size(), 0, "There maybe have a cycle.");
+      PADDLE_ENFORCE_NE(
+          next_ready_ops.size(), 0,
+          platform::errors::InvalidArgument("There may be a cycle."));
       ready_ops.clear();
       std::swap(ready_ops, next_ready_ops);
       GetSortedAllReduceOps(ready_ops, &all_reduce_op_handles);
@@ -122,18 +126,25 @@ class AllReduceDepsPass : public ir::Pass {
     // NOTE(zcd): For distributed training, it is important to keep the order of
     // allReduce on each node consistent. Otherwise, hang may occur.
     // Sort the current_all_reduce_op_handles according to the name of input.
-    sort(current_all_reduce_op_handles.begin(),
-         current_all_reduce_op_handles.end(),
-         [](const details::OpHandleBase* left,
-            const details::OpHandleBase* right) -> bool {
-           auto left_in_vars =
-               details::DynamicCast<details::VarHandle>(left->Inputs());
-           auto right_in_vars =
-               details::DynamicCast<details::VarHandle>(right->Inputs());
-           PADDLE_ENFORCE_GT(left_in_vars.size(), 0);
-           PADDLE_ENFORCE_GT(right_in_vars.size(), 0);
-           return left_in_vars[0]->Name() > right_in_vars[0]->Name();
-         });
+    sort(
+        current_all_reduce_op_handles.begin(),
+        current_all_reduce_op_handles.end(),
+        [](const details::OpHandleBase* left,
+           const details::OpHandleBase* right) -> bool {
+          auto left_in_vars =
+              details::DynamicCast<details::VarHandle>(left->Inputs());
+          auto right_in_vars =
+              details::DynamicCast<details::VarHandle>(right->Inputs());
+          PADDLE_ENFORCE_GT(left_in_vars.size(), 0,
+                            platform::errors::InvalidArgument(
+                                "OpHandle(%s) inputs size must greater than 0.",
+                                left->Name()));
+          PADDLE_ENFORCE_GT(right_in_vars.size(), 0,
+                            platform::errors::InvalidArgument(
+                                "OpHandle(%s) inputs size must greater than 0.",
+                                right->Name()));
+          return left_in_vars[0]->Name() > right_in_vars[0]->Name();
+        });
 
     all_reduce_op_handles->insert(all_reduce_op_handles->end(),
                                   current_all_reduce_op_handles.begin(),
@@ -170,7 +181,10 @@ class AllReduceDepsPass : public ir::Pass {
           break;
         }
       }
-      PADDLE_ENFORCE(find_valid_input, "Doesn't find valid input.");
+      PADDLE_ENFORCE_EQ(
+          find_valid_input, true,
+          platform::errors::NotFound(
+              "In OpHandle(%s) Doesn't find valid input.", op->Name()));
     }
     VLOG(10) << out2.str();
     if (grads_of_stale_program != all_reduce_op_handles.size()) {
diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/backward_optimizer_op_deps_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/backward_optimizer_op_deps_pass.cc
index 782c51a032c039f87c83c61a5db29e1f3804a184..2aae14fa33391dc251856ab578a37f50d4ac0ad5 100644
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/backward_optimizer_op_deps_pass.cc
+++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/backward_optimizer_op_deps_pass.cc
@@ -179,9 +179,10 @@ class BackWardOpDepsPass : public ir::Pass {
     // Currently, we assume that once gradient is generated, it can be
     // broadcast, and each gradient is only broadcast once.
     auto backward_vars = details::GetOpRoleVarsOrEmpty(op_desc);
-    PADDLE_ENFORCE_EQ(node->IsWrappedBy<details::OpHandleBase>(), true,
-                      platform::errors::InvalidArgument(
-                          "Node must be wrapped by OpHandleBase"));
+    PADDLE_ENFORCE_EQ(
+        node->IsWrappedBy<details::OpHandleBase>(), true,
+        platform::errors::InvalidArgument(
+            "Node(%s) must be wrapped by OpHandleBase.", node->Name()));
 
     backward_op_handles->emplace_back(&node->Wrapper<details::OpHandleBase>());
 
diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc
index 86fbbaf7720be52c0c0ab1c5120681a997db58ad..81c98ecf0c0b680a674807dc17d807eea1ca2950 100644
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc
+++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc
@@ -64,9 +64,10 @@ class FuseAllReduceOpPass : public ir::Pass {
     PADDLE_ENFORCE_EQ(
         all_reduce_ops.size(), grads.size(),
         platform::errors::Unimplemented(
-            "The number of all_reduce OpHandle is not equal to the "
-            "number of grads. Maybe some gradients are sparse type, "
-            "it is not supported currently."));
+            "The number of all_reduce OpHandle(%d) is not equal to the "
+            "number of grads(%d). Maybe some gradients are sparse type, "
+            "it is not supported currently.",
+            all_reduce_ops.size(), grads.size()));
 
     auto &group_params_grads = graph->Get<details::GroupParamsAndGrads>(
         details::kGroupParamsAndDenseGrads);
@@ -79,7 +80,10 @@ class FuseAllReduceOpPass : public ir::Pass {
 
     for (auto &group_p_g : group_params_grads) {
       size_t group_size = group_p_g.size();
-      PADDLE_ENFORCE_GT(group_size, static_cast<size_t>(0));
+      PADDLE_ENFORCE_GT(
+          group_size, static_cast<size_t>(0),
+          platform::errors::InvalidArgument(
+              "Parameter and Parameter@grad in one group, must not be empty."));
       std::vector<ir::Node *> group_all_reduce_ops;
       group_all_reduce_ops.reserve(group_size);
       for (auto &p_g : group_p_g) {
@@ -103,26 +107,40 @@ class FuseAllReduceOpPass : public ir::Pass {
     all_reduce_ops.reserve(grads.size());
     for (auto &node : result.Nodes()) {
       if (node->IsOp()) {
-        PADDLE_ENFORCE(node->IsWrappedBy<details::OpHandleBase>());
+        PADDLE_ENFORCE_EQ(
+            node->IsWrappedBy<details::OpHandleBase>(), true,
+            platform::errors::InvalidArgument(
+                "Op Node(%s) should Wrapped by OpHandleBase.", node->Name()));
         auto *all_reduce_op_handle = dynamic_cast<details::AllReduceOpHandle *>(
             &node->Wrapper<details::OpHandleBase>());
         if (all_reduce_op_handle) {
 #if defined(PADDLE_WITH_DGC)
           PADDLE_ENFORCE_NE(
               all_reduce_op_handle->Name(), "sparse_all_reduce",
-              "DGC doesn't support fuse for now, if you want to use DGC "
-              "you need set strategy.fuse_all_reduce_ops = False.");
+              platform::errors::InvalidArgument(
+                  "DGC doesn't support fuse for now, if you want to use DGC "
+                  "you need set strategy.fuse_all_reduce_ops = False."));
 #endif
           auto inputs = details::DynamicCast<details::VarHandle>(
               all_reduce_op_handle->Inputs());
-          PADDLE_ENFORCE_EQ(inputs.size(), num_place);
+          PADDLE_ENFORCE_EQ(inputs.size(), num_place,
+                            platform::errors::InvalidArgument(
+                                "The input size(%d) of all reduce op must "
+                                "equal to place cnt(%d)!",
+                                inputs.size(), num_place));
           // The inputs' name should be the same.
           auto &grad_name = inputs[0]->name();
           for (size_t i = 1; i < inputs.size(); ++i) {
-            PADDLE_ENFORCE_EQ(inputs[i]->name(), grad_name,
-                              "The input name should be the same.");
+            PADDLE_ENFORCE_EQ(
+                inputs[i]->name(), grad_name,
+                platform::errors::InvalidArgument(
+                    "The input name should be the same.diff name: %s %s.",
+                    inputs[i]->name(), grad_name));
           }
-          PADDLE_ENFORCE_NE(grads.count(grad_name), static_cast<size_t>(0));
+          PADDLE_ENFORCE_NE(
+              grads.count(grad_name), static_cast<size_t>(0),
+              platform::errors::InvalidArgument(
+                  "Parameter@grad(%s) must in grad set.", grad_name));
           all_reduce_ops.emplace(grad_name, node);
         }
       }
diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_check_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_check_pass.cc
index 8cc33a6ceb9f14d6360f03625a83bee23a577c9f..73f8cd67ee89e8017a6bc15a0931047c8449c9d1 100644
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_check_pass.cc
+++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_check_pass.cc
@@ -24,7 +24,10 @@ namespace ir {
 class SSAGraghBuilderWithChecker : public ir::Pass {
  protected:
   void ApplyImpl(ir::Graph *graph) const override {
-    PADDLE_ENFORCE(IsValidGraph(graph));
+    PADDLE_ENFORCE_EQ(
+        IsValidGraph(graph), true,
+        platform::errors::InvalidArgument(
+            "In SSAGraghBuilderWithChecker, invalid Graph input."));
   }
 
   bool IsValidGraph(const ir::Graph *graph) const {
diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc
index 4fbd8a878a7cf5df99529c8ed8a1d47d9ca40217..fd82d6b10e718e890d2532404cf5b462d9f0df86 100644
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc
@@ -163,7 +163,13 @@ void MultiDevSSAGraphBuilderBase::Init() const {
     nccl_ctxs_ = multi_nccl_ctxs_->DefaultFlatCtx();
   }
 #endif
-  PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
+  PADDLE_ENFORCE_EQ(
+      places_.size(), local_scopes_.size(),
+      platform::errors::InvalidArgument(
+          "Places size and LocalScopes not equal "
+          "Places size(%d), LocalScopes size(%d) "
+          "If use multi devices， Places size must equas to LocalScopes size.",
+          places_.size(), local_scopes_.size()));
 }
 
 void MultiDevSSAGraphBuilderBase::ApplyImpl(ir::Graph *graph) const {
@@ -500,7 +506,11 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(ir::Graph *result,
 
     SetCommunicationContext(op_handle, places_[i]);
     auto &vars = result->Get<details::GraphVars>(details::kGraphVars)[i][og];
-    PADDLE_ENFORCE(!vars.empty());
+    PADDLE_ENFORCE_EQ(vars.empty(), false,
+                      platform::errors::InvalidArgument(
+                          "Can not find Var(%s) in Place[%d] "
+                          "Paddle Can not add AllReduce OP for Var(%s).",
+                          og, i, og));
     auto &prev_grad = vars.back();
     op_handle->AddInput(prev_grad);
     VLOG(10) << "all_reduce_op_handle add input " << prev_grad->DebugString();
@@ -566,7 +576,11 @@ details::VarHandle *MultiDevSSAGraphBuilderBase::CreateReduceOp(
     auto &p = places_[i];
     SetCommunicationContext(op_handle, p);
     auto &vars = result->Get<details::GraphVars>(details::kGraphVars)[i][og];
-    PADDLE_ENFORCE(!vars.empty());
+    PADDLE_ENFORCE_EQ(vars.empty(), false,
+                      platform::errors::InvalidArgument(
+                          "Can not find Var(%s) in Place[%d] "
+                          "Paddle Can not add Reduce OP for Var(%s).",
+                          og, i, og));
     auto &prev_grad = vars.back();
     op_handle->AddInput(prev_grad);
   }
@@ -590,7 +604,11 @@ bool MultiDevSSAGraphBuilderBase::IsScaleLossOp(ir::Node *node) const {
 
 bool MultiDevSSAGraphBuilderBase::IsSparseGradient(
     const std::string &og) const {
-  PADDLE_ENFORCE(all_vars_.count(og) != 0);
+  PADDLE_ENFORCE_NE(all_vars_.count(og), 0,
+                    platform::errors::InvalidArgument(
+                        "Can not find Var(%s) in VarDescs "
+                        "Paddle Can not add Collective OP for Var(%s).",
+                        og, og));
   return all_vars_.at(og)->GetType() == proto::VarType::SELECTED_ROWS;
 }
 
@@ -641,10 +659,20 @@ int BalanceVarSSAGraphBuilder::GetOpDeviceID(ir::Node *node) const {
       std::vector<std::string>,
       node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName()));
 
-  PADDLE_ENFORCE_EQ(param_grad.size(), 2U);
+  PADDLE_ENFORCE_EQ(
+      param_grad.size(), 2U,
+      platform::errors::InvalidArgument(
+          "In Node %s, the size of attribute %s must be 2, include Parameter "
+          "and Parameter@Grad.",
+          node->Name(), OpProtoAndCheckerMaker::OpRoleVarAttrName()));
   int dev_id = GetVarDeviceID(param_grad[1]);
-  PADDLE_ENFORCE_NE(dev_id, -1, "dev_id should not be -1.[%s, %s, %s]",
-                    node->Op()->Type(), param_grad[0], param_grad[1]);
+  PADDLE_ENFORCE_NE(dev_id, -1, platform::errors::NotFound(
+                                    "Can not find Device ID, for NodeName:%s, "
+                                    "NodeType:%s, Param:%s, Param@Grad:%s"
+                                    "For this fault, you can consult the "
+                                    "Paddle technical personnel for answer ",
+                                    node->Name(), node->Op()->Type(),
+                                    param_grad[0], param_grad[1]));
   return dev_id;
 }
 
@@ -654,10 +682,16 @@ size_t BalanceVarSSAGraphBuilder::GetAppropriateDeviceID(
   for (auto var_name : var_names) {
     if (all_vars_.find(var_name) == all_vars_.end()) continue;
     auto var_desc = all_vars_.at(var_name);
-    PADDLE_ENFORCE_NOT_NULL(var_desc);
+    PADDLE_ENFORCE_NOT_NULL(var_desc,
+                            platform::errors::NotFound(
+                                "Can not find Var(%s) in Var Desc.", var_name));
     auto dim = framework::make_ddim(var_desc->GetShape());
     int64_t numel = framework::product(dim);
-    PADDLE_ENFORCE_GT(numel, 0);
+    PADDLE_ENFORCE_GT(numel, 0,
+                      platform::errors::InvalidArgument(
+                          "The numel of Var(%s) must greater than 0"
+                          "Please check your code，about Var(%s) Shape.",
+                          var_name, var_name));
     numel_sum += numel;
   }
 
@@ -736,7 +770,12 @@ int ReduceSSAGraphBuilder::GetOpDeviceID(
       std::vector<std::string>,
       node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName()));
 
-  PADDLE_ENFORCE_EQ(param_grad.size(), 2U);
+  PADDLE_ENFORCE_EQ(
+      param_grad.size(), 2U,
+      platform::errors::InvalidArgument(
+          "In Node %s, The size of attribute %s must be 2, include Parameter "
+          "and Parameter@Grad.",
+          node->Name(), OpProtoAndCheckerMaker::OpRoleVarAttrName()));
   int dev_id = GetVarDeviceID(param_grad[1]);
 
   if (dev_id == -1) {
@@ -798,7 +837,12 @@ std::vector<ir::Node *> ReduceSSAGraphBuilder::SortForReduceMode(
     }
   }
 
-  PADDLE_ENFORCE_EQ(sorted_ops.size(), topo_ops.size());
+  PADDLE_ENFORCE_EQ(sorted_ops.size(), topo_ops.size(),
+                    platform::errors::InvalidArgument(
+                        "Sorted ops calc error!"
+                        "The result for sorted ops size(%d) must be "
+                        "equal to topo ops size(%d).",
+                        sorted_ops.size(), topo_ops.size()));
 
   ResetState();
   return sorted_ops;
@@ -820,14 +864,23 @@ bool DistSSAGraphBuilder::DealWithSpecialOp(ir::Graph *result,
   bool insert_op = false;
   if (OpHaveRole(*node, OpRole::kRPC)) {
     int op_dev_id = CreateRPCOp(result, node);
-    PADDLE_ENFORCE(op_dev_id != -1,
-                   "Can not schedule the RPC operator to the right place.");
+    PADDLE_ENFORCE_NE(op_dev_id, -1, platform::errors::InvalidArgument(
+                                         "Can not schedule the RPC operator to "
+                                         "the right place. NodeName:%s.",
+                                         node->Name()));
     if (node->Op()->Type() == "recv") {
       auto recv_vars_attr =
           BOOST_GET_CONST(std::vector<std::string>,
                           node->Op()->GetNullableAttr(
                               OpProtoAndCheckerMaker::OpRoleVarAttrName()));
-      PADDLE_ENFORCE(recv_vars_attr.size() == 2UL);  // [parameter, gradient]
+      PADDLE_ENFORCE_EQ(
+          recv_vars_attr.size(), 2UL,
+          platform::errors::InvalidArgument(
+              "In Node %s, the size of attribute %s must be 2, include "
+              "Parameter and Parameter@Grad.",
+              node->Name(),
+              OpProtoAndCheckerMaker::OpRoleVarAttrName()));  // [parameter,
+                                                              // gradient]
       if (recv_vars_attr[0].find(".block") == std::string::npos) {
         bcast_var_name_set_[op_dev_id].emplace(recv_vars_attr[0]);
       }
@@ -879,8 +932,9 @@ int DistSSAGraphBuilder::CreateRPCOp(ir::Graph *result, ir::Node *node) const {
   if (node->Op()->Type() == "send") {
     // TODO(paddle-dev): getting the first var is not safe.
     op_dev_id = GetVarDeviceID(node->inputs[0]->Name());
-    PADDLE_ENFORCE(!ir::IsControlDepVar(*node->inputs[0]),
-                   "This hack no longer holds, please fix.");
+    PADDLE_ENFORCE_EQ(ir::IsControlDepVar(*node->inputs[0]), false,
+                      platform::errors::InvalidArgument(
+                          "This hack no longer holds, please fix."));
     // the variable name which contains .block means it was split by
     // split_byref op
     if (strategy_.reduce_ ==
@@ -893,7 +947,12 @@ int DistSSAGraphBuilder::CreateRPCOp(ir::Graph *result, ir::Node *node) const {
       auto send_param_grad = BOOST_GET_CONST(
           std::vector<std::string>,
           node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName()));
-      PADDLE_ENFORCE_EQ(send_param_grad.size(), 2U);
+      PADDLE_ENFORCE_EQ(
+          send_param_grad.size(), 2U,
+          platform::errors::InvalidArgument(
+              "In Node %s, the size of attribute %s must be 2, include "
+              "Parameter and Parameter@Grad.",
+              node->Name(), OpProtoAndCheckerMaker::OpRoleVarAttrName()));
       op_dev_id = GetAppropriateDeviceID({send_param_grad[1]});
       VLOG(10) << "send grad " << input_var_names[0] << " origin "
                << send_param_grad[1] << " place: " << op_dev_id;
@@ -926,9 +985,10 @@ int DistSSAGraphBuilder::CreateRPCOp(ir::Graph *result, ir::Node *node) const {
     op_dev_id = 0;
   }
 
-  PADDLE_ENFORCE(op_dev_id != -1, "can not find the right place for rpc op: %s",
-                 node->Op()->Type());
-
+  PADDLE_ENFORCE_NE(
+      op_dev_id, -1,
+      platform::errors::NotFound("Can not find the right place for rpc op: %s.",
+                                 node->Op()->Type()));
   // Create fetch_barrier op handle to enable output on all devices.
   // **NOTE** fetch_barrier should output variables list same as recv op does.
   if (node->Op()->Type() == "fetch_barrier") {
@@ -956,7 +1016,10 @@ int DistSSAGraphBuilder::CreateRPCOp(ir::Graph *result, ir::Node *node) const {
       int outvar_dev_id = op_dev_id;
       if (node->Op()->Type() == "fetch_barrier") {
         outvar_dev_id = GetVarDeviceID(output->Name());
-        PADDLE_ENFORCE_NE(outvar_dev_id, -1, "output name %s", output->Name());
+        PADDLE_ENFORCE_NE(outvar_dev_id, -1,
+                          platform::errors::NotFound(
+                              "Can not find the right place for the var: %s.",
+                              output->Name()));
       }
       p = places_[outvar_dev_id];
       ir::Node *new_node = nullptr;
@@ -1007,13 +1070,14 @@ int DistSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
   } else {
     LOG(ERROR) << "got unexpected dist op: " << node->Op()->Type();
     PADDLE_THROW(
-        "the distribute training related op should be in [split_byref, "
-        "concat].");
+        platform::errors::Unimplemented("The distribute training related op "
+                                        "should be in [split_byref, concat]."));
   }
 
-  PADDLE_ENFORCE(op_dev_id != -1,
-                 "can not find right place for distributed op: %s",
-                 node->Op()->Type());
+  PADDLE_ENFORCE_NE(op_dev_id, -1,
+                    platform::errors::NotFound(
+                        "Can not find right place for distributed op: %s.",
+                        node->Op()->Type()));
 
   CreateComputationalOp(result, node, op_dev_id);
   return op_dev_id;
diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_print_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_print_pass.cc
index efd549e79d0ef2ff31a3d1253201f1c2656adf84..a080b4bc33c53c376b54ae106c2e8f52e1ee7c86 100644
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_print_pass.cc
+++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_print_pass.cc
@@ -28,7 +28,10 @@ class SSAGraghBuilderWithPrinterPass : public ir::Pass {
   void ApplyImpl(ir::Graph *graph) const override {
     std::unique_ptr<std::ostream> fout(
         new std::ofstream(Get<std::string>(kGraphvizPath)));
-    PADDLE_ENFORCE(fout->good());
+    PADDLE_ENFORCE_EQ(
+        fout->good(), true,
+        platform::errors::Unavailable("Open file fail! kGraphvizPath = %s.",
+                                      Get<std::string>(kGraphvizPath)));
     if (Has("graph_printer")) {
       Get<GraphvizSSAGraphPrinter>("graph_printer").Print(*graph, *fout);
     } else {
diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/sequential_execution_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/sequential_execution_pass.cc
index 7de3b7c6054183d9a9cb80e66bee571f29ed68eb..bcbd1e066cc1fd056f7de018a697fb842ad195eb 100644
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/sequential_execution_pass.cc
+++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/sequential_execution_pass.cc
@@ -54,11 +54,16 @@ class SequentialExecutionPass : public ir::Pass {
       if (!node->IsOp()) continue;
       std::unordered_set<ir::Node *> preceding_ops;
       for (auto *in : node->inputs) {
-        PADDLE_ENFORCE(in->IsVar(),
-                       "Preceding Node of Op Nodes must be Var Node");
+        PADDLE_ENFORCE_EQ(
+            in->IsVar(), true,
+            platform::errors::InvalidArgument(
+                "Preceding Node(%s) of Op Nodes must be Var Node.",
+                in->Name()));
         if (in->inputs.empty()) continue;
-        PADDLE_ENFORCE(in->inputs.size() == 1 && in->inputs[0]->IsOp(),
-                       "Preceding Op Node of Var Node must be unique");
+        PADDLE_ENFORCE_EQ((in->inputs.size() == 1 && in->inputs[0]->IsOp()),
+                          true,
+                          platform::errors::InvalidArgument(
+                              "Preceding Op Node of Var Node must be unique."));
         preceding_ops.insert(in->inputs[0]);
         pending_ops[in->inputs[0]].insert(node);
       }
@@ -72,15 +77,18 @@ class SequentialExecutionPass : public ir::Pass {
       ir::Node *found_node = nullptr;
       for (auto *node : ready_ops) {
         if (IsSameOpDesc(op_desc, node->Op())) {
-          PADDLE_ENFORCE(found_node == nullptr,
-                         "Found multiple op_desc in graph: %s",
-                         op_desc->Type());
+          PADDLE_ENFORCE_EQ(
+              found_node, nullptr,
+              platform::errors::InvalidArgument(
+                  "Found multiple op_desc in graph: %s.", op_desc->Type()));
           found_node = node;
         }
       }
 
-      PADDLE_ENFORCE_NOT_NULL(found_node, "Cannot find op_desc in graph: %s",
-                              op_desc->Type());
+      PADDLE_ENFORCE_NOT_NULL(
+          found_node,
+          platform::errors::NotFound("Cannot find op_desc in graph: %s.",
+                                     op_desc->Type()));
       for (auto *pending_op : pending_ops[found_node]) {
         if (--op_deps.at(pending_op) == 0) {
           ready_ops.insert(pending_op);
diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc
index 85d20a7b9a2902dbe26129649604405a860c61b5..40e01c75bb99157aedccd0692d7410b99393c009 100644
--- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc
@@ -45,13 +45,10 @@ static int BuildFusion(Graph* graph, const std::string& name_scope) {
   // Create pattern.
   MultiHeadMatmulPattern multihead_pattern(pattern, name_scope);
 
-  PDNode* x =
-      pattern->NewNode(patterns::UniqueKey("X"))->assert_var_not_persistable();
-
-  multihead_pattern(x);
+  multihead_pattern();
   // Create New OpDesc
   auto fuse_creater = [&](
-      Node* x, Node* mul0, Node* mul1, Node* mul2, Node* mul0_out,
+      Node* input0, Node* mul0, Node* mul1, Node* mul2, Node* mul0_out,
       Node* mul1_out, Node* mul2_out, Node* eltadd0_b, Node* eltadd1_b,
       Node* eltadd2_b, Node* eltadd_qk_b, Node* reshape2,
       Node* reshape2_qkv_out, Node* scale, Node* scale_out) {
@@ -115,7 +112,7 @@ static int BuildFusion(Graph* graph, const std::string& name_scope) {
   auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                      Graph* g) {
     // GET_IR_NODE_FROM_SUBGRAPH(dropout_out, dropout_out, multihead_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(layer_norm, layer_norm, multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(input0, input0, multihead_pattern);
 
     GET_IR_NODE_FROM_SUBGRAPH(mul0, mul0, multihead_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(mul0_out, mul0_out, multihead_pattern);
@@ -185,7 +182,7 @@ static int BuildFusion(Graph* graph, const std::string& name_scope) {
     GET_IR_NODE_FROM_SUBGRAPH(transpose2_qkv_out, transpose2_qkv_out,
                               multihead_pattern);
 
-    fuse_creater(layer_norm, mul0, mul1, mul2, mul0_out, mul1_out, mul2_out,
+    fuse_creater(input0, mul0, mul1, mul2, mul0_out, mul1_out, mul2_out,
                  eltadd0_b, eltadd1_b, eltadd2_b, eltadd_qk_b, reshape2_0,
                  reshape2_qkv_out, scale, scale_out);
 
@@ -232,12 +229,9 @@ static int BuildFusion(Graph* graph, const std::string& name_scope) {
   return fusion_count;
 }
 
-PDNode* MultiHeadMatmulPattern::operator()(paddle::framework::ir::PDNode* x) {
-  // Create shared nodes.
-  auto* layer_norm = pattern->NewNode(layer_norm_repr());
-
-  auto* layer_norm_out_var = pattern->NewNode(layer_norm_out_repr());
-  layer_norm_out_var->assert_is_op_input("mul");
+PDNode* MultiHeadMatmulPattern::operator()() {
+  auto* input0 = pattern->NewNode(input0_repr());
+  input0->assert_is_op_input("mul");
 
   // First path with scale
   auto* mul0 = pattern->NewNode(mul0_repr())->assert_is_op("mul");
@@ -390,17 +384,15 @@ PDNode* MultiHeadMatmulPattern::operator()(paddle::framework::ir::PDNode* x) {
   transpose2_2_out_var->AsIntermediate()->assert_is_op_input(
       "matmul");  // link to matmul qkv
 
-  // Link all nodes together
-  layer_norm->LinksFrom({x}).LinksTo({layer_norm_out_var});
   // Q path
-  mul0->LinksFrom({layer_norm_out_var, mul0_w_var}).LinksTo({mul0_out_var});
+  mul0->LinksFrom({input0, mul0_w_var}).LinksTo({mul0_out_var});
   eltadd0->LinksFrom({mul0_out_var, eltadd0_b_var}).LinksTo({eltadd0_out_var});
 
   reshape2_0->LinksFrom({eltadd0_out_var}).LinksTo({reshape2_0_out_var});
   transpose2_0->LinksFrom({reshape2_0_out_var}).LinksTo({transpose2_0_out_var});
   scale->LinksFrom({transpose2_0_out_var}).LinksTo({scale_out_var});
   // K path
-  mul1->LinksFrom({layer_norm_out_var, mul1_w_var}).LinksTo({mul1_out_var});
+  mul1->LinksFrom({input0, mul1_w_var}).LinksTo({mul1_out_var});
   eltadd1->LinksFrom({mul1_out_var, eltadd1_b_var}).LinksTo({eltadd1_out_var});
   reshape2_1->LinksFrom({eltadd1_out_var}).LinksTo({reshape2_1_out_var});
   transpose2_1->LinksFrom({reshape2_1_out_var}).LinksTo({transpose2_1_out_var});
@@ -411,7 +403,7 @@ PDNode* MultiHeadMatmulPattern::operator()(paddle::framework::ir::PDNode* x) {
       .LinksTo({eltadd_qk_out_var});
   softmax_qk->LinksFrom({eltadd_qk_out_var}).LinksTo({softmax_qk_out_var});
   // V  path
-  mul2->LinksFrom({layer_norm_out_var, mul2_w_var}).LinksTo({mul2_out_var});
+  mul2->LinksFrom({input0, mul2_w_var}).LinksTo({mul2_out_var});
   eltadd2->LinksFrom({mul2_out_var, eltadd2_b_var}).LinksTo({eltadd2_out_var});
   reshape2_2->LinksFrom({eltadd2_out_var}).LinksTo({reshape2_2_out_var});
   transpose2_2->LinksFrom({reshape2_2_out_var}).LinksTo({transpose2_2_out_var});
@@ -434,13 +426,10 @@ static int BuildFusionV2(Graph* graph, const std::string& name_scope,
   // Create pattern.
   MultiHeadMatmulPattern multihead_pattern(pattern, name_scope);
 
-  PDNode* x =
-      pattern->NewNode(patterns::UniqueKey("X"))->assert_var_not_persistable();
-
-  multihead_pattern(x);
+  multihead_pattern();
   // Create New OpDesc
   auto fuse_creater = [&](
-      Node* layer_norm_out, Node* mul0, Node* mul1, Node* mul2, Node* mul0_out,
+      Node* input0, Node* mul0, Node* mul1, Node* mul2, Node* mul0_out,
       Node* mul1_out, Node* mul2_out, Node* mul0_w, Node* mul1_w, Node* mul2_w,
       Node* eltadd0_b, Node* eltadd1_b, Node* eltadd2_b, Node* eltadd_qk_b,
       Node* reshape2, Node* reshape2_qkv_out, Node* scale, Node* scale_out) {
@@ -471,29 +460,20 @@ static int BuildFusionV2(Graph* graph, const std::string& name_scope,
         framework::make_ddim({wq_tensor->dims()[0], 3, wq_tensor->dims()[1]});
     auto combined_bias_dims = framework::make_ddim({3, bq_tensor->dims()[0]});
 
-    // create a new var in scope
-    VarDesc combined_w_desc(
-        patterns::PDNodeName(name_scope, "multi_head_combined_weight"));
-    combined_w_desc.SetShape({wq_tensor->dims()[0], 3, wq_tensor->dims()[1]});
-    combined_w_desc.SetDataType(wq_tensor->type());
-    combined_w_desc.SetLoDLevel(mul0_w->Var()->GetLoDLevel());
-    combined_w_desc.SetPersistable(true);
-
-    // create a new var in scope
-    VarDesc combined_bias_desc(
-        patterns::PDNodeName(name_scope, "multi_head_combined_bias"));
-    combined_bias_desc.SetShape({3, bq_tensor->dims()[0]});
-    combined_bias_desc.SetDataType(bq_tensor->type());
-    combined_bias_desc.SetLoDLevel(eltadd0_b->Var()->GetLoDLevel());
-    combined_bias_desc.SetPersistable(true);
-
-    auto* combined_w_node = graph->CreateVarNode(&combined_w_desc);
-    auto* combined_w_tensor =
-        scope->Var(combined_w_node->Name())->GetMutable<LoDTensor>();
-
-    combined_w_tensor->Resize(combined_w_dims);
-    auto* combined_w_data =
-        combined_w_tensor->mutable_data<float>(platform::CPUPlace());
+    // reuse the mul0_w and eltadd_0_b nodes for the combined nodes.
+    auto* combined_w_desc = mul0_w->Var();
+    combined_w_desc->SetShape({wq_tensor->dims()[0], 3, wq_tensor->dims()[1]});
+    combined_w_desc->SetPersistable(true);
+
+    auto* combined_bias_desc = eltadd0_b->Var();
+    combined_bias_desc->SetShape({3, bq_tensor->dims()[0]});
+    combined_bias_desc->SetPersistable(true);
+
+    framework::LoDTensor tmp_combined_w_tensor;
+    tmp_combined_w_tensor.Resize(combined_w_dims);
+    auto* tmp_combined_w_data =
+        tmp_combined_w_tensor.mutable_data<float>(platform::CPUPlace());
+
     std::vector<float*> w_vec = {wq_data, wk_data, wv_data};
     int dims_h = combined_w_dims[0], dims_w = combined_w_dims[2];
     // Combine the three fc weights together.
@@ -502,25 +482,38 @@ static int BuildFusionV2(Graph* graph, const std::string& name_scope,
         for (int k = 0; k < dims_w; k++) {
           int out_index = i * (3 * dims_w) + j * dims_w + k;
           int in_index = i * dims_w + k;
-          combined_w_data[out_index] = w_vec[j][in_index];
+          tmp_combined_w_data[out_index] = w_vec[j][in_index];
         }
       }
     }
-    scope->EraseVars({mul0_w->Name(), mul1_w->Name(), mul2_w->Name()});
-    auto* combined_bias_node = graph->CreateVarNode(&combined_bias_desc);
-    auto* combined_bias_tensor =
-        scope->Var(combined_bias_node->Name())->GetMutable<LoDTensor>();
-
-    combined_bias_tensor->Resize(combined_bias_dims);
-    auto* combined_bias_data =
-        combined_bias_tensor->mutable_data<float>(platform::CPUPlace());
+
+    wq_tensor->Resize(combined_w_dims);
+    auto* new_combined_w_data =
+        wq_tensor->mutable_data<float>(platform::CPUPlace());
+    memcpy(new_combined_w_data, tmp_combined_w_data,
+           sizeof(float) * wq_tensor->numel());
+
+    scope->EraseVars({mul1_w->Name(), mul2_w->Name()});
+
+    framework::LoDTensor tmp_combined_bias_tensor;
+    tmp_combined_bias_tensor.Resize(combined_bias_dims);
+    auto* tmp_combined_bias_data =
+        tmp_combined_bias_tensor.mutable_data<float>(platform::CPUPlace());
+
     size_t bias_size = bq_tensor->numel();
-    memcpy(combined_bias_data, bq_data, sizeof(float) * bias_size);
-    memcpy(combined_bias_data + bias_size, bk_data, sizeof(float) * bias_size);
-    memcpy(combined_bias_data + 2 * bias_size, bv_data,
+    memcpy(tmp_combined_bias_data, bq_data, sizeof(float) * bias_size);
+    memcpy(tmp_combined_bias_data + bias_size, bk_data,
+           sizeof(float) * bias_size);
+    memcpy(tmp_combined_bias_data + 2 * bias_size, bv_data,
            sizeof(float) * bias_size);
 
-    scope->EraseVars({eltadd0_b->Name(), eltadd1_b->Name(), eltadd2_b->Name()});
+    bq_tensor->Resize(combined_bias_dims);
+    auto* new_combined_bias_data =
+        bq_tensor->mutable_data<float>(platform::CPUPlace());
+    memcpy(new_combined_bias_data, tmp_combined_bias_data,
+           sizeof(float) * bq_tensor->numel());
+
+    scope->EraseVars({eltadd1_b->Name(), eltadd2_b->Name()});
 
     auto reshape_desc = reshape2->Op();
     int head_number =
@@ -529,9 +522,9 @@ static int BuildFusionV2(Graph* graph, const std::string& name_scope,
     OpDesc multihead_op_desc;
     multihead_op_desc.SetType("multihead_matmul");
 
-    multihead_op_desc.SetInput("Input", {layer_norm_out->Name()});
-    multihead_op_desc.SetInput("W", {combined_w_node->Name()});
-    multihead_op_desc.SetInput("Bias", {combined_bias_node->Name()});
+    multihead_op_desc.SetInput("Input", {input0->Name()});
+    multihead_op_desc.SetInput("W", {mul0_w->Name()});
+    multihead_op_desc.SetInput("Bias", {eltadd0_b->Name()});
     multihead_op_desc.SetInput("BiasQK", {eltadd_qk_b->Name()});
 
     multihead_op_desc.SetOutput("Out", {reshape2_qkv_out->Name()});
@@ -540,9 +533,9 @@ static int BuildFusionV2(Graph* graph, const std::string& name_scope,
 
     auto* multihead = graph->CreateOpNode(&multihead_op_desc);
 
-    IR_NODE_LINK_TO(layer_norm_out, multihead);
-    IR_NODE_LINK_TO(combined_w_node, multihead);
-    IR_NODE_LINK_TO(combined_bias_node, multihead);
+    IR_NODE_LINK_TO(input0, multihead);
+    IR_NODE_LINK_TO(mul0_w, multihead);
+    IR_NODE_LINK_TO(eltadd0_b, multihead);
     IR_NODE_LINK_TO(eltadd_qk_b, multihead);
 
     IR_NODE_LINK_TO(multihead, reshape2_qkv_out);
@@ -552,9 +545,7 @@ static int BuildFusionV2(Graph* graph, const std::string& name_scope,
   auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                      Graph* g) {
     // GET_IR_NODE_FROM_SUBGRAPH(dropout_out, dropout_out, multihead_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(layer_norm, layer_norm, multihead_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(layer_norm_out, layer_norm_out,
-                              multihead_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(input0, input0, multihead_pattern);
 
     GET_IR_NODE_FROM_SUBGRAPH(mul0, mul0, multihead_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(mul0_out, mul0_out, multihead_pattern);
@@ -624,14 +615,13 @@ static int BuildFusionV2(Graph* graph, const std::string& name_scope,
     GET_IR_NODE_FROM_SUBGRAPH(transpose2_qkv_out, transpose2_qkv_out,
                               multihead_pattern);
 
-    fuse_creater(layer_norm_out, mul0, mul1, mul2, mul0_out, mul1_out, mul2_out,
-                 mul0_w, mul1_w, mul2_w, eltadd0_b, eltadd1_b, eltadd2_b,
-                 eltadd_qk_b, reshape2_0, reshape2_qkv_out, scale, scale_out);
+    fuse_creater(input0, mul0, mul1, mul2, mul0_out, mul1_out, mul2_out, mul0_w,
+                 mul1_w, mul2_w, eltadd0_b, eltadd1_b, eltadd2_b, eltadd_qk_b,
+                 reshape2_0, reshape2_qkv_out, scale, scale_out);
 
     std::unordered_set<const Node*> marked_nodes({eltadd0,
                                                   eltadd1,
                                                   eltadd2,
-                                                  eltadd0_b,
                                                   eltadd1_b,
                                                   eltadd2_b,
                                                   eltadd0_out,
@@ -665,7 +655,6 @@ static int BuildFusionV2(Graph* graph, const std::string& name_scope,
                                                   mul0_out,
                                                   mul1_out,
                                                   mul2_out,
-                                                  mul0_w,
                                                   mul1_w,
                                                   mul2_w,
                                                   reshape2_qkv,
diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h
index d6299c39c739d7ef191eebd5f09f56aceaa9b9c7..0afa00fc62aa79c8a63350bc63cfe464999ca0e4 100644
--- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h
+++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h
@@ -29,11 +29,10 @@ struct MultiHeadMatmulPattern : public PatternBase {
   MultiHeadMatmulPattern(PDPattern* pattern, const std::string& name_scope)
       : PatternBase(pattern, name_scope, "multihead_matmul") {}
 
-  PDNode* operator()(PDNode* x);
+  PDNode* operator()();
 
   // declare operator node's name
-  PATTERN_DECL_NODE(layer_norm);
-  PATTERN_DECL_NODE(layer_norm_out);
+  PATTERN_DECL_NODE(input0);
   PATTERN_DECL_NODE(mul0);
   PATTERN_DECL_NODE(mul1);
   PATTERN_DECL_NODE(mul2);
diff --git a/paddle/fluid/framework/lod_tensor_test.cu b/paddle/fluid/framework/lod_tensor_test.cu
index 7d6ba984f6fe0385b81e320c8a5a162210e33e83..7f0f46b1bb362b0b3983c1e61921d5c306e8d15f 100644
--- a/paddle/fluid/framework/lod_tensor_test.cu
+++ b/paddle/fluid/framework/lod_tensor_test.cu
@@ -22,10 +22,7 @@
 #include "paddle/fluid/platform/place.h"
 
 __global__ void test(size_t* a, int size) {
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size;
-       i += blockDim.x * gridDim.x) {
-    a[i] *= 2;
-  }
+  CUDA_KERNEL_LOOP(i, size) { a[i] *= 2; }
 }
 
 TEST(LoD, data) {
diff --git a/paddle/fluid/framework/pipeline_trainer.cc b/paddle/fluid/framework/pipeline_trainer.cc
index 47e962a4825369020535905dab2859fd9be0398b..379892ecfd1161fd5e5003552bc48b1153b2c412 100644
--- a/paddle/fluid/framework/pipeline_trainer.cc
+++ b/paddle/fluid/framework/pipeline_trainer.cc
@@ -23,8 +23,13 @@ namespace framework {
 
 void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc,
                                  Dataset* dataset) {
-  pipeline_num_ = trainer_desc.thread_num();
-  VLOG(3) << "pipeline num: " << pipeline_num_;
+  const auto& section_params = trainer_desc.section_param();
+  num_microbatches_ = section_params.num_microbatches();
+  VLOG(3) << "Number of microbatches per minibatch: " << num_microbatches_;
+  section_num_ = section_params.section_config_size();
+  VLOG(3) << "Number of program sections: " << section_num_;
+  trainer_desc_ = trainer_desc;
+  start_cpu_core_id_ = section_params.start_cpu_core_id();
 
   SetDataset(dataset);
   ParseDumpConfig(trainer_desc);
@@ -32,96 +37,62 @@ void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc,
   const std::vector<paddle::framework::DataFeed*> readers =
       dataset->GetReaders();
   VLOG(3) << "readers num: " << readers.size();
-
-  pipeline_config_ = trainer_desc.section_param();
-  scope_queue_size_ = pipeline_config_.queue_size();
-  sync_steps_ = pipeline_config_.sync_steps();
-  section_num_ = pipeline_config_.section_config_size();
-
-  VLOG(3) << "scope_queue_size: " << scope_queue_size_;
-  VLOG(3) << "section num: " << section_num_;
-  VLOG(3) << "sync_steps: " << sync_steps_;
+  int num_readers = readers.size();
+  PADDLE_ENFORCE_EQ(num_readers, 1,
+                    platform::errors::InvalidArgument(
+                        "Number of dataset readers for pipeline "
+                        "must be 1 now, but the value you give is %d.",
+                        num_readers));
+  auto* reader = readers[0];
+  feed_var_names_ = reader->GetUseSlotAlias();
 
   workers_.resize(section_num_);
-  in_var_names_.resize(section_num_);
-  out_var_names_.resize(section_num_);
-  worker_count_.resize(section_num_);
-  worker_count_mutex_.resize(section_num_);
-  param_need_sync_.reset(new std::vector<std::string>);
-
-  int reader_index = 0;
   for (int i = 0; i < section_num_; ++i) {
-    const auto& section_config = pipeline_config_.section_config(i);
-    int concurrency = section_config.concurrency();
-    VLOG(3) << "the thread num of each pipeline in section " << i
-            << " is: " << concurrency;
-    in_var_names_[i].reset(new std::vector<std::string>(
-        section_config.section_in_var_names().begin(),
-        section_config.section_in_var_names().end()));
-    out_var_names_[i].reset(new std::vector<std::string>(
-        section_config.section_out_var_names().begin(),
-        section_config.section_out_var_names().end()));
-    worker_count_[i].resize(pipeline_num_);
-    worker_count_mutex_[i].resize(pipeline_num_);
-    for (int j = 0; j < pipeline_num_; ++j) {
-      worker_count_[i][j] = new int(concurrency);
-      worker_count_mutex_[i][j].reset(new std::mutex);
-    }
-
+    const auto& section_config = section_params.section_config(i);
     platform::Place place;
-    workers_[i].resize(pipeline_num_);
-    for (int j = 0; j < pipeline_num_; ++j) {
-      workers_[i][j].resize(concurrency);
-
-      switch (section_config.place()) {
-        case SectionConfig::CPUPlace:
-          place = platform::CPUPlace();
-          break;
-        case SectionConfig::CUDAPlace:
-          // Note that one section has at most one GPU place in one pipeline
-          place = platform::CUDAPlace(j);
-          break;
-        case SectionConfig::CUDAPinnedPlace:
-          place = platform::CUDAPinnedPlace();
-          break;
-        default:
-          PADDLE_ENFORCE(false, "Unkown place type in SectionConfig: %d",
-                         section_config.place());
-      }
+    int place_id = section_config.place_id();
+    switch (section_config.place()) {
+      case SectionConfig::CPUPlace:
+        place = platform::CPUPlace();
+        break;
+      case SectionConfig::CUDAPlace:
+        // Note that one section has at most one GPU place in one pipeline
+        PADDLE_ENFORCE_GE(
+            place_id, 0,
+            platform::errors::InvalidArgument(
+                "The place_id value for CUDAPlace shoud be greater "
+                "than or equal to 0, but the value you give is %d.",
+                place_id));
+        place = platform::CUDAPlace(place_id);
+        break;
+      case SectionConfig::CUDAPinnedPlace:
+        place = platform::CUDAPinnedPlace();
+        break;
+      default:
+        PADDLE_ENFORCE_NOT_NULL(nullptr,
+                                platform::errors::InvalidArgument(
+                                    "Unkown place type in SectionConfig: %d",
+                                    section_config.place()));
+    }
+    places_.emplace_back(place);
+    VLOG(3) << "Device worker place: " << place << ", device id: " << place_id
+            << ", section: " << i;
 
-      for (int k = 0; k < concurrency; ++k) {
-        workers_[i][j][k] = DeviceWorkerFactory::CreateDeviceWorker(
-            trainer_desc.device_worker_name());
-        auto this_worker =
-            std::dynamic_pointer_cast<paddle::framework::SectionWorker>(
-                workers_[i][j][k]);
-        this_worker->SetSectionIndex(i);
-        this_worker->SetDeviceIndex(j);
-        this_worker->SetThreadIndex(k);
-        this_worker->SetSectionNum(section_num_);
-        this_worker->SetPipelineNum(pipeline_num_);
-        if (i == 0) {
-          this_worker->SetDataFeed(readers[reader_index++]);
-          this_worker->SetReaderPlace(place);
-        }
-        if (i == section_num_ - 1) {
-          this_worker->SetNeedDumpField(need_dump_field_);
-          this_worker->SetNeedDumpParam(need_dump_param_);
-          this_worker->SetDumpFieldVector(dump_fields_);
-          this_worker->SetDumpParamVector(dump_param_);
-        }
-        this_worker->SetPlace(place);
-        this_worker->Initialize(trainer_desc);
-        this_worker->InitRandomDumpConfig(trainer_desc);
-      }
+    workers_[i] = DeviceWorkerFactory::CreateDeviceWorker(
+        trainer_desc.device_worker_name());
+    auto this_worker =
+        std::dynamic_pointer_cast<paddle::framework::SectionWorker>(
+            workers_[i]);
+    if (i == 0) {
+      // we only set reader for the first section
+      this_worker->SetDataFeed(reader);
+      this_worker->SetReaderPlace(place);
     }
-  }
-  param_need_sync_.reset(
-      new std::vector<std::string>(pipeline_config_.param_need_sync().begin(),
-                                   pipeline_config_.param_need_sync().end()));
-  VLOG(3) << "param_need_sync_ have: ";
-  for (const std::string& name : *param_need_sync_) {
-    VLOG(3) << name;
+    this_worker->SetThreadIndex(i);
+    this_worker->SetSectionIndex(i);
+    this_worker->SetPlace(place);
+    this_worker->Initialize(trainer_desc);
+    this_worker->SetMicrobatchNum(num_microbatches_);
   }
   // set debug here
   SetDebug(trainer_desc.debug());
@@ -140,13 +111,7 @@ std::string PipelineTrainer::GetDumpPath(int tid) {
 
 void PipelineTrainer::InitDumpEnv() {
   queue_ = paddle::framework::MakeChannel<std::string>();
-  // Only set dump channel on the last section
-  for (int j = 0; j < pipeline_num_; ++j) {
-    for (size_t k = 0; k < workers_[section_num_ - 1][j].size(); ++k) {
-      workers_[section_num_ - 1][j][k]->SetChannelWriter(queue_.get());
-    }
-  }
-  // TODO(hutuxian): should make it as a config
+  // TODO(sandyhouse): should make it as a config
   dump_thread_num_ = 1;
   for (int i = 0; i < dump_thread_num_; i++) {
     dump_thread_.push_back(
@@ -154,150 +119,105 @@ void PipelineTrainer::InitDumpEnv() {
   }
 }
 
-void PipelineTrainer::InitFirstScopeQueue(ScopeQueue* scope_queue,
-                                          int pipeline_id,
-                                          const ProgramDesc& main_program,
-                                          const Scope& root_scope) {
-  for (int i = 0; i < scope_queue_size_; ++i) {
-    Scope* scope = &pipeline_scopes_[pipeline_id]->NewScope();
-    for (auto& var : main_program.Block(0).AllVars()) {
-      if (!var->Persistable()) {
-        auto* ptr = scope->Var(var->Name());
-        InitializeVariable(ptr, var->GetType());
+void PipelineTrainer::CopyParameters(int section_id, int microbatch_id,
+                                     const ProgramDesc& program,
+                                     const platform::Place& place) {
+  auto& global_block = program.Block(0);
+  for (auto& var : global_block.AllVars()) {
+    int is_feed_var =
+        std::count(feed_var_names_.begin(), feed_var_names_.end(), var->Name());
+    if ((var->Persistable() || is_feed_var) && microbatch_id == 0) {
+      if (is_feed_var) {
+        auto* new_ptr = minibatch_scopes_[section_id]->Var(var->Name());
+        VLOG(3) << "data name: " << var->Name() << ", ptr: " << new_ptr;
+        InitializeVariable(new_ptr, var->GetType());
       } else {
-        if (section_num_ == 1) {  // Means only one section and it must be
-                                  // CUDAPlace, so copy all persistable vars to
-                                  // pipeline scope
-          const LoDTensor& root_tensor =
-              root_scope.FindVar(var->Name())->Get<LoDTensor>();
-          LoDTensor* gpu_tensor = pipeline_scopes_[pipeline_id]
-                                      ->Var(var->Name())
-                                      ->GetMutable<LoDTensor>();
-          platform::Place place = platform::CUDAPlace(pipeline_id);
-          TensorCopy(*static_cast<const Tensor*>(&root_tensor), place,
-                     static_cast<Tensor*>(gpu_tensor));
-        }
+        auto* ptr = root_scope_->FindVar(var->Name());
+        auto* new_ptr = minibatch_scopes_[section_id]->Var(var->Name());
+        VLOG(3) << "Create persistable var " << var->Name() << " for minibatch "
+                << section_id << ", which pointer is " << new_ptr;
+        InitializeVariable(new_ptr, var->GetType());
+        const LoDTensor& root_tensor = ptr->Get<LoDTensor>();
+        LoDTensor* minibatch_tensor = new_ptr->GetMutable<LoDTensor>();
+        TensorCopy(*static_cast<const Tensor*>(&root_tensor), place,
+                   static_cast<Tensor*>(minibatch_tensor));
       }
+    } else if (!var->Persistable() && !is_feed_var) {
+      auto* ptr =
+          microbatch_scopes_[section_id][microbatch_id]->Var(var->Name());
+      VLOG(3) << "Create variable " << var->Name() << " for section "
+              << section_id << " microbatch " << microbatch_id
+              << ", which pointer is " << ptr;
+      InitializeVariable(ptr, var->GetType());
     }
-    scope_queue->Send(scope);
   }
 }
 
-void PipelineTrainer::CopyParameters(const Scope& root_scope, int pipeline_id) {
-  for (const std::string& name : *param_need_sync_) {
-    const LoDTensor& root_tensor = root_scope.FindVar(name)->Get<LoDTensor>();
-
-    // TODO(hutxian): check a new var of the same name is created in
-    // pipeline_scope
-    LoDTensor* gpu_tensor =
-        pipeline_scopes_[pipeline_id]->Var(name)->GetMutable<LoDTensor>();
-    platform::Place place = platform::CUDAPlace(pipeline_id);
-    TensorCopy(*static_cast<const Tensor*>(&root_tensor), place,
-               static_cast<Tensor*>(gpu_tensor));
+void PipelineTrainer::GetSkipVars(int section_id, const ProgramDesc& program) {
+  auto& global_block = program.Block(0);
+  for (auto& op : global_block.AllOps()) {
+    if (op->Type() != "enqueue") {
+      continue;
+    }
+    auto input_arg_names = op->InputArgumentNames();
+    PADDLE_ENFORCE_EQ(input_arg_names.size(), 1,
+                      platform::errors::InvalidArgument(
+                          "Number of input arguments for enqueue op must be 1, "
+                          "but the value is %d.",
+                          input_arg_names.size()));
+    std::string input_arg_name = input_arg_names[0];
+    if (input_arg_name.rfind("@GRAD") != input_arg_name.size() - 5) {
+      skip_vars_[section_id].emplace_back(input_arg_name);
+      VLOG(3) << "add skip var name: " << input_arg_name;
+    }
   }
 }
 
 void PipelineTrainer::InitTrainerEnv(const ProgramDesc& main_program,
                                      const platform::Place& place) {
-  PADDLE_ENFORCE(root_scope_, "Null root_scope pointer");
-  SectionWorker::cpu_id_.store(pipeline_config_.start_cpu_core_id());
-  scope_queues_.resize(section_num_);
-  pipeline_scopes_.resize(pipeline_num_);
-  for (auto& var : main_program.Block(0).AllVars()) {
-    if (var->Persistable()) {
-      persistable_vars_.push_back(var->Name());
-    }
-  }
+  PADDLE_ENFORCE_NOT_NULL(root_scope_,
+                          platform::errors::InvalidArgument(
+                              "root_scope pointer can not be nullptr"));
+  auto start_cpu_id = trainer_desc_.section_param().start_cpu_core_id();
+  SectionWorker::cpu_id_.store(start_cpu_id);
+  minibatch_scopes_.resize(section_num_);
+  microbatch_scopes_.resize(section_num_);
+  skip_vars_.resize(section_num_);
 
   VLOG(3) << "Init ScopeQueues and create all scopes";
   for (int i = 0; i < section_num_; ++i) {
-    for (int j = 0; j < pipeline_num_; ++j) {
-      scope_queues_[i].emplace_back(new ScopeQueue(scope_queue_size_));
-      if (i == 0) {
-        pipeline_scopes_[j] = &root_scope_->NewScope();
-        CopyParameters(*root_scope_, j);
-        InitFirstScopeQueue(scope_queues_[0].back().get(), j, main_program,
-                            *root_scope_);
-      }
+    minibatch_scopes_[i] = &root_scope_->NewScope();
+    std::shared_ptr<framework::ProgramDesc> program;
+    program.reset(new ProgramDesc(
+        trainer_desc_.section_param().section_config(i).program_desc()));
+    microbatch_scopes_[i].resize(num_microbatches_);
+    for (int j = 0; j < num_microbatches_; ++j) {
+      microbatch_scopes_[i][j] = &minibatch_scopes_[i]->NewScope();
+      CopyParameters(i, j, *program, places_[i]);
     }
+    GetSkipVars(i, *program);
   }
 
   for (int i = 0; i < section_num_; ++i) {
-    for (int j = 0; j < pipeline_num_; ++j) {
-      for (size_t k = 0; k < workers_[i][j].size(); ++k) {
-        auto this_worker =
-            std::dynamic_pointer_cast<paddle::framework::SectionWorker>(
-                workers_[i][j][k]);
-        this_worker->SetRootScope(root_scope_);
-        this_worker->SetCountMutex(worker_count_mutex_[i][j].get());
-        this_worker->SetWorkerCount(worker_count_[i][j]);
-        this_worker->SetScopeQueue(scope_queues_[i][j].get(),
-                                   (i == section_num_ - 1)
-                                       ? scope_queues_[0][j].get()
-                                       : scope_queues_[i + 1][j].get());
-        this_worker->SetVarNames(*in_var_names_[i], *out_var_names_[i]);
-        if (i != section_num_ - 1) {
-          // For data copy in adjacent different place
-          this_worker->SetNextSectionPlace(
-              std::dynamic_pointer_cast<paddle::framework::SectionWorker>(
-                  workers_[i + 1][j][0])
-                  ->place());
-        }
-      }
-    }
-  }
-
-  if (pipeline_num_ > 1 && sync_steps_ != -1) {
-    construct_sync_functor();
-  }
-}
-
-void PipelineTrainer::construct_sync_functor() {
-  std::vector<platform::Place> cuda_places;
-  for (int i = 0; i < pipeline_num_; ++i) {
-    cuda_places.emplace_back(platform::CUDAPlace(i));
-  }
-  nccl_ctx_map_.reset(new platform::NCCLContextMap(cuda_places));
-  sync_functors_.resize(pipeline_num_);
-  SyncFunctor::sync_flag_ = 0;
-  SyncFunctor::pipeline_scopes_.resize(0);
-
-  for (int j = 0; j < pipeline_num_; ++j) {
-    SyncFunctor* sync_function = new SyncFunctor(j, pipeline_num_, sync_steps_);
-    sync_function->SetSyncParam(*param_need_sync_);
-    sync_function->SetNcclCtxMap(nccl_ctx_map_.get());
-    SyncFunctor::pipeline_scopes_.push_back(this->pipeline_scopes_[j]);
-    sync_functors_[j].reset(sync_function);
-  }
-  for (int i = section_num_ - 1; i >= 0; --i) {
-    if (SectionConfig::CUDAPlace ==
-        pipeline_config_.section_config(i).place()) {
-      for (int j = 0; j < pipeline_num_; ++j) {
-        for (size_t k = 0; k < workers_[i][j].size(); ++k) {
-          auto this_worker =
-              std::dynamic_pointer_cast<paddle::framework::SectionWorker>(
-                  workers_[i][j][k]);
-          this_worker->SetSyncFunctor(sync_functors_[j].get());
-        }
-      }
-      break;
-    }
+    auto this_worker =
+        std::dynamic_pointer_cast<paddle::framework::SectionWorker>(
+            workers_[i]);
+    this_worker->SetRootScope(root_scope_);
+    this_worker->SetMinibatchScope(minibatch_scopes_[i]);
+    this_worker->SetMicrobatchScopes(microbatch_scopes_[i]);
+    this_worker->SetSkipVars(skip_vars_[i]);
   }
 }
 
 void PipelineTrainer::Run() {
   VLOG(3) << "Going to run";
   for (int i = 0; i < section_num_; ++i) {
-    for (int j = 0; j < pipeline_num_; ++j) {
-      for (size_t k = 0; k < workers_[i][j].size(); ++k) {
-        if (!debug_) {
-          section_threads_.push_back(
-              std::thread(&DeviceWorker::TrainFiles, workers_[i][j][k].get()));
-        } else {
-          section_threads_.push_back(std::thread(
-              &DeviceWorker::TrainFilesWithProfiler, workers_[i][j][k].get()));
-        }
-      }
+    if (!debug_) {
+      section_threads_.push_back(
+          std::thread(&DeviceWorker::TrainFiles, workers_[i].get()));
+    } else {
+      section_threads_.push_back(std::thread(
+          &DeviceWorker::TrainFilesWithProfiler, workers_[i].get()));
     }
   }
 }
@@ -309,18 +229,31 @@ void PipelineTrainer::Finalize() {
   if (need_dump_field_) {
     FinalizeDumpEnv();
   }
-  for (const auto& var : persistable_vars_) {
-    auto* root_tensor = root_scope_->Var(var)->GetMutable<LoDTensor>();
-    // TODO(hutuxian): Add a final all-reduce?
-    const auto& thread_tensor =
-        pipeline_scopes_[0]->FindVar(var)->Get<LoDTensor>();
-    TensorCopySync(thread_tensor, platform::CPUPlace(), root_tensor);
+  VLOG(3) << "copying back parameters. ";
+  for (int i = 0; i < section_num_; ++i) {
+    std::shared_ptr<framework::ProgramDesc> program;
+    program.reset(new ProgramDesc(
+        trainer_desc_.section_param().section_config(i).program_desc()));
+    for (int j = 0; j < num_microbatches_; ++j) {
+      auto& global_block = program->Block(0);
+      for (auto& var : global_block.AllVars()) {
+        if (var->Persistable()) {
+          auto* ptr = root_scope_->FindVar(var->Name());
+          LoDTensor* root_tensor = ptr->GetMutable<LoDTensor>();
+          auto* minibatch_ptr = minibatch_scopes_[i]->Var(var->Name());
+          const LoDTensor& minibatch_tensor = minibatch_ptr->Get<LoDTensor>();
+          TensorCopy(*static_cast<const Tensor*>(&minibatch_tensor), places_[0],
+                     static_cast<Tensor*>(root_tensor));
+          VLOG(4) << "Copy persitable var " << var->Name() << " to root scope";
+        }
+      }
+    }
   }
   root_scope_->DropKids();
 }
 
 Scope* PipelineTrainer::GetWorkerScope(int thread_id) {
-  return pipeline_scopes_[thread_id];
+  return microbatch_scopes_[thread_id][0];
 }
 
 }  // end namespace framework
diff --git a/paddle/fluid/framework/section_worker.cc b/paddle/fluid/framework/section_worker.cc
index df8bd61554e590fb0d83960a0fca63f78229c9a4..03b7afbb8771fadbe07a352497fa69a299928cf7 100644
--- a/paddle/fluid/framework/section_worker.cc
+++ b/paddle/fluid/framework/section_worker.cc
@@ -10,6 +10,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #if defined(PADDLE_WITH_NCCL)
+#include <float.h>
+#include "paddle/fluid/framework/executor_gc_helper.h"
+#include "paddle/fluid/framework/garbage_collector.h"
+#include "paddle/fluid/framework/program_desc.h"
+
 #include "google/protobuf/io/zero_copy_stream_impl.h"
 #include "google/protobuf/message.h"
 #include "google/protobuf/text_format.h"
@@ -25,82 +30,17 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-uint64_t SyncFunctor::sync_flag_ = 0;
-std::vector<Scope*> SyncFunctor::pipeline_scopes_;
-
-SyncFunctor::SyncFunctor(int rank_id, int rank_num, int sync_steps)
-    : rank_id_(rank_id), rank_num_(rank_num), sync_steps_(sync_steps) {
-  PADDLE_ENFORCE(rank_num > 1, "rank_num should larger than 1");
-  counter_ = 0;
-  sync_signal_ = 0;
-  uint8_t* ptr = reinterpret_cast<uint8_t*>(&sync_signal_);
-  for (int i = 0; i < rank_num_; ++i) {
-    ptr[i] = 0xFF;
-  }
-}
-
-int SyncFunctor::operator()(Scope* scope) {
-  ++counter_;
-  if (counter_ < sync_steps_) {
-    return 0;
-  }
-  if (counter_ == sync_steps_) {
-    reinterpret_cast<uint8_t*>(&sync_flag_)[rank_id_] = 0xFF;
-  }
-
-  if (sync_flag_ == sync_signal_) {
-    static std::mutex mutex;
-    if (mutex.try_lock()) {
-      if (sync_flag_ == sync_signal_) {
-        Synchronize();
-        sync_flag_ = 0;
-      }
-      mutex.unlock();
-    }
-  }
-
-  if (sync_flag_ == 0) {
-    counter_ = 0;
-  }
-  return 0;
-}
-
-void SyncFunctor::Synchronize() {
-  for (const std::string& name : *sync_param_) {
-    platform::NCCLGroupGuard guard;
-    for (int i = 0; i < rank_num_; ++i) {
-      const platform::NCCLContext& nccl_ctx = nccl_ctx_map_->at(i);
-      LoDTensor* tensor =
-          pipeline_scopes_[i]->Var(name)->GetMutable<LoDTensor>();
-      // TODO(hutuxian): do not depend on data type explicitly
-      float* data =
-          tensor->mutable_data<float>(nccl_ctx_map_->DevCtx(i)->GetPlace());
-      const int numel = tensor->numel();
-
-      paddle::framework::AttributeMap attrs;
-      attrs.insert({"scale", static_cast<float>(1. / rank_num_)});
-      auto scale_op = framework::OpRegistry::CreateOp("scale", {{"X", {name}}},
-                                                      {{"Out", {name}}}, attrs);
-      scale_op->Run(*(pipeline_scopes_[i]),
-                    nccl_ctx_map_->DevCtx(i)->GetPlace());
-      PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
-          data, data, numel, ncclFloat, ncclSum, nccl_ctx.comm(),
-          dynamic_cast<platform::CUDADeviceContext*>(
-              platform::DeviceContextPool::Instance().Get(
-                  platform::CUDAPlace(i)))
-              ->stream()));
-    }
-  }
-  nccl_ctx_map_->WaitAll();
-}
-
 std::atomic<int> SectionWorker::cpu_id_(0);
+std::mutex SectionWorker::thread_mutex;
+std::condition_variable SectionWorker::thread_condition;
+bool SectionWorker::threads_completed = false;
+uint64_t SectionWorker::batch_id_(0);
+
 void SectionWorker::Initialize(const TrainerDesc& desc) {
   dev_ctx_ = platform::DeviceContextPool::Instance().Get(place_);
-  std::shared_ptr<framework::ProgramDesc> program;
-  program.reset(new ProgramDesc(
+  program_.reset(new ProgramDesc(
       desc.section_param().section_config(section_id_).program_desc()));
-  for (auto& op_desc : program->Block(0).AllOps()) {
+  for (auto& op_desc : program_->Block(0).AllOps()) {
     ops_.push_back(OpRegistry::CreateOp(*op_desc));
   }
 }
@@ -136,314 +76,494 @@ void SectionWorker::AutoSetCPUAffinity(bool reuse) {
       (0 == CPU_ISSET(proc, &mask))) {
     LOG(WARNING) << "Fail to set thread affinity to CPU " << proc;
   }
-  SEC_LOG << "Set " << thread_cpu_id << "th thread affinity to CPU " << proc;
+  VLOG(3) << "Set " << thread_cpu_id << "th thread affinity to CPU " << proc;
 }
 
 void SectionWorker::TrainFiles() {
-  SEC_LOG << "begin section_worker TrainFiles";
+  VLOG(3) << "begin section_worker TrainFiles";
   AutoSetCPUAffinity(true);
 
-  int64_t step_cnt = 0;
-  int64_t accum_num = 0;
-  int batch_size = 0;
-  Scope* scope = nullptr;
-  if (device_reader_ != nullptr) {
-    device_reader_->Start();
-  }
-  while (in_scope_queue_->Receive(&scope)) {
-    if (device_reader_ != nullptr) {
-      device_reader_->AssignFeedVar(*scope);
-      batch_size = device_reader_->Next();
-      if (batch_size <= 0) {
-        break;
-      }
-      SEC_LOG << "read batch size: " << batch_size;
+  int64_t max_memory_size = 0;
+  std::unique_ptr<GarbageCollector> gc;
+  auto unused_vars_ = GetUnusedVars(program_->Block(0), ops_, skip_vars_);
+#ifdef PADDLE_WITH_CUDA
+  if (platform::is_gpu_place(place_)) {
+    if (IsFastEagerDeletionModeEnabled()) {
+      gc.reset(new UnsafeFastGPUGarbageCollector(
+          BOOST_GET_CONST(platform::CUDAPlace, place_), max_memory_size));
     } else {
-      // TODO(hutuxian): Keep batch_size in scope? Or is there a better way to
-      // fetch batch_size? Some variables may not have batch_size.
-      PADDLE_ENFORCE(
-          in_var_names_->size(),
-          "Section without a reader or in variable is not supported by now");
-      const LoDTensor& tensor =
-          scope->FindVar(in_var_names_->at(0))->Get<LoDTensor>();
-      batch_size =
-          tensor.lod().size() ? tensor.lod()[0].size() - 1 : tensor.dims()[0];
-      SEC_LOG << "input batch size: " << batch_size;
+      gc.reset(new DefaultStreamGarbageCollector(
+          BOOST_GET_CONST(platform::CUDAPlace, place_), max_memory_size));
     }
+  } else if (platform::is_cpu_place(place_)) {
+#endif
+    gc.reset(new CPUGarbageCollector(
+        BOOST_GET_CONST(platform::CPUPlace, place_), max_memory_size));
+#ifdef PADDLE_WITH_CUDA
+  }
+#endif
 
-    Scope* exe_scope = scope;
-    if (section_id_ > 0 && platform::is_gpu_place(place_)) {
-      SEC_LOG << "CPU2GPU memory copy";
-
-      if (scope->kids().empty()) {
-        exe_scope = &scope->NewScope();
-      } else {
-        exe_scope = scope->kids().front();
-        PADDLE_ENFORCE(scope->kids().size() == 1, "scope->kids().size(): %zu",
-                       scope->kids().size());
+  if (thread_id_ == 0) {
+    while (true) {
+      // Start a minibatch.
+      for (int i = 0; i < num_microbatches_; ++i) {
+        try {
+          for (auto& op : ops_) {
+            int op_role = op->Attr<int>(std::string("op_role"));
+            // We run op with op_role = kLRSched only for the first microbatch
+            // to avoid increasing the @LR_DECAY_STEP@ multiple times.
+            bool run_first_mbatch =
+                op_role == static_cast<int>(OpRole::kForward) ||
+                op_role == (static_cast<int>(OpRole::kForward) |
+                            static_cast<int>(OpRole::kLoss)) ||
+                op_role == static_cast<int>(OpRole::kLRSched);
+            bool run_others = op_role == static_cast<int>(OpRole::kForward) ||
+                              op_role == (static_cast<int>(OpRole::kForward) |
+                                          static_cast<int>(OpRole::kLoss));
+            if ((i == 0 && run_first_mbatch) || (i != 0 && run_others)) {
+              VLOG(3) << "running an op " << op->Type() << " for " << thread_id_
+                      << " for scope " << i;
+              op->Run(*microbatch_scopes_[i], place_);
+              if (gc) {
+                DeleteUnusedTensors(*microbatch_scopes_[i], op.get(),
+                                    unused_vars_, gc.get());
+              }
+            }
+          }
+        } catch (platform::EOFException&) {
+          std::unique_lock<std::mutex> lk(thread_mutex);
+          threads_completed = true;
+          VLOG(3) << "thread " << thread_id_ << " completed.";
+          VLOG(3) << "called notify all";
+          thread_condition.notify_all();
+          VLOG(0) << "EOF encountered";
+          return;
+        }
+        if (i == 0) {
+          VLOG(3) << "called notify all";
+          std::unique_lock<std::mutex> lk(thread_mutex);
+          batch_id_ += 1;
+          thread_condition.notify_all();
+        }
       }
-
-      for (const std::string& name : *in_var_names_) {
-        const LoDTensor& src_tensor = scope->FindVar(name)->Get<LoDTensor>();
-        if (platform::is_gpu_place(src_tensor.place())) {
-          continue;
+      // backward pass
+      for (int i = 0; i < num_microbatches_; ++i) {
+        for (auto& op : ops_) {
+          int op_role = op->Attr<int>(std::string("op_role"));
+          if (op_role == static_cast<int>(OpRole::kBackward) ||
+              op_role == (static_cast<int>(OpRole::kBackward) |
+                          static_cast<int>(OpRole::kLoss))) {
+            VLOG(3) << "running an op " << op->Type() << " for " << thread_id_
+                    << " for scope " << i;
+            op->Run(*microbatch_scopes_[i], place_);
+            if (gc) {
+              DeleteUnusedTensors(*microbatch_scopes_[i], op.get(),
+                                  unused_vars_, gc.get());
+            }
+          }
         }
-        LoDTensor* gpu_tensor = exe_scope->Var(name)->GetMutable<LoDTensor>();
-        gpu_tensor->set_lod(src_tensor.lod());
-        TensorCopy(*static_cast<const Tensor*>(&src_tensor), place_, *dev_ctx_,
-                   static_cast<Tensor*>(gpu_tensor));
       }
-    }
-
-    SEC_LOG << "begin running ops";
-
-    for (auto& op : ops_) {
-      op->Run(*exe_scope, place_);
-    }
-    exe_scope->DropKids();
-    // Wait for GPU calc finising, as the cudaMemcpy and GPU calc may be in
-    // different streams
-    // No effect when it is a CPUDeviceContext
-    dev_ctx_->Wait();
-
-#ifdef PADDLE_WITH_BOX_PS
-    auto box_ptr = BoxWrapper::GetInstance();
-    auto& metric_list = box_ptr->GetMetricList();
-    for (auto iter = metric_list.begin(); iter != metric_list.end(); iter++) {
-      auto* metric_msg = iter->second;
-      if (box_ptr->Phase() != metric_msg->MetricPhase()) {
-        continue;
+      // update pass
+      for (auto& op : ops_) {
+        int op_role = op->Attr<int>(std::string("op_role"));
+        if (op_role == static_cast<int>(OpRole::kOptimize)) {
+          VLOG(3) << "running an op " << op->Type() << " for " << thread_id_
+                  << " for minibatch scope";
+          op->Run(*microbatch_scopes_[0], place_);
+          if (gc) {
+            DeleteUnusedTensors(*microbatch_scopes_[num_microbatches_ - 1],
+                                op.get(), unused_vars_, gc.get());
+          }
+        }
       }
-      metric_msg->add_data(exe_scope);
+      dev_ctx_->Wait();
     }
-#endif
-    if (section_id_ != section_num_ - 1 && platform::is_gpu_place(place_)) {
-      // FIXME: Temporarily we assume two adjacent sections are in different
-      // places,
-      // and we do data transformation only in sections in GPU place, so the
-      // data is
-      // transform from GPU to CPU
-      // A better way to handle such a data transformation is to record each
-      // place of
-      // joint-out variables, and do transform as required
-
-      SEC_LOG << "GPU2CPU memory copy";
-
-      for (const std::string& name : *out_var_names_) {
-        const LoDTensor& src_tensor =
-            exe_scope->FindVar(name)->Get<LoDTensor>();
-        LoDTensor* dst_tensor = scope->Var(name)->GetMutable<LoDTensor>();
-        dst_tensor->set_lod(src_tensor.lod());
-        TensorCopy(*static_cast<const Tensor*>(&src_tensor),
-                   next_section_place_, *dev_ctx_,
-                   static_cast<Tensor*>(dst_tensor));
+  } else {
+    while (true) {
+      {
+        PADDLE_ENFORCE_LE(
+            local_batch_id_, batch_id_,
+            platform::errors::InvalidArgument(
+                "local_batch_id_ (%d) must be less than or equal to "
+                "batch_id_ (%d)",
+                local_batch_id_, batch_id_));
+        std::unique_lock<std::mutex> lk(thread_mutex);
+        if (local_batch_id_ == batch_id_ && !threads_completed) {
+          thread_condition.wait(lk);
+        }
+        VLOG(3) << "thread " << thread_id_ << " local_batch_id_ "
+                << local_batch_id_ << " batch_id_ " << batch_id_;
+        if (threads_completed) {
+          VLOG(3) << "thread " << thread_id_ << " completed.";
+          lk.unlock();
+          threads_completed = false;
+          return;
+        }
+        lk.unlock();
+        local_batch_id_ += 1;
       }
+      // forward pass:
+      for (int i = 0; i < num_microbatches_; ++i) {
+        for (auto& op : ops_) {
+          int op_role = op->Attr<int>(std::string("op_role"));
+          // We run op with op_role = kLRSched only for the first microbatch
+          // to avoid increasing the @LR_DECAY_STEP@ multiple times.
+          bool run_first_mbatch =
+              op_role == static_cast<int>(OpRole::kForward) ||
+              op_role == (static_cast<int>(OpRole::kForward) |
+                          static_cast<int>(OpRole::kLoss)) ||
+              op_role == static_cast<int>(OpRole::kLRSched);
+          bool run_others = op_role == static_cast<int>(OpRole::kForward) ||
+                            op_role == (static_cast<int>(OpRole::kForward) |
+                                        static_cast<int>(OpRole::kLoss));
+          if ((i == 0 && run_first_mbatch) || (i != 0 && run_others)) {
+            VLOG(3) << "running an op " << op->Type() << " for " << thread_id_
+                    << " for scope " << i;
+            op->Run(*microbatch_scopes_[i], place_);
+            if (gc) {
+              DeleteUnusedTensors(*microbatch_scopes_[i], op.get(),
+                                  unused_vars_, gc.get());
+            }
+          }
+        }
+      }
+      // backward pass
+      for (int i = 0; i < num_microbatches_; ++i) {
+        for (auto& op : ops_) {
+          int op_role = op->Attr<int>(std::string("op_role"));
+          if (op_role == static_cast<int>(OpRole::kBackward) ||
+              op_role == (static_cast<int>(OpRole::kBackward) |
+                          static_cast<int>(OpRole::kLoss))) {
+            VLOG(3) << "running an op " << op->Type() << " for " << thread_id_
+                    << " for scope " << i;
+            op->Run(*microbatch_scopes_[i], place_);
+            if (gc) {
+              DeleteUnusedTensors(*microbatch_scopes_[i], op.get(),
+                                  unused_vars_, gc.get());
+            }
+          }
+        }
+      }
+      // update pass
+      for (auto& op : ops_) {
+        int op_role = op->Attr<int>(std::string("op_role"));
+        if (op_role == static_cast<int>(OpRole::kOptimize)) {
+          VLOG(3) << "running an op " << op->Type() << " for " << thread_id_
+                  << " for minibatch scope";
+          op->Run(*microbatch_scopes_[0], place_);
+          if (gc) {
+            DeleteUnusedTensors(*microbatch_scopes_[num_microbatches_ - 1],
+                                op.get(), unused_vars_, gc.get());
+          }
+        }
+      }
+      dev_ctx_->Wait();
     }
-
-    out_scope_queue_->Send(scope);
-
-    if (sync_func_) {
-      (*sync_func_)(scope);
-    }
-
-    ++step_cnt;
-    accum_num += batch_size;
-  }
-
-  worker_count_mutex_->lock();
-  --(*worker_count_);
-  worker_count_mutex_->unlock();
-
-  if (*worker_count_ <= 0) {
-    while (section_id_ < section_num_ - 1 && out_scope_queue_->Size()) {
-      sleep(1);
-    }
-    out_scope_queue_->Close();
   }
 }
 
 void SectionWorker::TrainFilesWithProfiler() {
-  SEC_LOG << "begin section_worker TrainFiles with profiler";
+  VLOG(3) << "begin section_worker TrainFiles with profiler";
   AutoSetCPUAffinity(true);
 
-  int64_t step_cnt = 0;
-  int64_t accum_num = 0;
-  int batch_size = 0;
-  Scope* scope = nullptr;
-
-  platform::Timer reader_timer;
-  platform::Timer cal_timer;
-  platform::Timer trans_timer;
-  platform::Timer sync_timer;
-  platform::Timer main_timer;
-  platform::Timer outer_timer;
+  platform::Timer batch_timer;
+  platform::Timer timeline;
 
   std::vector<double> op_total_time;
   std::vector<std::string> op_name;
+  std::vector<double> op_max_time;
+  std::vector<double> op_min_time;
+  std::vector<uint64_t> op_count;
   for (auto& op : ops_) {
     op_name.push_back(op->Type());
   }
   op_total_time.resize(ops_.size());
-  for (size_t i = 0; i < op_total_time.size(); ++i) {
-    op_total_time[i] = 0.0;
-  }
-  platform::Timer timeline;
-  if (device_reader_ != nullptr) {
-    device_reader_->Start();
+  op_max_time.resize(ops_.size());
+  op_min_time.resize(ops_.size());
+  for (size_t i = 0; i < op_min_time.size(); ++i) {
+    op_min_time[i] = DBL_MAX;
   }
-
-  bool started = false;
-  while (in_scope_queue_->Receive(&scope)) {
-    if (UNLIKELY(!started)) {
-      outer_timer.Start();
-      started = true;
-    }
-    main_timer.Resume();
-
-    if (device_reader_ != nullptr) {
-      reader_timer.Resume();
-      device_reader_->AssignFeedVar(*scope);
-      batch_size = device_reader_->Next();
-      reader_timer.Pause();
-      if (batch_size <= 0) {
-        break;
-      }
-      SEC_LOG << "read batch size: " << batch_size;
+  op_count.resize(ops_.size());
+
+  int64_t max_memory_size = 0;
+  std::unique_ptr<GarbageCollector> gc;
+  // const std::vector<std::string> keep_vars;
+  auto unused_vars_ = GetUnusedVars(program_->Block(0), ops_, skip_vars_);
+#ifdef PADDLE_WITH_CUDA
+  if (platform::is_gpu_place(place_)) {
+    if (IsFastEagerDeletionModeEnabled()) {
+      gc.reset(new UnsafeFastGPUGarbageCollector(
+          BOOST_GET_CONST(platform::CUDAPlace, place_), max_memory_size));
     } else {
-      PADDLE_ENFORCE(
-          in_var_names_->size(),
-          "Section without a reader or in variable is not supported by now");
-      const LoDTensor& tensor =
-          scope->FindVar(in_var_names_->at(0))->Get<LoDTensor>();
-      batch_size =
-          tensor.lod().size() ? tensor.lod()[0].size() - 1 : tensor.dims()[0];
-      SEC_LOG << "input batch size: " << batch_size;
+      gc.reset(new DefaultStreamGarbageCollector(
+          BOOST_GET_CONST(platform::CUDAPlace, place_), max_memory_size));
     }
+  } else if (platform::is_cpu_place(place_)) {
+#endif
+    gc.reset(new CPUGarbageCollector(
+        BOOST_GET_CONST(platform::CPUPlace, place_), max_memory_size));
+#ifdef PADDLE_WITH_CUDA
+  }
+#endif
 
-    Scope* exe_scope = scope;
-    if (section_id_ > 0 && platform::is_gpu_place(place_)) {
-      SEC_LOG << "CPU2GPU memory copy";
-      trans_timer.Resume();
-      if (scope->kids().empty()) {
-        exe_scope = &scope->NewScope();
-      } else {
-        exe_scope = scope->kids().front();
-        PADDLE_ENFORCE(scope->kids().size() == 1, "scope->kids().size(): %zu",
-                       scope->kids().size());
+  if (thread_id_ == 0) {
+    while (true) {
+      // Start a minibatch.
+      // int batch_size = 0;
+      batch_timer.Start();
+      for (int i = 0; i < num_microbatches_; ++i) {
+        try {
+          int op_idx = 0;
+          for (auto& op : ops_) {
+            int op_role = op->Attr<int>(std::string("op_role"));
+            // We run op with op_role = kLRSched only for the first microbatch
+            // to avoid increasing the @LR_DECAY_STEP@ multiple times.
+            bool run_first_mbatch =
+                op_role == static_cast<int>(OpRole::kForward) ||
+                op_role == (static_cast<int>(OpRole::kForward) |
+                            static_cast<int>(OpRole::kLoss)) ||
+                op_role == static_cast<int>(OpRole::kLRSched);
+            bool run_others = op_role == static_cast<int>(OpRole::kForward) ||
+                              op_role == (static_cast<int>(OpRole::kForward) |
+                                          static_cast<int>(OpRole::kLoss));
+            if ((i == 0 && run_first_mbatch) || (i != 0 && run_others)) {
+              VLOG(3) << "running an op " << op->Type() << " for " << thread_id_
+                      << " for scope " << i;
+              timeline.Start();
+              op->Run(*microbatch_scopes_[i], place_);
+              if (gc) {
+                DeleteUnusedTensors(*microbatch_scopes_[i], op.get(),
+                                    unused_vars_, gc.get());
+              }
+              timeline.Pause();
+              auto time = timeline.ElapsedUS();
+              op_total_time[op_idx] += time;
+              if (time > op_max_time[op_idx]) {
+                op_max_time[op_idx] = time;
+              }
+              if (time < op_min_time[op_idx]) {
+                op_min_time[op_idx] = time;
+              }
+              op_count[op_idx] += 1;
+              op_total_time[op_idx] += time;
+            }
+            op_idx++;
+          }
+        } catch (platform::EOFException&) {
+          std::unique_lock<std::mutex> lk(thread_mutex);
+          threads_completed = true;
+          VLOG(3) << "thread " << thread_id_ << " completed.";
+          VLOG(3) << "called notify all";
+          thread_condition.notify_all();
+          VLOG(0) << "EOF encountered";
+          VLOG(0) << "============timeline============";
+          for (size_t i = 0; i < ops_.size(); ++i) {
+            VLOG(0) << "op: " << op_name[i] << ", max_time: " << op_max_time[i]
+                    << ", min_time: " << op_min_time[i]
+                    << ", mean_time: " << op_total_time[i] / op_count[i];
+          }
+          VLOG(0) << "================================";
+          return;
+        }
+        if (i == 0) {
+          VLOG(3) << "called notify all";
+          std::unique_lock<std::mutex> lk(thread_mutex);
+          batch_id_ += 1;
+          thread_condition.notify_all();
+        }
       }
-
-      for (const std::string& name : *in_var_names_) {
-        const LoDTensor& src_tensor = scope->FindVar(name)->Get<LoDTensor>();
-        if (platform::is_gpu_place(src_tensor.place())) {
-          continue;
+      // backward pass
+      for (int i = 0; i < num_microbatches_; ++i) {
+        int op_idx = 0;
+        for (auto& op : ops_) {
+          int op_role = op->Attr<int>(std::string("op_role"));
+          if (op_role == static_cast<int>(OpRole::kBackward) ||
+              op_role == (static_cast<int>(OpRole::kBackward) |
+                          static_cast<int>(OpRole::kLoss))) {
+            VLOG(3) << "running an op " << op->Type() << " for " << thread_id_
+                    << " for scope " << i;
+            timeline.Start();
+            op->Run(*microbatch_scopes_[i], place_);
+            if (gc) {
+              DeleteUnusedTensors(*microbatch_scopes_[i], op.get(),
+                                  unused_vars_, gc.get());
+            }
+            timeline.Pause();
+            auto time = timeline.ElapsedUS();
+            op_total_time[op_idx] += time;
+            if (time > op_max_time[op_idx]) {
+              op_max_time[op_idx] = time;
+            }
+            if (time < op_min_time[op_idx]) {
+              op_min_time[op_idx] = time;
+            }
+            op_count[op_idx] += 1;
+            op_total_time[op_idx] += time;
+          }
+          op_idx++;
         }
-        LoDTensor* gpu_tensor = exe_scope->Var(name)->GetMutable<LoDTensor>();
-        gpu_tensor->set_lod(src_tensor.lod());
-        TensorCopy(*static_cast<const Tensor*>(&src_tensor), place_, *dev_ctx_,
-                   static_cast<Tensor*>(gpu_tensor));
       }
-      trans_timer.Pause();
-    }
-
-    SEC_LOG << "begin running ops";
-    cal_timer.Resume();
-    int op_id = 0;
-    dev_ctx_->Wait();
-    for (auto& op : ops_) {
-      timeline.Start();
-      op->Run(*exe_scope, place_);
+      // update pass
+      int op_idx = 0;
+      for (auto& op : ops_) {
+        int op_role = op->Attr<int>(std::string("op_role"));
+        if (op_role == static_cast<int>(OpRole::kOptimize)) {
+          VLOG(3) << "running an op " << op->Type() << " for " << thread_id_
+                  << " for minibatch scope";
+          timeline.Start();
+          op->Run(*microbatch_scopes_[0], place_);
+          if (gc) {
+            DeleteUnusedTensors(*microbatch_scopes_[num_microbatches_ - 1],
+                                op.get(), unused_vars_, gc.get());
+          }
+          timeline.Pause();
+          auto time = timeline.ElapsedUS();
+          op_total_time[op_idx] += time;
+          if (time > op_max_time[op_idx]) {
+            op_max_time[op_idx] = time;
+          }
+          if (time < op_min_time[op_idx]) {
+            op_min_time[op_idx] = time;
+          }
+          op_count[op_idx] += 1;
+          op_total_time[op_idx] += time;
+        }
+        op_idx++;
+      }
       dev_ctx_->Wait();
-      timeline.Pause();
-      op_total_time[op_id++] += timeline.ElapsedUS();
+      batch_timer.Pause();
+      VLOG(0) << "batch time: " << batch_timer.ElapsedUS();
     }
-    exe_scope->DropKids();
-    // Wait for GPU calc finising, as the cudaMemcpy and GPU calc may be in
-    // different streams
-    // No effect when it is a CPUDeviceContext
-    dev_ctx_->Wait();
-    cal_timer.Pause();
-#ifdef PADDLE_WITH_BOX_PS
-    auto box_ptr = BoxWrapper::GetInstance();
-    auto& metric_list = box_ptr->GetMetricList();
-    for (auto iter = metric_list.begin(); iter != metric_list.end(); iter++) {
-      auto* metric_msg = iter->second;
-      if (box_ptr->Phase() != metric_msg->MetricPhase()) {
-        continue;
+  } else {
+    while (true) {
+      {
+        PADDLE_ENFORCE_LE(
+            local_batch_id_, batch_id_,
+            platform::errors::InvalidArgument(
+                "local_batch_id_ (%d) must be less than or equal to "
+                "batch_id_ (%d)",
+                local_batch_id_, batch_id_));
+        std::unique_lock<std::mutex> lk(thread_mutex);
+        if (local_batch_id_ == batch_id_ && !threads_completed) {
+          thread_condition.wait(lk);
+        }
+        VLOG(3) << "thread " << thread_id_ << " local_batch_id_ "
+                << local_batch_id_ << " batch_id_ " << batch_id_;
+        if (threads_completed) {
+          VLOG(3) << "thread " << thread_id_ << " completed.";
+          lk.unlock();
+          VLOG(0) << "============timeline============";
+          for (size_t i = 0; i < ops_.size(); ++i) {
+            VLOG(0) << "op: " << op_name[i] << ", max_time: " << op_max_time[i]
+                    << ", min_time: " << op_min_time[i]
+                    << ", mean_time: " << op_total_time[i] / op_count[i];
+          }
+          VLOG(0) << "================================";
+          threads_completed = false;
+          return;
+        }
+        lk.unlock();
+        local_batch_id_ += 1;
       }
-      metric_msg->add_data(exe_scope);
-    }
-#endif
-    if (need_dump_field_) {
-      DumpField(*scope, dump_mode_, dump_interval_);
-    }
-    if (need_dump_param_ && pipeline_id_ == 0) {
-      DumpParam(*scope, step_cnt);
-    }
-
-    if (section_id_ != section_num_ - 1 && platform::is_gpu_place(place_)) {
-      // FIXME: Temporarily we assume two adjacent sections are in different
-      // places,
-      // and we do data transformation only in sections in GPU place, so the
-      // data is
-      // transform from GPU to CPU
-      // A better way to handle such a data transformation is to record each
-      // place of
-      // joint-out variables, and do transform as required
-
-      SEC_LOG << "GPU2CPU memory copy";
-      trans_timer.Resume();
-      for (const std::string& name : *out_var_names_) {
-        const LoDTensor& src_tensor =
-            exe_scope->FindVar(name)->Get<LoDTensor>();
-        LoDTensor* dst_tensor = scope->Var(name)->GetMutable<LoDTensor>();
-        dst_tensor->set_lod(src_tensor.lod());
-        TensorCopy(*static_cast<const Tensor*>(&src_tensor),
-                   next_section_place_, *dev_ctx_,
-                   static_cast<Tensor*>(dst_tensor));
+      // forward pass:
+      for (int i = 0; i < num_microbatches_; ++i) {
+        int op_idx = 0;
+        for (auto& op : ops_) {
+          int op_role = op->Attr<int>(std::string("op_role"));
+          // We run op with op_role = kLRSched only for the first microbatch
+          // to avoid increasing the @LR_DECAY_STEP@ multiple times.
+          bool run_first_mbatch =
+              op_role == static_cast<int>(OpRole::kForward) ||
+              op_role == (static_cast<int>(OpRole::kForward) |
+                          static_cast<int>(OpRole::kLoss)) ||
+              op_role == static_cast<int>(OpRole::kLRSched);
+          bool run_others = op_role == static_cast<int>(OpRole::kForward) ||
+                            op_role == (static_cast<int>(OpRole::kForward) |
+                                        static_cast<int>(OpRole::kLoss));
+          if ((i == 0 && run_first_mbatch) || (i != 0 && run_others)) {
+            VLOG(3) << "running an op " << op->Type() << " for " << thread_id_
+                    << " for scope " << i;
+            timeline.Start();
+            op->Run(*microbatch_scopes_[i], place_);
+            if (gc) {
+              DeleteUnusedTensors(*microbatch_scopes_[i], op.get(),
+                                  unused_vars_, gc.get());
+            }
+            timeline.Pause();
+            auto time = timeline.ElapsedUS();
+            op_total_time[op_idx] += time;
+            if (time > op_max_time[op_idx]) {
+              op_max_time[op_idx] = time;
+            }
+            if (time < op_min_time[op_idx]) {
+              op_min_time[op_idx] = time;
+            }
+            op_count[op_idx] += 1;
+            op_total_time[op_idx] += time;
+          }
+          op_idx++;
+        }
       }
-      trans_timer.Pause();
-    }
-
-    out_scope_queue_->Send(scope);
-
-    if (sync_func_) {
-      sync_timer.Resume();
-      (*sync_func_)(scope);
-      sync_timer.Pause();
-    }
-
-    ++step_cnt;
-    accum_num += batch_size;
-    main_timer.Pause();
-  }
-  if (need_dump_field_ || need_dump_param_) {
-    writer_.Flush();
-  }
-  outer_timer.Pause();
-
-  worker_count_mutex_->lock();
-  --(*worker_count_);
-  worker_count_mutex_->unlock();
-
-  if (*worker_count_ <= 0) {
-    while (section_id_ < section_num_ - 1 && out_scope_queue_->Size()) {
-      sleep(1);
+      // backward pass
+      for (int i = 0; i < num_microbatches_; ++i) {
+        int op_idx = 0;
+        for (auto& op : ops_) {
+          int op_role = op->Attr<int>(std::string("op_role"));
+          if (op_role == static_cast<int>(OpRole::kBackward) ||
+              op_role == (static_cast<int>(OpRole::kBackward) |
+                          static_cast<int>(OpRole::kLoss))) {
+            VLOG(3) << "running an op " << op->Type() << " for " << thread_id_
+                    << " for scope " << i;
+            timeline.Start();
+            op->Run(*microbatch_scopes_[i], place_);
+            if (gc) {
+              DeleteUnusedTensors(*microbatch_scopes_[i], op.get(),
+                                  unused_vars_, gc.get());
+            }
+            timeline.Pause();
+            auto time = timeline.ElapsedUS();
+            op_total_time[op_idx] += time;
+            if (time > op_max_time[op_idx]) {
+              op_max_time[op_idx] = time;
+            }
+            if (time < op_min_time[op_idx]) {
+              op_min_time[op_idx] = time;
+            }
+            op_count[op_idx] += 1;
+            op_total_time[op_idx] += time;
+          }
+          op_idx++;
+        }
+      }
+      // update pass
+      int op_idx = 0;
+      for (auto& op : ops_) {
+        int op_role = op->Attr<int>(std::string("op_role"));
+        if (op_role == static_cast<int>(OpRole::kOptimize)) {
+          VLOG(3) << "running an op " << op->Type() << " for " << thread_id_
+                  << " for minibatch scope";
+          timeline.Start();
+          op->Run(*microbatch_scopes_[0], place_);
+          if (gc) {
+            DeleteUnusedTensors(*microbatch_scopes_[num_microbatches_ - 1],
+                                op.get(), unused_vars_, gc.get());
+          }
+          timeline.Pause();
+          auto time = timeline.ElapsedUS();
+          op_total_time[op_idx] += time;
+          if (time > op_max_time[op_idx]) {
+            op_max_time[op_idx] = time;
+          }
+          if (time < op_min_time[op_idx]) {
+            op_min_time[op_idx] = time;
+          }
+          op_count[op_idx] += 1;
+          op_total_time[op_idx] += time;
+        }
+        op_idx++;
+      }
+      dev_ctx_->Wait();
     }
-    out_scope_queue_->Close();
-  }
-  LOG(ERROR) << "log_for_profile"
-             << " card:" << pipeline_id_ << " thread:" << thread_id_
-             << " section:" << section_id_ << " step_count:" << step_cnt
-             << " batch_count:" << accum_num
-             << " read_time:" << reader_timer.ElapsedUS()
-             << " trans_time:" << trans_timer.ElapsedUS()
-             << " cal_time:" << cal_timer.ElapsedUS()
-             << " sync_time:" << sync_timer.ElapsedUS()
-             << " main_time:" << main_timer.ElapsedUS()
-             << " outer_time:" << outer_timer.ElapsedUS();
-  for (size_t i = 0; i < ops_.size(); ++i) {
-    LOG(ERROR) << "op: " << op_name[i]
-               << ", mean time: " << op_total_time[i] / accum_num;
   }
 }
 }  // namespace framework
diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h
index c18ea33d041b9518fb60d2453830de8e4b4ff033..bb56b3ea3d251d53d6e8e494ec1c658574c2e96c 100644
--- a/paddle/fluid/framework/trainer.h
+++ b/paddle/fluid/framework/trainer.h
@@ -137,49 +137,31 @@ class PipelineTrainer : public TrainerBase {
   virtual Scope* GetWorkerScope(int thread_id);
   void InitDumpEnv() override;
   virtual std::string GetDumpPath(int tid);
+  void GetSkipVars(int section_id, const ProgramDesc& main_program);
 
  protected:
   int section_num_;
-  int pipeline_num_;
-  int scope_queue_size_;
-  int sync_steps_;
+  int num_microbatches_;
+  int start_cpu_core_id_;
+  std::vector<std::string> feed_var_names_;
+  std::vector<platform::Place> places_;
+  std::vector<std::vector<std::string>> skip_vars_;
+  TrainerDesc trainer_desc_;
 
-  SectionWorkerParameter pipeline_config_;
-
-  // The in/output var names for each section
-  std::vector<std::unique_ptr<std::vector<std::string>>> in_var_names_;
-  std::vector<std::unique_ptr<std::vector<std::string>>> out_var_names_;
-
-  // Counter for the running thread
-  std::vector<std::vector<int*>> worker_count_;
-  std::vector<std::vector<std::unique_ptr<std::mutex>>> worker_count_mutex_;
-
-  // worker: [section_id][pipeline_id][thread_id]
-  std::vector<std::vector<
-      std::vector<std::shared_ptr<paddle::framework::DeviceWorker>>>>
-      workers_;
   std::vector<std::thread> section_threads_;
-
-  // We use scope to maintain context info, and scopes
-  // will be deliverd between different sections.
-  std::vector<std::vector<std::unique_ptr<ScopeQueue>>> scope_queues_;
-  std::vector<Scope*> pipeline_scopes_;
-
-  // The parameters that should be syncronized between different cards using
-  // nccl all-reduce
-  std::shared_ptr<std::vector<std::string>> param_need_sync_;
-  std::vector<std::string> persistable_vars_;
-  std::vector<std::unique_ptr<SyncFunctor>> sync_functors_;
-  std::shared_ptr<platform::NCCLContextMap> nccl_ctx_map_;
-
-  std::vector<DataFeed*> readers_;
-
-  void InitFirstScopeQueue(ScopeQueue* scope_queue, int pipeline_id,
-                           const ProgramDesc& main_program,
-                           const Scope& root_scope);
-  void CopyParameters(const Scope& root_scope, int pipeline_id);
-  void construct_sync_functor();
+  // worker: [section_id]
+  std::vector<std::shared_ptr<paddle::framework::DeviceWorker>> workers_;
+  // minibatch_scopes_: [section_id]
+  std::vector<Scope*> minibatch_scopes_;
+  // microbatch_scopes_: [section_id][microbatch_id]
+  std::vector<std::vector<Scope*>> microbatch_scopes_;
+
+  void CopyParameters(int section_id, int microbatch_id,
+                      const ProgramDesc& program, const platform::Place& place);
+  bool isPersistableVarGrad(std::string name);
+  bool isPersistable(VarDesc* var);
 };
 #endif
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/trainer_desc.proto b/paddle/fluid/framework/trainer_desc.proto
index 9cbb063a3fab6810709c1504deed2ccf40743123..670ae074c7c7f0e3bcd91e157ba7b01b48d3b7ee 100644
--- a/paddle/fluid/framework/trainer_desc.proto
+++ b/paddle/fluid/framework/trainer_desc.proto
@@ -83,6 +83,7 @@ message SectionWorkerParameter {
   optional int64 sync_steps = 3 [ default = 1 ];
   optional int32 start_cpu_core_id = 4 [ default = 1 ];
   repeated string param_need_sync = 5;
+  optional int32 num_microbatches = 6;
 }
 
 message SectionConfig {
@@ -99,6 +100,7 @@ message SectionConfig {
   optional int32 concurrency = 3 [ default = 1 ];
   repeated string section_in_var_names = 4;
   repeated string section_out_var_names = 5;
+  optional int32 place_id = 6 [ default = -1 ];
 }
 
 message FetchConfig {
diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc
index 60bc88ca7237c44dc63aa98e0064ab59addd707c..6c2f74e2712b0e7ccdce60e2b2c53ee529b52c5c 100644
--- a/paddle/fluid/imperative/basic_engine.cc
+++ b/paddle/fluid/imperative/basic_engine.cc
@@ -205,7 +205,9 @@ void BasicEngine::Execute() {
             continue;
           }
 
-          var = std::make_shared<VariableWrapper>(var->Name());
+          auto tmp_var = std::make_shared<VariableWrapper>(var->Name());
+          tmp_var->SetType(var->Type());
+          var = tmp_var;
           need_accu_var_list_.emplace_back(iter->second.get(), var);
         }
       }
diff --git a/paddle/fluid/imperative/tests/test_tracer.cc b/paddle/fluid/imperative/tests/test_tracer.cc
index 5852e60a481209dd1d65a53660dbdce3a7376816..3c3ec2e6263396881597649d3ab643b5492d630a 100644
--- a/paddle/fluid/imperative/tests/test_tracer.cc
+++ b/paddle/fluid/imperative/tests/test_tracer.cc
@@ -285,6 +285,11 @@ TEST(test_tracer, test_unique_name_generator) {
   auto fc_2 = tracer.GenerateUniqueName("fc");
   ASSERT_STREQ("fc_0", fc_1.c_str());
   ASSERT_STREQ("fc_1", fc_2.c_str());
+  // use `eager_tmp` as key if not specify it.
+  auto tmp_var_2 = tracer.GenerateUniqueName();
+  ASSERT_STREQ("eager_tmp_2", tmp_var_2.c_str());
+  auto tmp_var_3 = tracer.GenerateUniqueName("eager_tmp");
+  ASSERT_STREQ("eager_tmp_3", tmp_var_3.c_str());
 }
 
 TEST(test_tracer, test_current_tracer) {
diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h
index 49aa39d2b0f2e3bf3bae5248aeb5772e5ec807a2..7652b3aa291ac0063fcc411b5f86f6084f01e8ef 100644
--- a/paddle/fluid/imperative/tracer.h
+++ b/paddle/fluid/imperative/tracer.h
@@ -32,7 +32,7 @@ namespace imperative {
 class UniqueNameGenerator {
  public:
   explicit UniqueNameGenerator(std::string prefix = "") : prefix_(prefix) {}
-  std::string Generate(std::string key = "tmp") {
+  std::string Generate(std::string key = "eager_tmp") {
     return prefix_ + key + "_" + std::to_string(id_++);
   }
 
@@ -76,7 +76,14 @@ class Tracer {
     return program_desc_tracer_.get();
   }
 
-  std::string GenerateUniqueName(std::string key = "tmp") {
+  // Note(Aurelius84): The `tmp` is used as prefix key while naming a temporary
+  // intermediate var both in imperative and static mode. But the
+  // `UniqueNameGenerator` in C++ and `unique_name.py` in Python doesn't share
+  // the same auto-increment id. It will create a variable repeatedly with same
+  // name like `tmp_0` in some cases when transform dygraph into static layers.
+  // So we modify the default prefix key into `eager_tmp` to distinguish with
+  // static graph.
+  std::string GenerateUniqueName(std::string key = "eager_tmp") {
     return generator_->Generate(key);
   }
 
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index a78fe41552b7cb1a42ce924fc604db8e0dafc0e7..37ea3e5b40a65cbeb424c216aa74a75ace60ff64 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -36,7 +36,6 @@ endif()
 
 # fluid_modules exclude API-interface of inference/api and inference/capi
 get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
-get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES)
 
 add_subdirectory(api)
 
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index af915acc0e44d9ed9e23c7ada81cc9965c49f5ec..1a15ecfda5d439db91d5dca3bd9e3bd8fd1a6507 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -828,6 +828,25 @@ bool AnalysisPredictor::LoadParameters() {
   return true;
 }
 
+void AnalysisPredictor::ClearIntermediateTensor() {
+  PADDLE_ENFORCE_NOT_NULL(inference_program_.get(),
+                          platform::errors::PreconditionNotMet(
+                              "The inference program should be loaded first."));
+  const auto &global_block = inference_program_->MutableBlock(0);
+  for (auto *var : global_block->AllVars()) {
+    if (!IsPersistable(var)) {
+      const std::string name = var->Name();
+      auto *variable = executor_->scope()->FindVar(name);
+      if (variable != nullptr && variable->IsType<framework::LoDTensor>() &&
+          name != "feed" && name != "fetch") {
+        VLOG(3) << "Clear Intermediate Tensor: " << name;
+        auto *t = variable->GetMutable<framework::LoDTensor>();
+        t->clear();
+      }
+    }
+  }
+}
+
 #if PADDLE_WITH_TENSORRT
 bool AnalysisPredictor::SaveTrtCalibToDisk() {
   PADDLE_ENFORCE(config_.tensorrt_engine_enabled(),
diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h
index 267817829ec4598808486fd3ea5df241a1466e22..365f86c21105a7f1ffb7c300e0ab38c6aaa230fc 100644
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -187,6 +187,12 @@ class AnalysisPredictor : public PaddlePredictor {
   ///
   void OptimizeInferenceProgram();
 
+  ///
+  /// \brief Clear the intermediate tensors of the predictor
+  ///
+  ///
+  void ClearIntermediateTensor();
+
   ///
   /// \brief Get the argument used by predictor
   ///
diff --git a/paddle/fluid/inference/api/mkldnn_quantizer_config.cc b/paddle/fluid/inference/api/mkldnn_quantizer_config.cc
index 9ff5ef133e17a3491a7266bdbcad72583f29e872..76cf1661f305443592c345e879f2d4514dc914d9 100644
--- a/paddle/fluid/inference/api/mkldnn_quantizer_config.cc
+++ b/paddle/fluid/inference/api/mkldnn_quantizer_config.cc
@@ -49,6 +49,10 @@ MkldnnQuantizerConfig::MkldnnQuantizerConfig() {
   rules_["matmul"]["Y"] = ScaleAlgo::KL;
   rules_["matmul"]["Out"] = ScaleAlgo::KL;
 
+  rules_["elementwise_add"]["X"] = ScaleAlgo::KL;
+  rules_["elementwise_add"]["Y"] = ScaleAlgo::KL;
+  rules_["elementwise_add"]["Out"] = ScaleAlgo::KL;
+
   // Reshape2 does not perform calculation on the data and shapes are not
   // changed. Scale is calculated on input data and assign to Quantize and
   // Dequantize scale.
diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h
index d871661497a243bc22903f4d24fe4e1e6d2517db..386d20103a71acb34cd47ddf5527f580cc5bf5b1 100644
--- a/paddle/fluid/inference/api/paddle_api.h
+++ b/paddle/fluid/inference/api/paddle_api.h
@@ -27,10 +27,10 @@
 #include <memory>
 #include <string>
 #include <vector>
+#include "crypto/cipher.h"
 #include "paddle_infer_declare.h"  // NOLINT
-
-/*! \namespace paddle
- */
+                                   /*! \namespace paddle
+                                    */
 namespace paddle {
 
 /// \brief Paddle data type.
@@ -313,6 +313,12 @@ class PD_INFER_DECL PaddlePredictor {
   /// \return Whether the run is successful
   virtual bool ZeroCopyRun() { return false; }
 
+  ///
+  /// \brief Clear the intermediate tensors of the predictor
+  ///
+  ///
+  virtual void ClearIntermediateTensor() {}
+
   /// \brief Clone an existing predictor
   /// When using clone, the same network will be created,
   /// and the parameters between them are shared.
@@ -431,4 +437,6 @@ PD_INFER_DECL std::string get_version();
 
 PD_INFER_DECL std::string UpdateDllFlag(const char* name, const char* value);
 
+PD_INFER_DECL std::shared_ptr<framework::Cipher> MakeCipher(
+    const std::string& config_file);
 }  // namespace paddle
diff --git a/paddle/fluid/inference/lite/CMakeLists.txt b/paddle/fluid/inference/lite/CMakeLists.txt
index 1d957048148b59cd98b40ae1d95bd02481288b85..decf2d830fe0a77c69c80c071248b04c7fdb2f9d 100644
--- a/paddle/fluid/inference/lite/CMakeLists.txt
+++ b/paddle/fluid/inference/lite/CMakeLists.txt
@@ -1,5 +1,6 @@
 cc_library(lite_op_teller SRCS op_teller.cc DEPS lite_full_static framework_proto device_context boost xxhash)
 cc_library(lite_engine SRCS engine.cc DEPS lite_full_static framework_proto)
-cc_library(lite_tensor_utils SRCS tensor_utils.cc DEPS memcpy lite_full_static framework_proto boost)
+cc_library(lite_tensor_utils SRCS tensor_utils.cc DEPS memcpy lite_full_static framework_proto boost device_context)
+
 cc_test(test_lite_engine SRCS test_engine.cc DEPS lite_engine protobuf framework_proto glog gtest analysis)
 cc_test(test_lite_tensor_utils SRCS test_tensor_utils.cc DEPS lite_engine lite_tensor_utils)
diff --git a/paddle/fluid/inference/lite/engine.cc b/paddle/fluid/inference/lite/engine.cc
index 6dbe748879c31523ade333dd7ca0b5c340915b3c..fb3b6e460d5bb23133de1d6a8a106530043cd99a 100644
--- a/paddle/fluid/inference/lite/engine.cc
+++ b/paddle/fluid/inference/lite/engine.cc
@@ -18,8 +18,6 @@
 
 #include "paddle/fluid/inference/lite/engine.h"
 
-#include "lite/api/paddle_use_kernels.h"
-#include "lite/api/paddle_use_ops.h"
 #include "lite/api/paddle_use_passes.h"
 
 namespace paddle {
diff --git a/paddle/fluid/inference/lite/op_teller.cc b/paddle/fluid/inference/lite/op_teller.cc
index f461da89fecc3a27acbb5d84a9acdf53bce5fbad..3a162c3fde13f61fae5aba7a7da0bbfdc5f20801 100644
--- a/paddle/fluid/inference/lite/op_teller.cc
+++ b/paddle/fluid/inference/lite/op_teller.cc
@@ -26,15 +26,14 @@ namespace lite {
 // Just tell by the op_types.
 struct SimpleOpTeller : public Teller {
   SimpleOpTeller() {
-    const std::map<std::string, std::string>& op2path =
-        paddle::lite::GetOp2PathDict();
+    std::vector<std::string> lite_ops = paddle::lite::GetAllOps();
     auto is_non_inst = [](const std::string& op) -> bool {
       const std::vector<std::string> ops = {"feed", "fetch", "while"};
       return std::find(ops.begin(), ops.end(), op) != ops.end();
     };
-    for (const auto& op : op2path) {
-      if (!is_non_inst(op.first)) {
-        ops_.insert(op.first);
+    for (const auto& op : lite_ops) {
+      if (!is_non_inst(op)) {
+        ops_.insert(op);
       }
     }
   }
diff --git a/paddle/fluid/inference/lite/test_tensor_utils.cc b/paddle/fluid/inference/lite/test_tensor_utils.cc
index 48ae1bd71d8a4363c7b0f5af9222e92bcd7a3b1c..eee00e9ba31a6de8688dfb27dd56031e9da4353f 100644
--- a/paddle/fluid/inference/lite/test_tensor_utils.cc
+++ b/paddle/fluid/inference/lite/test_tensor_utils.cc
@@ -30,7 +30,7 @@ TEST(LiteEngineOp, GetNativePlace) {
   platform::Place GetNativePlace(const TargetType& type, int id = 0);
   EXPECT_TRUE(platform::is_cpu_place(GetNativePlace(TargetType::kHost)));
   EXPECT_TRUE(platform::is_gpu_place(GetNativePlace(TargetType::kCUDA)));
-  ASSERT_DEATH(GetNativePlace(TargetType::kUnk), "");
+  EXPECT_ANY_THROW(GetNativePlace(TargetType::kUnk));
 }
 
 TEST(LiteEngineOp, GetLiteTargetType) {
@@ -48,8 +48,8 @@ TEST(LiteEngineOp, GetLitePrecisionType) {
             PrecisionType::kInt8);
   ASSERT_EQ(GetLitePrecisionType(framework::proto::VarType_Type_INT32),
             PrecisionType::kInt32);
-  ASSERT_DEATH(
-      GetLitePrecisionType(framework::proto::VarType_Type_SELECTED_ROWS), "");
+  EXPECT_ANY_THROW(
+      GetLitePrecisionType(framework::proto::VarType_Type_SELECTED_ROWS));
 }
 
 TEST(LiteEngineOp, GetNativePrecisionType) {
@@ -62,7 +62,7 @@ TEST(LiteEngineOp, GetNativePrecisionType) {
             framework::proto::VarType_Type_INT8);
   ASSERT_EQ(GetNativePrecisionType(PrecisionType::kInt32),
             framework::proto::VarType_Type_INT32);
-  ASSERT_DEATH(GetNativePrecisionType(PrecisionType::kUnk), "");
+  EXPECT_ANY_THROW(GetNativePrecisionType(PrecisionType::kUnk));
 }
 
 TEST(LiteEngineOp, GetNativeLayoutType) {
@@ -70,7 +70,7 @@ TEST(LiteEngineOp, GetNativeLayoutType) {
   framework::DataLayout GetNativeLayoutType(const DataLayoutType& type);
   ASSERT_EQ(GetNativeLayoutType(DataLayoutType::kNCHW),
             framework::DataLayout::kNCHW);
-  ASSERT_DEATH(GetNativeLayoutType(DataLayoutType::kNHWC), "");
+  EXPECT_ANY_THROW(GetNativeLayoutType(DataLayoutType::kNHWC));
 }
 
 void test_tensor_copy(const platform::DeviceContext& ctx) {
diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu
index 0f9c94a0afb2f4619a2660b493a13b6875a81840..240ecaa25893d04fe4836d08998a312582425f2f 100644
--- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu
@@ -115,7 +115,18 @@ inline void TransposeQKV(const int batch, const int seq_len,
                          const half *input, half *output, cudaStream_t stream) {
   int scratch_size = batch * head_num * seq_len * seq_len;
   const dim3 grid(seq_len, batch, 3);
-  if (head_size % 2 == 0 && scratch_size % 2 == 0) {
+  if (head_size % 8 == 0 && scratch_size % 8 == 0) {
+    int h = head_size / 8;
+    const int4 *input4 = reinterpret_cast<const int4 *>(input);
+    int4 *output4 = reinterpret_cast<int4 *>(output);
+    dim3 block(h, head_num, 1);
+    // limit h * head_num to max block size(1024).
+    PADDLE_ENFORCE_LE(h * head_num, 1024,
+                      platform::errors::InvalidArgument(
+                          "head_num (%d) * head_size (%d) should <= %d",
+                          head_num, head_size, 1024 * 8));
+    TransposeQkvKernel<int4><<<grid, block, 0, stream>>>(h, input4, output4);
+  } else if (head_size % 2 == 0 && scratch_size % 2 == 0) {
     const int h = head_size / 2;
     const half2 *input2 = reinterpret_cast<const half2 *>(input);
     half2 *output2 = reinterpret_cast<half2 *>(output);
@@ -167,7 +178,7 @@ nvinfer1::DimsExprs QkvToContextPluginDynamic::getOutputDimensions(
   ret.nbDims = 5;
   ret.d[0] = inputs[0].d[0];
   ret.d[1] = inputs[0].d[1];
-  ret.d[2] = expr_builder.constant(hidden_);
+  ret.d[2] = expr_builder.constant(head_size_ * head_number_);
   ret.d[3] = expr_builder.constant(1);
   ret.d[4] = expr_builder.constant(1);
   return ret;
diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index b2f42d39e885fe93352e323627bbd532ddfe773d..22bf27ce594963839b1cf245d273da9fd29c33ca 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -20,7 +20,7 @@ function(download_int8_data install_dir data_file)
     endif()
 endfunction()
 
-function(download_qat_data install_dir data_file)
+function(download_quant_data install_dir data_file)
     if (NOT EXISTS ${install_dir}/${data_file})
 	    inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8/QAT_models ${data_file})
     endif()
@@ -85,7 +85,7 @@ function(inference_analysis_api_test_with_fake_data_run TARGET_NAME test_binary
              --disable_mkldnn_fc=${disable_fc}) 
 endfunction()
 
-function(inference_analysis_api_qat_test_run TARGET_NAME test_binary fp32_model_dir int8_model_dir data_path)
+function(inference_analysis_api_quant_test_run TARGET_NAME test_binary fp32_model_dir int8_model_dir data_path)
     inference_analysis_test_run(${TARGET_NAME}
     COMMAND ${test_binary}
         ARGS --fp32_model=${fp32_model_dir}
@@ -249,7 +249,7 @@ if(WITH_MKLDNN)
   ## Image classification models
 
   # ImageNet small dataset
-  # May be already downloaded for INT8 QAT unit tests
+  # It may be already downloaded for Quant & INT8 unit tests
   set(IMAGENET_DATA_ARCHIVE "imagenet_val_100_tail.tar.gz")
   set(IMAGENET_DATA_DIR "${INFERENCE_DEMO_INSTALL_DIR}/imagenet")
   set(IMAGENET_DATA_PATH "${IMAGENET_DATA_DIR}/data.bin")
@@ -315,21 +315,21 @@ if(WITH_MKLDNN)
   download_int8_data(${INT8_MOBILENET_SSD_MODEL_DIR} "mobilenet_ssd_int8_model.tar.gz" )
   inference_analysis_api_object_dection_int8_test_run(test_analyzer_int8_mobilenet_ssd ${INT8_OBJ_DETECT_TEST_APP} ${INT8_MOBILENET_SSD_MODEL_DIR} ${PASCALVOC_DATA_PATH})
 
-  ### optimized FP32 vs. QAT INT8 tests
+  ### optimized FP32 vs. Quant INT8 tests
   
-  set(QAT_DATA_DIR "${INFERENCE_DEMO_INSTALL_DIR}/qat")
-  set(QAT_IMG_CLASS_TEST_APP "test_analyzer_qat_image_classification")
-  set(QAT_IMG_CLASS_TEST_APP_SRC "analyzer_qat_image_classification_tester.cc")
+  set(QUANT_DATA_DIR "${INFERENCE_DEMO_INSTALL_DIR}/quant")
+  set(QUANT_IMG_CLASS_TEST_APP "test_analyzer_quant_image_classification")
+  set(QUANT_IMG_CLASS_TEST_APP_SRC "analyzer_quant_image_classification_tester.cc")
 
   # build test binary to be used in subsequent tests
-  inference_analysis_api_test_build(${QAT_IMG_CLASS_TEST_APP} ${QAT_IMG_CLASS_TEST_APP_SRC})
-
-  # MobileNet FP32 vs. QAT INT8
-  # The FP32 model should already be downloaded for slim QAT unit tests
-  set(QAT2_MobileNet_MODEL_DIR "${QAT_DATA_DIR}/MobileNet_qat_perf")
-  set(QAT2_INT8_MobileNet_MODEL_DIR "${QAT_DATA_DIR}/MobileNet_qat_perf_int8")
-  download_qat_data(${QAT2_INT8_MobileNet_MODEL_DIR} "MobileNet_qat_perf_int8.tar.gz")
-  inference_analysis_api_qat_test_run(test_analyzer_qat_performance_benchmark ${QAT_IMG_CLASS_TEST_APP} ${QAT2_MobileNet_MODEL_DIR}/MobileNet_qat_perf/float ${QAT2_INT8_MobileNet_MODEL_DIR}/MobileNet_qat_perf_int8 ${IMAGENET_DATA_PATH})
+  inference_analysis_api_test_build(${QUANT_IMG_CLASS_TEST_APP} ${QUANT_IMG_CLASS_TEST_APP_SRC})
+
+  # MobileNetV1 FP32 vs. Quant INT8
+  # The FP32 model should already be downloaded for slim Quant unit tests
+  set(QUANT2_MobileNetV1_MODEL_DIR "${QUANT_DATA_DIR}/MobileNetV1_quant2")
+  set(QUANT2_INT8_MobileNetV1_MODEL_DIR "${QUANT_DATA_DIR}/MobileNetV1_quant2_int8")
+  download_quant_data(${QUANT2_INT8_MobileNetV1_MODEL_DIR} "MobileNet_qat_perf_int8.tar.gz")
+  inference_analysis_api_quant_test_run(test_analyzer_quant_performance_benchmark ${QUANT_IMG_CLASS_TEST_APP} ${QUANT2_MobileNetV1_MODEL_DIR}/MobileNet_qat_perf/float ${QUANT2_INT8_MobileNetV1_MODEL_DIR}/MobileNet_qat_perf_int8 ${IMAGENET_DATA_PATH})
 
   ### Other tests
  
diff --git a/paddle/fluid/inference/tests/api/analyzer_qat_image_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_quant_image_classification_tester.cc
similarity index 98%
rename from paddle/fluid/inference/tests/api/analyzer_qat_image_classification_tester.cc
rename to paddle/fluid/inference/tests/api/analyzer_quant_image_classification_tester.cc
index 7b2b1c31cc5a7ee84fefc5abc37c342155151d94..a5a3e60d04b90795f4caf43722e5f7a46e4ed13a 100644
--- a/paddle/fluid/inference/tests/api/analyzer_qat_image_classification_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_quant_image_classification_tester.cc
@@ -108,7 +108,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs,
   }
 }
 
-TEST(Analyzer_qat_image_classification, quantization) {
+TEST(Analyzer_quant_image_classification, quantization) {
   AnalysisConfig fp32_cfg;
   SetConfig(&fp32_cfg, FLAGS_fp32_model);
 
diff --git a/paddle/fluid/inference/tests/api/lite_mul_model_test.cc b/paddle/fluid/inference/tests/api/lite_mul_model_test.cc
index 35d20ce34eeee926aa257a595d8153ab67027cf4..0c6b6019c544c68f726b54a88dba0e655a9f7c19 100644
--- a/paddle/fluid/inference/tests/api/lite_mul_model_test.cc
+++ b/paddle/fluid/inference/tests/api/lite_mul_model_test.cc
@@ -47,8 +47,8 @@ int test_main(const AnalysisConfig& config, Barrier* barrier = nullptr) {
   std::vector<PaddleTensor> outputs;
   predictor->Run(inputs, &outputs);
   const std::vector<float> truth_values = {
-      -0.00621776, -0.00620937, 0.00990623,  -0.0039817, -0.00074315,
-      0.61229795,  -0.00491806, -0.00068755, 0.18409646, 0.30090684};
+      -0.00621776f, -0.00620937f, 0.00990623f,  -0.0039817f, -0.00074315f,
+      0.61229795f,  -0.00491806f, -0.00068755f, 0.18409646f, 0.30090684f};
   const size_t expected_size = 1;
   EXPECT_EQ(outputs.size(), expected_size);
   float* data_o = static_cast<float*>(outputs[0].data.data());
diff --git a/paddle/fluid/inference/tests/api/lite_resnet50_test.cc b/paddle/fluid/inference/tests/api/lite_resnet50_test.cc
index a3fd4b4f6de2377ab8d1e19d0c7307d81b64015c..0aea47ae7fab1be3bafe35af575e9a2bea2d8420 100644
--- a/paddle/fluid/inference/tests/api/lite_resnet50_test.cc
+++ b/paddle/fluid/inference/tests/api/lite_resnet50_test.cc
@@ -49,15 +49,17 @@ TEST(AnalysisPredictor, use_gpu) {
   ASSERT_TRUE(predictor->Run(inputs, &outputs));
 
   const std::vector<float> truth_values = {
-      127.780396, 738.16656,  1013.2264,  -438.17206, 366.4022,   927.66187,
-      736.2241,   -633.68567, -329.92737, -430.15637, -633.0639,  -146.54858,
-      -1324.2804, -1349.3661, -242.67671, 117.44864,  -801.7251,  -391.51495,
-      -404.8202,  454.16132,  515.48206,  -133.03114, 69.293076,  590.09753,
-      -1434.6917, -1070.8903, 307.0744,   400.52573,  -316.12177, -587.1265,
-      -161.05742, 800.3663,   -96.47157,  748.708,    868.17645,  -447.9403,
-      112.73656,  1127.1992,  47.43518,   677.7219,   593.1881,   -336.4011,
-      551.3634,   397.82474,  78.39835,   -715.4006,  405.96988,  404.25684,
-      246.01978,  -8.430191,  131.36617,  -648.0528};
+      127.780396f, 738.16656f,  1013.2264f,  -438.17206f, 366.4022f,
+      927.66187f,  736.2241f,   -633.68567f, -329.92737f, -430.15637f,
+      -633.0639f,  -146.54858f, -1324.2804f, -1349.3661f, -242.67671f,
+      117.44864f,  -801.7251f,  -391.51495f, -404.8202f,  454.16132f,
+      515.48206f,  -133.03114f, 69.293076f,  590.09753f,  -1434.6917f,
+      -1070.8903f, 307.0744f,   400.52573f,  -316.12177f, -587.1265f,
+      -161.05742f, 800.3663f,   -96.47157f,  748.708f,    868.17645f,
+      -447.9403f,  112.73656f,  1127.1992f,  47.43518f,   677.7219f,
+      593.1881f,   -336.4011f,  551.3634f,   397.82474f,  78.39835f,
+      -715.4006f,  405.96988f,  404.25684f,  246.01978f,  -8.430191f,
+      131.36617f,  -648.0528f};
 
   const size_t expected_size = 1;
   EXPECT_EQ(outputs.size(), expected_size);
diff --git a/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc b/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc
index 1dbdcccf41ba3a42dd21982cd9fac86f5e767382..8ffa3efdf0556bd7cde7efa615f60853ad18d903 100644
--- a/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc
+++ b/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc
@@ -43,6 +43,7 @@ TEST(AnalysisPredictor, use_gpu) {
   std::vector<PaddleTensor> outputs;
   for (auto& input : inputs_all) {
     ASSERT_TRUE(predictor->Run(input, &outputs));
+    predictor->ClearIntermediateTensor();
   }
 }
 
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 40ed5869c276522dde65cb7028e553f0443e5d62..23509773fa9e0697159f0365cc21ba84fb0ab1bf 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -91,7 +91,7 @@ set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_ten
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler sample_prob tree2col)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc matrix_inverse)
-set(COMMON_OP_DEPS ${COMMON_OP_DEPS} box_wrapper)
+set(COMMON_OP_DEPS ${COMMON_OP_DEPS} box_wrapper boost)
 if (WITH_GPU)
   set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu bert_encoder_functor)
 endif()
diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index 59055a8679c14992e8356de06ae56ce621d0aba8..204f854a380abb5110e9b899834d0ee00579254e 100644
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -225,14 +225,14 @@ $$out = |x|$$
 UNUSED constexpr char CeilDoc[] = R"DOC(
 Ceil Operator. Computes ceil of x element-wise.
 
-$$out = \left \lceil x \right \rceil$$
+$$out = \\left \\lceil x \\right \\rceil$$
 
 )DOC";
 
 UNUSED constexpr char FloorDoc[] = R"DOC(
-Floor Activation Operator.
+Floor Activation Operator. Computes floor of x element-wise.
 
-$$out = \left \lfloor x \right \rfloor$$
+$$out = \\left \\lfloor x \\right \\rfloor$$
 
 )DOC";
 
diff --git a/paddle/fluid/operators/batch_fc_op.cu b/paddle/fluid/operators/batch_fc_op.cu
index 414eeef2a6f7027c43ec75ef402e843df74a0567..9a39306ccad6a5a3a4d753b1060c0af169f0f60f 100644
--- a/paddle/fluid/operators/batch_fc_op.cu
+++ b/paddle/fluid/operators/batch_fc_op.cu
@@ -24,10 +24,6 @@ namespace paddle {
 namespace operators {
 using framework::Tensor;
 
-#define CUDA_KERNEL_LOOP(i, n)                                 \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
-
 const int CUDA_NUM_THREADS = 1024;
 static inline int GET_BLOCKS(const int N) {
   return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
diff --git a/paddle/fluid/operators/bce_loss_op.cu b/paddle/fluid/operators/bce_loss_op.cu
index 179e194a9c55e7d8a4e65b2f98c5bd21f8d53f6b..8e30f4eb15b6afde885512206c7eaeb721cdd44b 100644
--- a/paddle/fluid/operators/bce_loss_op.cu
+++ b/paddle/fluid/operators/bce_loss_op.cu
@@ -24,14 +24,10 @@ namespace operators {
 
 using Tensor = framework::Tensor;
 
-#define CUDA_1D_KERNEL_LOOP(i, n)                              \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
-
 template <typename T>
 __global__ void GPUBCELossForward(const T* x_data, const T* label_data,
                                   T* out_data, const int in_numel) {
-  CUDA_1D_KERNEL_LOOP(i, in_numel) {
+  CUDA_KERNEL_LOOP(i, in_numel) {
     T x = x_data[i];
     T label = label_data[i];
     T one = static_cast<T>(1.);
@@ -48,7 +44,7 @@ template <typename T>
 __global__ void GPUBCELossBackward(const T* x_data, const T* label_data,
                                    const T* dout_data, T* dx_data,
                                    const int in_numel) {
-  CUDA_1D_KERNEL_LOOP(i, in_numel) {
+  CUDA_KERNEL_LOOP(i, in_numel) {
     T x = x_data[i];
     T label = label_data[i];
     T dout = dout_data[i];
diff --git a/paddle/fluid/operators/bilateral_slice_op.cc b/paddle/fluid/operators/bilateral_slice_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b742b4c0deea89dacd29a02588236b81ac13f6af
--- /dev/null
+++ b/paddle/fluid/operators/bilateral_slice_op.cc
@@ -0,0 +1,194 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/fluid/operators/bilateral_slice_op.h"
+#include <memory>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+using DataLayout = framework::DataLayout;
+
+class BilateralSliceOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "BilateralSlice");
+    OP_INOUT_CHECK(ctx->HasInput("Grid"), "Input", "Grid", "BilateralSlice");
+    OP_INOUT_CHECK(ctx->HasInput("Guide"), "Input", "Guide", "BilateralSlice");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Output", "BilateralSlice");
+
+    auto dim_x = ctx->GetInputDim("X");  // NCHW format
+    PADDLE_ENFORCE_EQ(
+        dim_x.size(), 4,
+        platform::errors::Unimplemented(
+            "Input(X) dimension must be 4, but got dimension = %d .",
+            dim_x.size()));
+
+    auto input_dims = ctx->GetInputDim("X");
+    auto grid_dims = ctx->GetInputDim("Grid");
+    auto guide_dims = ctx->GetInputDim("Guide");
+    bool has_offset = ctx->Attrs().Get<bool>("has_offset");
+    int64_t h = guide_dims[1];
+    int64_t w = guide_dims[2];
+    int64_t bs = grid_dims[0];
+    int64_t coeffs_chans = grid_dims[1];
+    int64_t input_chans = input_dims[1];
+
+    int64_t output_chans;
+    if (has_offset) {
+      PADDLE_ENFORCE_EQ((coeffs_chans % (input_chans + 1)), 0,
+                        platform::errors::InvalidArgument(
+                            "Slicing with affine offset, coefficients grid "
+                            "should have n_out*(n_in+1) channels, but got %d",
+                            coeffs_chans));
+      output_chans = coeffs_chans / (input_chans + 1);
+    } else {
+      PADDLE_ENFORCE_EQ((coeffs_chans % input_chans), 0,
+                        platform::errors::InvalidArgument(
+                            "Slicing without affine offset, coefficients grid "
+                            "should have n_out*n_in channels, but got %d .",
+                            coeffs_chans));
+      output_chans = coeffs_chans / input_chans;
+    }
+
+    std::vector<int64_t> output_dims;
+    output_dims.push_back(bs);
+    output_dims.push_back(output_chans);
+    output_dims.push_back(h);
+    output_dims.push_back(w);
+
+    ctx->SetOutputDim("Out", framework::make_ddim(output_dims));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
+  }
+
+  framework::OpKernelType GetKernelTypeForVar(
+      const std::string& var_name, const Tensor& tensor,
+      const framework::OpKernelType& expected_kernel_type) const override {
+    return framework::OpKernelType(expected_kernel_type.data_type_,
+                                   tensor.place(), tensor.layout());
+  }
+};
+
+class BilateralSliceOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "The input tensor of bilateral_slice operator, "
+             "This is a 4-D tensor with shape of [N, C, H, W]");
+    AddInput("Grid",
+             "This is a 5-D tensor. "
+             "It should be [N, C, D, H, W].");
+    AddInput("Guide",
+             "This is a 3-D tensor "
+             "It should be [N, H, W].");
+    AddOutput("Out",
+              "The output tensor of bilateral slice operator, "
+              "This is a tensor in same rank with Input(X).");
+    AddAttr<bool>("has_offset", "an optional bool. Defaults to False. ")
+        .SetDefault(false);
+    AddComment(R"DOC(
+          This operator enhance input X according guide and grid
+          For details of bilateral slice, please refer to paper:
+          https://groups.csail.mit.edu/graphics/hdrnet/
+         )DOC");
+  }
+};
+
+class BilateralSliceOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "BilateralSliceOpGrad");
+    OP_INOUT_CHECK(ctx->HasInput("Grid"), "Input", "Grid",
+                   "BilateralSliceOpGrad");
+    OP_INOUT_CHECK(ctx->HasInput("Guide"), "Input", "Guide",
+                   "BilateralSliceOpGrad");
+    OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", "Out",
+                   "BilateralSliceOpGrad");
+
+    auto dim_x = ctx->GetInputDim("X");
+    auto dim_grid = ctx->GetInputDim("Grid");
+    auto dim_guide = ctx->GetInputDim("Guide");
+    if (ctx->HasOutput(framework::GradVarName("X"))) {
+      ctx->SetOutputDim(framework::GradVarName("X"), dim_x);
+    }
+    if (ctx->HasOutput(framework::GradVarName("Grid"))) {
+      ctx->SetOutputDim(framework::GradVarName("Grid"), dim_grid);
+    }
+    if (ctx->HasOutput(framework::GradVarName("Guide"))) {
+      ctx->SetOutputDim(framework::GradVarName("Guide"), dim_guide);
+    }
+  }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType(
+                                       ctx, framework::GradVarName("Out")),
+                                   ctx.GetPlace());
+  }
+};
+
+template <typename T>
+class BilateralSliceGradMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> op) const override {
+    op->SetType(this->ForwardOpType() + "_grad");
+    op->SetInput("X", this->Input("X"));
+    op->SetInput("Grid", this->Input("Grid"));
+    op->SetInput("Guide", this->Input("Guide"));
+
+    op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
+    op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
+    op->SetOutput(framework::GradVarName("Grid"), this->InputGrad("Grid"));
+    op->SetOutput(framework::GradVarName("Guide"), this->InputGrad("Guide"));
+    op->SetAttrMap(this->Attrs());
+  }
+};
+
+template <typename T>
+class BilateralSliceKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
+                      platform::errors::Unimplemented(
+                          "BilateralSlice only supports GPU now."));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(bilateral_slice, ops::BilateralSliceOp,
+                  ops::BilateralSliceOpMaker,
+                  ops::BilateralSliceGradMaker<paddle::framework::OpDesc>,
+                  ops::BilateralSliceGradMaker<paddle::imperative::OpBase>);
+REGISTER_OPERATOR(bilateral_slice_grad, ops::BilateralSliceOpGrad);
+REGISTER_OP_CPU_KERNEL(bilateral_slice, ops::BilateralSliceKernel<float>,
+                       ops::BilateralSliceKernel<double>);
diff --git a/paddle/fluid/operators/bilateral_slice_op.cu b/paddle/fluid/operators/bilateral_slice_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e46950f61887dd64123135faec36ee0df11c0683
--- /dev/null
+++ b/paddle/fluid/operators/bilateral_slice_op.cu
@@ -0,0 +1,506 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <algorithm>
+#include <string>
+#include "paddle/fluid/operators/bilateral_slice_op.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+#include "paddle/fluid/platform/gpu_launch_config.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+using DataLayout = framework::DataLayout;
+
+template <typename T>
+__device__ T DiffAbs(T x) {
+  T eps = 1e-8;
+  return sqrt(x * x + eps);
+}
+
+template <typename T>
+__device__ T DdiffAbs(T x) {
+  T eps = 1e-8;
+  return x / sqrt(x * x + eps);
+}
+
+template <typename T>
+__device__ T WeightZ(T x) {
+  T abx = DiffAbs(x);
+  return max(1.0f - abx, 0.0f);
+}
+
+template <typename T>
+__device__ T DweightZ(T x) {
+  T abx = DiffAbs(x);
+  if (abx > 1.0f) {
+    return 0.0f;
+  } else {
+    return DdiffAbs(x);
+  }
+}
+
+template <typename T>
+__global__ void BilateralSliceCudaForwardKernel(
+    T* output, const T* bilateral_grid, const T* guide, const T* input,
+    GridSizes gsz, bool has_offset, int total_count, int output_chans) {
+  int h = gsz.h;
+  int w = gsz.w;
+  int gd = gsz.gd;
+  int gh = gsz.gh;
+  int gw = gsz.gw;
+  int input_chans = gsz.input_chans;
+  int coeff_stride = input_chans;
+  int grid_chans = input_chans * output_chans;
+
+  if (has_offset) {
+    grid_chans += output_chans;
+    coeff_stride += 1;
+  }
+
+  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < total_count;
+       idx += blockDim.x * gridDim.x) {
+    int x = idx % w;
+    int y = (idx / w) % h;
+    int out_c = (idx / (h * w)) % output_chans;
+    int b = (idx / (output_chans * w * h));
+
+    T gx = (x + 0.5f) * gw / (1.0f * w);
+    T gy = (y + 0.5f) * gh / (1.0f * h);
+    T gz = guide[x + w * (y + h * b)] * gd;
+
+    int fx = static_cast<int>(floor(gx - 0.5f));
+    int fy = static_cast<int>(floor(gy - 0.5f));
+    int fz = static_cast<int>(floor(gz - 0.5f));
+
+    int sy = gw;
+    int sz = gw * gh;
+    int sc = gd * gw * gh;
+    int sb = grid_chans * gd * gw * gh;
+
+    T value = 0.0f;
+    for (int in_c = 0; in_c < coeff_stride; ++in_c) {
+      T coeff_sample = 0.0f;
+
+      for (int xx = fx; xx < fx + 2; ++xx) {
+        int x_ = max(min(xx, gw - 1), 0);
+        T wx = max(1.0f - abs(xx + 0.5 - gx), 0.0f);
+
+        for (int yy = fy; yy < fy + 2; ++yy) {
+          int y_ = max(min(yy, gh - 1), 0);
+          T wy = max(1.0f - abs(yy + 0.5 - gy), 0.0f);
+
+          for (int zz = fz; zz < fz + 2; ++zz) {
+            int z_ = max(min(zz, gd - 1), 0);
+            T wz = WeightZ(zz + 0.5 - gz);
+            int c_ = coeff_stride * out_c + in_c;
+            int grid_idx = x_ + sy * y_ + sz * z_ + sc * c_ + sb * b;
+
+            coeff_sample += bilateral_grid[grid_idx] * wx * wy * wz;
+          }
+        }
+      }
+      if (in_c < input_chans) {
+        int input_idx = x + w * (y + h * (in_c + input_chans * b));
+        value += coeff_sample * input[input_idx];
+      } else {
+        value += coeff_sample;
+      }
+    }
+
+    output[idx] = value;
+  }
+}
+
+template <typename T>
+class BilateralSliceOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<Tensor>("X");
+    auto* grid = ctx.Input<Tensor>("Grid");
+    auto* guide = ctx.Input<Tensor>("Guide");
+    auto* output = ctx.Output<Tensor>("Out");
+
+    auto* output_data = output->mutable_data<T>(ctx.GetPlace());
+    auto* grid_data = grid->data<T>();
+    auto* guide_data = guide->data<T>();
+    auto* input_data = input->data<T>();
+
+    bool has_offset = ctx.Attr<bool>("has_offset");
+    auto input_dims = input->dims();
+    auto output_dims = output->dims();
+    auto grid_dims = grid->dims();
+
+    int batch_size = input_dims[0];
+    int h = input_dims[2];
+    int w = input_dims[3];
+    int input_chans = input_dims[1];
+    int coeff_stride = input_chans;
+    int grid_chans = input_chans * output_dims[1];
+
+    int64_t coeffs_chans = grid_dims[1];
+    int64_t gd = grid_dims[2];
+    int64_t gh = grid_dims[3];
+    int64_t gw = grid_dims[4];
+
+    GridSizes grid_sizes;
+    grid_sizes.h = h;
+    grid_sizes.w = w;
+    grid_sizes.bs = batch_size;
+    grid_sizes.coeffs_chans = coeffs_chans;
+    grid_sizes.gd = gd;
+    grid_sizes.gh = gh;
+    grid_sizes.gw = gw;
+    grid_sizes.input_chans = input_chans;
+
+    int total_count = batch_size * h * w * output_dims[1];
+
+    platform::GpuLaunchConfig config =
+        platform::getGpuLaunchConfig(total_count, ctx);
+
+    BilateralSliceCudaForwardKernel<T><<<config.blocks, config.threads, 0,
+                                         ctx.cuda_device_context().stream()>>>(
+        output_data, grid_data, guide_data, input_data, grid_sizes, has_offset,
+        total_count, output_dims[1]);
+  }
+};
+
+template <typename T>
+__global__ void BilateralSliceCudaGridGradKernel(
+    T* out_grid_grad, const T* upstream_grad, const T* guide, const T* input,
+    GridSizes gsz, bool has_offset, int grid_count, int output_chans) {
+  int h = gsz.h;
+  int w = gsz.w;
+  int gd = gsz.gd;
+  int gh = gsz.gh;
+  int gw = gsz.gw;
+  int input_chans = gsz.input_chans;
+  int grid_chans = input_chans * output_chans;
+  int coeff_stride = input_chans;
+
+  if (has_offset) {
+    grid_chans += output_chans;
+    coeff_stride += 1;
+  }
+
+  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < grid_count;
+       idx += blockDim.x * gridDim.x) {
+    int gx = idx % gw;
+    int gy = (idx / gw) % gh;
+    int gz = (idx / (gh * gw)) % gd;
+    int c = (idx / (gd * gh * gw)) % grid_chans;
+    int b = (idx / (grid_chans * gd * gw * gh));
+
+    T scale_w = w * 1.0 / gw;
+    T scale_h = h * 1.0 / gh;
+
+    int left_x = static_cast<int>(floor(scale_w * (gx + 0.5 - 1)));
+    int right_x = static_cast<int>(ceil(scale_w * (gx + 0.5 + 1)));
+    int left_y = static_cast<int>(floor(scale_h * (gy + 0.5 - 1)));
+    int right_y = static_cast<int>(ceil(scale_h * (gy + 0.5 + 1)));
+
+    int sy = w;
+    int sc = w * h;
+    int sb = output_chans * w * h;
+
+    int isy = w;
+    int isc = h * w;
+    int isb = input_chans * h * w;
+
+    int out_c = c / coeff_stride;
+    int in_c = c % coeff_stride;
+
+    T value = 0.0f;
+    for (int x = left_x; x < right_x; ++x) {
+      int x_ = x;
+
+      if (x_ < 0) {
+        x_ = -x_ - 1;
+      }
+      if (x_ >= w) {
+        x_ = 2 * w - 1 - x_;
+      }
+
+      T gx2 = (x + 0.5f) / scale_w;
+      T wx = max(1.0f - abs(gx + 0.5 - gx2), 0.0f);
+
+      for (int y = left_y; y < right_y; ++y) {
+        int y_ = y;
+
+        if (y_ < 0) {
+          y_ = -y_ - 1;
+        }
+        if (y_ >= h) {
+          y_ = 2 * h - 1 - y_;
+        }
+
+        T gy2 = (y + 0.5f) / scale_h;
+        T wy = max(1.0f - abs(gy + 0.5 - gy2), 0.0f);
+
+        int guide_idx = x_ + w * y_ + h * w * b;
+        T gz2 = guide[guide_idx] * gd;
+        T wz = WeightZ(gz + 0.5f - gz2);
+        if (((gz == 0) && (gz2 < 0.5f)) ||
+            ((gz == (gd - 1)) && (gz2 > (gd - 0.5f)))) {
+          wz = 1.0f;
+        }
+
+        int back_idx = x_ + sy * y_ + sc * out_c + sb * b;
+        if (in_c < input_chans) {
+          int input_idx = x_ + isy * y_ + isc * in_c + isb * b;
+          value += wz * wx * wy * upstream_grad[back_idx] * input[input_idx];
+        } else {
+          value += wz * wx * wy * upstream_grad[back_idx];
+        }
+      }
+    }
+    out_grid_grad[idx] = value;
+  }
+}
+
+template <typename T>
+__global__ void BilateralSliceCudaGuideGradKernel(
+    T* out_guide_grad, const T* upstream_grad, const T* bilateral_grid,
+    const T* guide, const T* input, GridSizes gsz, bool has_offset,
+    int guide_count, int output_chans) {
+  int h = gsz.h;
+  int w = gsz.w;
+  int gd = gsz.gd;
+  int gh = gsz.gh;
+  int gw = gsz.gw;
+  int input_chans = gsz.input_chans;
+  int grid_chans = input_chans * output_chans;
+  int coeff_stride = input_chans;
+
+  if (has_offset) {
+    grid_chans += output_chans;
+    coeff_stride += 1;
+  }
+
+  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < guide_count;
+       idx += blockDim.x * gridDim.x) {
+    int x = idx % w;
+    int y = (idx / w) % h;
+    int b = (idx / (w * h));
+
+    T gx = (x + 0.5f) * gw / (1.0f * w);
+    T gy = (y + 0.5f) * gh / (1.0f * h);
+    T gz = guide[x + w * (y + h * b)] * gd;
+
+    int fx = static_cast<int>(floor(gx - 0.5f));
+    int fy = static_cast<int>(floor(gy - 0.5f));
+    int fz = static_cast<int>(floor(gz - 0.5f));
+
+    int sy = gw;
+    int sz = gh * gw;
+    int sc = gd * gh * gw;
+    int sb = grid_chans * gd * gw * gh;
+
+    T out_sum = 0.0f;
+    for (int out_c = 0; out_c < output_chans; ++out_c) {
+      T in_sum = 0.0f;
+      for (int in_c = 0; in_c < coeff_stride; ++in_c) {
+        T grid_sum = 0.0f;
+        for (int xx = fx; xx < fx + 2; ++xx) {
+          int x_ = max(min(xx, gw - 1), 0);
+          T wx = max(1.0f - abs(xx + 0.5 - gx), 0.0f);
+
+          for (int yy = fy; yy < fy + 2; ++yy) {
+            int y_ = max(min(yy, gh - 1), 0);
+            T wy = max(1.0f - abs(yy + 0.5 - gy), 0.0f);
+
+            for (int zz = fz; zz < fz + 2; ++zz) {
+              int z_ = max(min(zz, gd - 1), 0);
+              T dwz = gd * DweightZ(zz + 0.5 - gz);
+
+              int c_ = coeff_stride * out_c + in_c;
+              int grid_idx = x_ + sy * y_ + sz * z_ + sc * c_ + sb * b;
+              grid_sum += bilateral_grid[grid_idx] * wx * wy * dwz;
+            }
+          }
+        }
+
+        if (in_c < input_chans) {
+          in_sum +=
+              grid_sum * input[x + w * (y + h * (in_c + input_chans * b))];
+        } else {
+          in_sum += grid_sum;
+        }
+      }
+
+      out_sum +=
+          in_sum * upstream_grad[x + w * (y + h * (out_c + output_chans * b))];
+    }
+
+    out_guide_grad[idx] = out_sum;
+  }
+}
+
+template <typename T>
+__global__ void BilateralSliceCudaInputGradKernel(
+    T* out_input_grad, const T* upstream_grad, const T* bilateral_grid,
+    const T* guide, GridSizes gsz, bool has_offset, int input_count,
+    int output_chans) {
+  int h = gsz.h;
+  int w = gsz.w;
+  int gd = gsz.gd;
+  int gh = gsz.gh;
+  int gw = gsz.gw;
+  int input_chans = gsz.input_chans;
+  int grid_chans = input_chans * output_chans;
+  int coeff_stride = input_chans;
+
+  if (has_offset) {
+    grid_chans += output_chans;
+    coeff_stride += 1;
+  }
+
+  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < input_count;
+       idx += blockDim.x * gridDim.x) {
+    int x = idx % w;
+    int y = (idx / w) % h;
+    int in_c = (idx / (h * w)) % input_chans;
+    int b = (idx / (input_chans * w * h));
+
+    T gx = (x + 0.5f) * gw / (1.0f * w);
+    T gy = (y + 0.5f) * gh / (1.0f * h);
+    T gz = guide[x + w * (y + h * b)] * gd;
+
+    int fx = static_cast<int>(floor(gx - 0.5f));
+    int fy = static_cast<int>(floor(gy - 0.5f));
+    int fz = static_cast<int>(floor(gz - 0.5f));
+
+    int sy = gw;
+    int sz = gh * gw;
+    int sc = gd * gh * gw;
+    int sb = grid_chans * gd * gh * gw;
+
+    T value = 0.0f;
+    for (int out_c = 0; out_c < output_chans; ++out_c) {
+      T chan_val = 0.0f;
+
+      for (int xx = fx; xx < fx + 2; ++xx) {
+        int x_ = max(min(xx, gw - 1), 0);
+        T wx = max(1.0f - abs(xx + 0.5 - gx), 0.0f);
+
+        for (int yy = fy; yy < fy + 2; ++yy) {
+          int y_ = max(min(yy, gh - 1), 0);
+          T wy = max(1.0f - abs(yy + 0.5 - gy), 0.0f);
+
+          for (int zz = fz; zz < fz + 2; ++zz) {
+            int z_ = max(min(zz, gd - 1), 0);
+
+            T wz = WeightZ(zz + 0.5 - gz);
+
+            int c_ = coeff_stride * out_c + in_c;
+            int grid_idx = x_ + sy * y_ + sz * z_ + sc * c_ + sb * b;
+            chan_val += bilateral_grid[grid_idx] * wx * wy * wz;
+          }
+        }
+      }
+
+      value += chan_val *
+               upstream_grad[x + w * (y + h * (out_c + output_chans * b))];
+    }
+    out_input_grad[idx] = value;
+  }
+}
+
+template <typename T>
+class BilateralSliceGradOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<Tensor>("X");
+    auto* guide = ctx.Input<Tensor>("Guide");
+    auto* grid = ctx.Input<Tensor>("Grid");
+    auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* grid_grad = ctx.Output<Tensor>(framework::GradVarName("Grid"));
+    auto* guide_grad = ctx.Output<Tensor>(framework::GradVarName("Guide"));
+    auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
+
+    const T* input_data = input->data<T>();
+    const T* guide_data = guide->data<T>();
+    const T* grid_data = grid->data<T>();
+    const T* output_grad_data = output_grad->data<T>();
+
+    T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
+    T* guide_grad_data = guide_grad->mutable_data<T>(ctx.GetPlace());
+    T* grid_grad_data = grid_grad->mutable_data<T>(ctx.GetPlace());
+
+    bool has_offset = ctx.Attr<bool>("has_offset");
+
+    auto input_grad_dims = input_grad->dims();
+    auto grid_dims = grid_grad->dims();
+
+    int batch_size = input_grad_dims[0];
+    int h = input_grad_dims[2];
+    int w = input_grad_dims[3];
+    int input_chans = input_grad_dims[1];
+
+    int64_t coeffs_chans = grid_dims[1];
+    int64_t gd = grid_dims[2];
+    int64_t gh = grid_dims[3];
+    int64_t gw = grid_dims[4];
+
+    int output_chans = 0;
+    if (has_offset) {
+      output_chans = coeffs_chans / (input_chans + 1);
+    } else {
+      output_chans = coeffs_chans / input_chans;
+    }
+    int grid_count = batch_size * gh * gw * gd * coeffs_chans;
+    int guide_count = batch_size * h * w;
+    int input_count = batch_size * h * w * input_chans;
+
+    GridSizes grid_sizes;
+    grid_sizes.h = h;
+    grid_sizes.w = w;
+    grid_sizes.bs = batch_size;
+    grid_sizes.coeffs_chans = coeffs_chans;
+    grid_sizes.gd = gd;
+    grid_sizes.gh = gh;
+    grid_sizes.gw = gw;
+    grid_sizes.input_chans = input_chans;
+
+    platform::GpuLaunchConfig config =
+        platform::getGpuLaunchConfig(grid_count, ctx, 512);
+
+    BilateralSliceCudaGridGradKernel<T><<<config.blocks, config.threads, 0,
+                                          ctx.cuda_device_context().stream()>>>(
+        grid_grad_data, output_grad_data, guide_data, input_data, grid_sizes,
+        has_offset, grid_count, output_chans);
+
+    config = platform::getGpuLaunchConfig(guide_count, ctx, 512);
+
+    BilateralSliceCudaGuideGradKernel<T><<<
+        config.blocks, config.threads, 0, ctx.cuda_device_context().stream()>>>(
+        guide_grad_data, output_grad_data, grid_data, guide_data, input_data,
+        grid_sizes, has_offset, guide_count, output_chans);
+
+    config = platform::getGpuLaunchConfig(input_count, ctx, 512);
+
+    BilateralSliceCudaInputGradKernel<T><<<
+        config.blocks, config.threads, 0, ctx.cuda_device_context().stream()>>>(
+        input_grad_data, output_grad_data, grid_data, guide_data, grid_sizes,
+        has_offset, input_count, output_chans);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(bilateral_slice, ops::BilateralSliceOpCUDAKernel<float>,
+                        ops::BilateralSliceOpCUDAKernel<double>);
+REGISTER_OP_CUDA_KERNEL(bilateral_slice_grad,
+                        ops::BilateralSliceGradOpCUDAKernel<float>,
+                        ops::BilateralSliceGradOpCUDAKernel<double>);
diff --git a/paddle/fluid/operators/bilateral_slice_op.h b/paddle/fluid/operators/bilateral_slice_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..0903fe4c71d3d7123c6f340d9e83d526c72dfccb
--- /dev/null
+++ b/paddle/fluid/operators/bilateral_slice_op.h
@@ -0,0 +1,33 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#pragma once
+#include <algorithm>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/hostdevice.h"
+
+namespace paddle {
+namespace operators {
+
+struct GridSizes {
+  int64_t h;
+  int64_t w;
+  int64_t bs;
+  int64_t coeffs_chans;
+  int64_t gd;
+  int64_t gh;
+  int64_t gw;
+  int64_t input_chans;
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/clip_by_norm_op.h b/paddle/fluid/operators/clip_by_norm_op.h
index a2562c44d70f4038c6b0b93d5464796014f30db3..f6f99369636268b2d628a211e636f8e0770cfbcf 100644
--- a/paddle/fluid/operators/clip_by_norm_op.h
+++ b/paddle/fluid/operators/clip_by_norm_op.h
@@ -82,7 +82,12 @@ class ClipByNormKernel : public framework::OpKernel<T> {
     auto scaling = temp + (static_cast<T>(1) - temp) * max_norm / x_norm;
     Eigen::array<int, 1> one_dim{{1}};
     Eigen::DSizes<int, 1> m_dsize(input->numel());
-    out.device(place) = x * scaling.reshape(one_dim).broadcast(m_dsize);
+    if (context.GetPlace() == platform::CPUPlace()) {
+      out.device(place) =
+          x * scaling.reshape(one_dim).eval().broadcast(m_dsize);
+    } else {
+      out.device(place) = x * scaling.reshape(one_dim).broadcast(m_dsize);
+    }
   }
 };
 
diff --git a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc
index 0621fa938c9f854ef1c906620f3e474c375efb8a..e2b09be5a9dfff0111ab80d89bdd76b99517738f 100644
--- a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc
+++ b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc
@@ -1,8 +1,11 @@
 /* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
+
     http://www.apache.org/licenses/LICENSE-2.0
+
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -21,7 +24,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/threadpool.h"
-#include "paddle/fluid/operators/distributed/barrier_monitor.h"
 #include "paddle/fluid/operators/distributed/distributed.h"
 #include "paddle/fluid/operators/distributed/request_handler_impl.h"
 
@@ -93,39 +95,22 @@ class CGenNCCLIdOp : public framework::OperatorBase {
         new RPCSERVER_T(endpoint, 1));
 
     rpc_service->RegisterRPC(distributed::kRequestSend, &rpc_h);
-    distributed::RequestNotifyHandler notify_h(
-        distributed::DistributedMode::kSync, -1);
-
-    rpc_service->RegisterRPC(distributed::kRequestSend, &rpc_h);
-    rpc_service->RegisterRPC(distributed::kRequestNotify, &notify_h);
+    rpc_h.SetRPCServer(rpc_service.get());
 
     framework::ProgramDesc empty_program;
     framework::Executor executor(dev_ctx.GetPlace());
-
-    rpc_h.SetRPCServer(rpc_service.get());
     rpc_h.SetScope(scope);
     rpc_h.SetDevCtx(&dev_ctx);
     rpc_h.SetProgram(&empty_program);
     rpc_h.SetExecutor(&executor);
 
-    notify_h.SetRPCServer(rpc_service.get());
-    notify_h.SetScope(scope);
-    notify_h.SetDevCtx(&dev_ctx);
-    notify_h.SetProgram(&empty_program);
-    notify_h.SetExecutor(&executor);
-
-    distributed::BarrierMonitor::Init(1);
-    auto* barrier = distributed::BarrierMonitor::GetInstance();
-    barrier->Reset(1, distributed::BarrierType::kSendBarrier);
-
     std::thread server_thread(
         std::bind(&distributed::RPCServer::StartServer, rpc_service.get()));
 
+    rpc_service->SetCond(distributed::kRequestSend);
     VLOG(3) << "start getting nccl id from trainer 0...";
-    barrier->WaitServerWeakup();
-    barrier->ServerWeakup();
+    rpc_service->WaitBarrier(distributed::kRequestSend);
     VLOG(3) << "got nccl id and stop server...";
-    barrier->Stop();
     rpc_service->ShutDown();
     VLOG(3) << "rpc server stopped";
     server_thread.join();
@@ -138,6 +123,7 @@ class CGenNCCLIdOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("Out", "Raw variable contains a NCCL UniqueId instaces.");
     AddComment(R"DOC(
 CGenNCCLId operator
+
 For trainer 0: generate a new UniqueId and send it to all the other trainers.
 For trainer 1~n: start a gRPC server to get the UniqueId, once got, stop the server.
 )DOC");
diff --git a/paddle/fluid/operators/concat_op.h b/paddle/fluid/operators/concat_op.h
index eceb68815e76854e47e3fe90f275aa2d9f96faae..c9dcda1adb3f7bd481df3aa483b9bd3338e9e211 100644
--- a/paddle/fluid/operators/concat_op.h
+++ b/paddle/fluid/operators/concat_op.h
@@ -31,12 +31,19 @@ static inline framework::DDim ComputeAndCheckShape(
   auto out_dims = inputs_dims[0];
   size_t in_zero_dims_size = out_dims.size();
   for (size_t i = 1; i < n; i++) {
+    PADDLE_ENFORCE_EQ(inputs_dims[i].size(), out_dims.size(),
+                      platform::errors::InvalidArgument(
+                          "The shape of input[0] and input[%d] "
+                          "is expected to be equal."
+                          "But received input[0]'s shape = "
+                          "[%s], input[%d]'s shape = [%s].",
+                          i, inputs_dims[0], i, inputs_dims[i]));
     for (size_t j = 0; j < in_zero_dims_size; j++) {
       if (j == axis) {
         if (is_runtime) {
           out_dims[axis] += inputs_dims[i][j];
         } else {
-          if (inputs_dims[i][j] == -1) {
+          if (inputs_dims[i][j] == -1 || out_dims[j] == -1) {
             out_dims[axis] = -1;
           } else {
             out_dims[axis] += inputs_dims[i][j];
@@ -55,6 +62,9 @@ static inline framework::DDim ComputeAndCheckShape(
                                 "[%s], input[%d]'s shape = [%s].",
                                 j, i, inputs_dims[0], i, inputs_dims[i]));
         }
+        if (!is_runtime && out_dims[j] == -1 && inputs_dims[i][j] > 0) {
+          out_dims[j] = inputs_dims[i][j];
+        }
       }
     }
   }
diff --git a/paddle/fluid/operators/controlflow/logical_op.cc b/paddle/fluid/operators/controlflow/logical_op.cc
index e1cecb0a049a508c93e7ffb64f0de6d5536f27a0..74589dcb6a74c79299ef682de0bce146f33ec261 100644
--- a/paddle/fluid/operators/controlflow/logical_op.cc
+++ b/paddle/fluid/operators/controlflow/logical_op.cc
@@ -24,12 +24,12 @@ class BinaryLogicalOpProtoMaker : public framework::OpProtoAndCheckerMaker {
   void Make() override {
     OpComment comment;
     AddInput("X", string::Sprintf("Left hand operand of %s operator. Must be "
-                                  "a LoDTensor or Tensor of type bool.",
+                                  "a Variable of type bool.",
                                   comment.type));
     AddInput("Y", string::Sprintf("Right hand operand of %s operator. Must be "
-                                  "a LoDTensor or Tensor of type bool.",
+                                  "a Variable of type bool.",
                                   comment.type));
-    AddOutput("Out", string::Sprintf("n-dim bool LoDTensor or Tensor"));
+    AddOutput("Out", string::Sprintf("n-dim bool Variable"));
     AddComment(string::Sprintf(R"DOC(%s Operator
 
 It operates element-wise on X and Y, and returns the Out. X, Y and Out are N-dim boolean LoDTensor or Tensor.
diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc
index ca24261bcc84e2d476891ef5ab7b89a981437b36..2ea15c85f338165df06763afc9a886228de8722e 100644
--- a/paddle/fluid/operators/cross_entropy_op.cc
+++ b/paddle/fluid/operators/cross_entropy_op.cc
@@ -145,11 +145,10 @@ class CrossEntropyGradientOpBase : public framework::OperatorWithKernel {
             "But received: Y@Grad's rank is [%d], Y's rank is [%d]",
             dy_dims.size(), label_dims.size()));
 
-    bool check = true;
-    if ((!ctx->IsRuntime()) &&
-        (framework::product(x_dims) <= 0 || framework::product(dy_dims) <= 0)) {
-      check = false;
-    }
+    bool contain_unknown_dim = framework::contain_unknown_dim(x_dims) ||
+                               framework::contain_unknown_dim(dy_dims);
+
+    bool check = ctx->IsRuntime() || !contain_unknown_dim;
 
     if (check) {
       PADDLE_ENFORCE_EQ(
diff --git a/paddle/fluid/operators/cvm_op.cu b/paddle/fluid/operators/cvm_op.cu
index 1f8470caff1337a0869d5c14d40330634abb7197..75976c968c9e8b7dafb172d55168a297ec875238 100644
--- a/paddle/fluid/operators/cvm_op.cu
+++ b/paddle/fluid/operators/cvm_op.cu
@@ -25,10 +25,6 @@ using platform::PADDLE_CUDA_NUM_THREADS;
 using Tensor = framework::Tensor;
 using LoDTensor = framework::LoDTensor;
 
-#define CUDA_KERNEL_LOOP(i, n)                                 \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
-
 template <typename T>
 __global__ void CvmComputeKernel(const bool use_cvm, const int64_t item_width,
                                  const T* X, T* Y, int64_t numel) {
diff --git a/paddle/fluid/operators/data_norm_op.cu b/paddle/fluid/operators/data_norm_op.cu
index 483bb5ec5c7f6a609e63b592b4b2bb604a889301..9e284b1dcdaae932bbd0d59582294712f26fe663 100644
--- a/paddle/fluid/operators/data_norm_op.cu
+++ b/paddle/fluid/operators/data_norm_op.cu
@@ -30,10 +30,6 @@ using LoDTensor = framework::LoDTensor;
 using DataLayout = framework::DataLayout;
 using platform::PADDLE_CUDA_NUM_THREADS;
 
-#define CUDA_KERNEL_LOOP(i, n)                                 \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
-
 inline int GET_BLOCKS(const int N) {
   return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS;
 }
diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.cu b/paddle/fluid/operators/deformable_psroi_pooling_op.cu
index e977c70bf4d74fb768cdf02edd3569177f2ecccf..c1d4cc9d17ab4bfad80457964963c35595ff6a14 100644
--- a/paddle/fluid/operators/deformable_psroi_pooling_op.cu
+++ b/paddle/fluid/operators/deformable_psroi_pooling_op.cu
@@ -40,10 +40,6 @@ namespace operators {
 using Tensor = framework::Tensor;
 using LoDTensor = framework::LoDTensor;
 
-#define CUDA_KERNEL_LOOP(i, n)                                 \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
-
 const int CUDA_NUM_THREADS = 1024;
 static inline int GET_BLOCKS(const int N) {
   return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt
index 75a26ac2bdd077260954f0947ea419115aacea00..537063640e4ef6e49f7b991482f0f3122ecef02f 100644
--- a/paddle/fluid/operators/detection/CMakeLists.txt
+++ b/paddle/fluid/operators/detection/CMakeLists.txt
@@ -32,6 +32,7 @@ detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc)
 detection_library(generate_proposal_labels_op SRCS generate_proposal_labels_op.cc)
 detection_library(multiclass_nms_op SRCS multiclass_nms_op.cc DEPS gpc)
 detection_library(locality_aware_nms_op SRCS locality_aware_nms_op.cc DEPS gpc)
+detection_library(matrix_nms_op SRCS matrix_nms_op.cc DEPS gpc)
 detection_library(box_clip_op SRCS box_clip_op.cc box_clip_op.cu)
 detection_library(yolov3_loss_op SRCS yolov3_loss_op.cc)
 detection_library(yolo_box_op SRCS yolo_box_op.cc yolo_box_op.cu)
diff --git a/paddle/fluid/operators/detection/anchor_generator_op.cu b/paddle/fluid/operators/detection/anchor_generator_op.cu
index 3cc9bbeee1eeed17142a6b1bd23b45aff9cf745f..b4c27a63dbd2f2fdbd9b018aa1606a79d5b0002d 100644
--- a/paddle/fluid/operators/detection/anchor_generator_op.cu
+++ b/paddle/fluid/operators/detection/anchor_generator_op.cu
@@ -24,8 +24,7 @@ __global__ void GenAnchors(T* out, const T* aspect_ratios, const int ar_num,
                            const int width, const T offset) {
   int num_anchors = as_num * ar_num;
   int box_num = height * width * num_anchors;
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < box_num;
-       i += blockDim.x * gridDim.x) {
+  CUDA_KERNEL_LOOP(i, box_num) {
     int h_idx = i / (num_anchors * width);
     int w_idx = (i / num_anchors) % width;
     T stride_width = stride[0];
@@ -64,10 +63,7 @@ __global__ void GenAnchors(T* out, const T* aspect_ratios, const int ar_num,
 template <typename T>
 __global__ void SetVariance(T* out, const T* var, const int vnum,
                             const int num) {
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
-       i += blockDim.x * gridDim.x) {
-    out[i] = var[i % vnum];
-  }
+  CUDA_KERNEL_LOOP(i, num) { out[i] = var[i % vnum]; }
 }
 
 template <typename T>
diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu
index 6fac90bf2da683157eabefcb6b9bfc32f9f51f1e..35222a85cd388f6fef3c61c440be7b36598d9e01 100644
--- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu
+++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu
@@ -40,8 +40,7 @@ static inline int NumBlocks(const int N) {
 
 static __global__ void GetLengthLoD(const int nthreads, const int* batch_ids,
                                     int* length_lod) {
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (nthreads);
-       i += blockDim.x * gridDim.x) {
+  CUDA_KERNEL_LOOP(i, nthreads) {
     platform::CudaAtomicAdd(length_lod + batch_ids[i], 1);
   }
 }
diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu
index 1a89af9697d8bd16b80285af8783a54264084da1..1e3cd9f36c595f978f5b5e5f5c5cf5cad6dc9059 100644
--- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu
+++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu
@@ -31,10 +31,6 @@ using LoDTensor = framework::LoDTensor;
 static constexpr int kNumCUDAThreads = 64;
 static constexpr int kNumMaxinumNumBlocks = 4096;
 
-#define CUDA_1D_KERNEL_LOOP(i, n)                              \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
-
 int const BBoxSize = 4;
 
 static inline int NumBlocks(const int N) {
@@ -48,7 +44,7 @@ __global__ void GPUDistFpnProposalsHelper(
     const int refer_level, const int refer_scale, const int max_level,
     const int min_level, int* roi_batch_id_data, int* sub_lod_list,
     int* target_lvls) {
-  CUDA_1D_KERNEL_LOOP(i, nthreads) {
+  CUDA_KERNEL_LOOP(i, nthreads) {
     const T* offset_roi = rois + i * BBoxSize;
     int roi_batch_ind = roi_batch_id_data[i];
     // get the target level of current rois
diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc
index 663fabd3c456185d73510a0d7570d534316b38da..981a368e8564fbcd3d688bc67d2def8664bcfe8d 100644
--- a/paddle/fluid/operators/detection/generate_proposals_op.cc
+++ b/paddle/fluid/operators/detection/generate_proposals_op.cc
@@ -378,7 +378,7 @@ class GenerateProposalsKernel : public framework::OpKernel<T> {
       for (int i = 0; i < num; i++) {
         lod_data[i] = tmp_lod[i];
       }
-      rpn_rois_lod->Resize({num, 1});
+      rpn_rois_lod->Resize({num});
     }
     rpn_rois->set_lod(lod);
     rpn_roi_probs->set_lod(lod);
diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cu b/paddle/fluid/operators/detection/generate_proposals_op.cu
index aaa8dbfe60260038b1c9a22289cc1014ec6f5d59..fa7670f6d680a95da1c1abd5befe1651ccb7265f 100644
--- a/paddle/fluid/operators/detection/generate_proposals_op.cu
+++ b/paddle/fluid/operators/detection/generate_proposals_op.cu
@@ -33,9 +33,6 @@ using LoDTensor = framework::LoDTensor;
 namespace {
 
 #define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
-#define CUDA_1D_KERNEL_LOOP(i, n)                              \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
 
 int const kThreadsPerBlock = sizeof(uint64_t) * 8;
 
@@ -155,7 +152,7 @@ static __global__ void FilterBBoxes(const T *bboxes, const T *im_info,
   int cnt = 0;
   __shared__ int keep_index[BlockSize];
 
-  CUDA_1D_KERNEL_LOOP(i, num) {
+  CUDA_KERNEL_LOOP(i, num) {
     keep_index[threadIdx.x] = -1;
     __syncthreads();
 
diff --git a/paddle/fluid/operators/detection/matrix_nms_op.cc b/paddle/fluid/operators/detection/matrix_nms_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f7d45bc85bf6b14f07e9bfda3615e0b2d51a09f1
--- /dev/null
+++ b/paddle/fluid/operators/detection/matrix_nms_op.cc
@@ -0,0 +1,389 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+limitations under the License. */
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/detection/nms_util.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+
+class MatrixNMSOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("BBoxes"), "Input", "BBoxes", "MatrixNMS");
+    OP_INOUT_CHECK(ctx->HasInput("Scores"), "Input", "Scores", "MatrixNMS");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "MatrixNMS");
+    auto box_dims = ctx->GetInputDim("BBoxes");
+    auto score_dims = ctx->GetInputDim("Scores");
+    auto score_size = score_dims.size();
+
+    if (ctx->IsRuntime()) {
+      PADDLE_ENFORCE_EQ(score_size == 3, true,
+                        platform::errors::InvalidArgument(
+                            "The rank of Input(Scores) must be 3. "
+                            "But received rank = %d.",
+                            score_size));
+      PADDLE_ENFORCE_EQ(box_dims.size(), 3,
+                        platform::errors::InvalidArgument(
+                            "The rank of Input(BBoxes) must be 3."
+                            "But received rank = %d.",
+                            box_dims.size()));
+      PADDLE_ENFORCE_EQ(box_dims[2] == 4, true,
+                        platform::errors::InvalidArgument(
+                            "The last dimension of Input (BBoxes) must be 4, "
+                            "represents the layout of coordinate "
+                            "[xmin, ymin, xmax, ymax]."));
+      PADDLE_ENFORCE_EQ(
+          box_dims[1], score_dims[2],
+          platform::errors::InvalidArgument(
+              "The 2nd dimension of Input(BBoxes) must be equal to "
+              "last dimension of Input(Scores), which represents the "
+              "predicted bboxes."
+              "But received box_dims[1](%s) != socre_dims[2](%s)",
+              box_dims[1], score_dims[2]));
+    }
+    ctx->SetOutputDim("Out", {box_dims[1], box_dims[2] + 2});
+    ctx->SetOutputDim("Index", {box_dims[1], 1});
+    if (!ctx->IsRuntime()) {
+      ctx->SetLoDLevel("Out", std::max(ctx->GetLoDLevel("BBoxes"), 1));
+      ctx->SetLoDLevel("Index", std::max(ctx->GetLoDLevel("BBoxes"), 1));
+    }
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "Scores"),
+        platform::CPUPlace());
+  }
+};
+
+template <typename T, bool gaussian>
+struct decay_score;
+
+template <typename T>
+struct decay_score<T, true> {
+  T operator()(T iou, T max_iou, T sigma) {
+    return std::exp((max_iou * max_iou - iou * iou) * sigma);
+  }
+};
+
+template <typename T>
+struct decay_score<T, false> {
+  T operator()(T iou, T max_iou, T sigma) {
+    return (1. - iou) / (1. - max_iou);
+  }
+};
+
+template <typename T, bool gaussian>
+void NMSMatrix(const Tensor& bbox, const Tensor& scores,
+               const T score_threshold, const T post_threshold,
+               const float sigma, const int64_t top_k, const bool normalized,
+               std::vector<int>* selected_indices,
+               std::vector<T>* decayed_scores) {
+  int64_t num_boxes = bbox.dims()[0];
+  int64_t box_size = bbox.dims()[1];
+
+  auto score_ptr = scores.data<T>();
+  auto bbox_ptr = bbox.data<T>();
+
+  std::vector<int32_t> perm(num_boxes);
+  std::iota(perm.begin(), perm.end(), 0);
+  auto end = std::remove_if(perm.begin(), perm.end(),
+                            [&score_ptr, score_threshold](int32_t idx) {
+                              return score_ptr[idx] <= score_threshold;
+                            });
+
+  auto sort_fn = [&score_ptr](int32_t lhs, int32_t rhs) {
+    return score_ptr[lhs] > score_ptr[rhs];
+  };
+
+  int64_t num_pre = std::distance(perm.begin(), end);
+  if (num_pre <= 0) {
+    return;
+  }
+  if (top_k > -1 && num_pre > top_k) {
+    num_pre = top_k;
+  }
+  std::partial_sort(perm.begin(), perm.begin() + num_pre, end, sort_fn);
+
+  std::vector<T> iou_matrix((num_pre * (num_pre - 1)) >> 1);
+  std::vector<T> iou_max(num_pre);
+
+  iou_max[0] = 0.;
+  for (int64_t i = 1; i < num_pre; i++) {
+    T max_iou = 0.;
+    auto idx_a = perm[i];
+    for (int64_t j = 0; j < i; j++) {
+      auto idx_b = perm[j];
+      auto iou = JaccardOverlap<T>(bbox_ptr + idx_a * box_size,
+                                   bbox_ptr + idx_b * box_size, normalized);
+      max_iou = std::max(max_iou, iou);
+      iou_matrix[i * (i - 1) / 2 + j] = iou;
+    }
+    iou_max[i] = max_iou;
+  }
+
+  if (score_ptr[perm[0]] > post_threshold) {
+    selected_indices->push_back(perm[0]);
+    decayed_scores->push_back(score_ptr[perm[0]]);
+  }
+
+  decay_score<T, gaussian> decay_fn;
+  for (int64_t i = 1; i < num_pre; i++) {
+    T min_decay = 1.;
+    for (int64_t j = 0; j < i; j++) {
+      auto max_iou = iou_max[j];
+      auto iou = iou_matrix[i * (i - 1) / 2 + j];
+      auto decay = decay_fn(iou, max_iou, sigma);
+      min_decay = std::min(min_decay, decay);
+    }
+    auto ds = min_decay * score_ptr[perm[i]];
+    if (ds <= post_threshold) continue;
+    selected_indices->push_back(perm[i]);
+    decayed_scores->push_back(ds);
+  }
+}
+
+template <typename T>
+class MatrixNMSKernel : public framework::OpKernel<T> {
+ public:
+  size_t MultiClassMatrixNMS(const Tensor& scores, const Tensor& bboxes,
+                             std::vector<T>* out, std::vector<int>* indices,
+                             int start, int64_t background_label,
+                             int64_t nms_top_k, int64_t keep_top_k,
+                             bool normalized, T score_threshold,
+                             T post_threshold, bool use_gaussian,
+                             float gaussian_sigma) const {
+    std::vector<int> all_indices;
+    std::vector<T> all_scores;
+    std::vector<T> all_classes;
+    all_indices.reserve(scores.numel());
+    all_scores.reserve(scores.numel());
+    all_classes.reserve(scores.numel());
+
+    size_t num_det = 0;
+    auto class_num = scores.dims()[0];
+    Tensor score_slice;
+    for (int64_t c = 0; c < class_num; ++c) {
+      if (c == background_label) continue;
+      score_slice = scores.Slice(c, c + 1);
+      if (use_gaussian) {
+        NMSMatrix<T, true>(bboxes, score_slice, score_threshold, post_threshold,
+                           gaussian_sigma, nms_top_k, normalized, &all_indices,
+                           &all_scores);
+      } else {
+        NMSMatrix<T, false>(bboxes, score_slice, score_threshold,
+                            post_threshold, gaussian_sigma, nms_top_k,
+                            normalized, &all_indices, &all_scores);
+      }
+      for (size_t i = 0; i < all_indices.size() - num_det; i++) {
+        all_classes.push_back(static_cast<T>(c));
+      }
+      num_det = all_indices.size();
+    }
+
+    if (num_det <= 0) {
+      return num_det;
+    }
+
+    if (keep_top_k > -1) {
+      auto k = static_cast<size_t>(keep_top_k);
+      if (num_det > k) num_det = k;
+    }
+
+    std::vector<int32_t> perm(all_indices.size());
+    std::iota(perm.begin(), perm.end(), 0);
+
+    std::partial_sort(perm.begin(), perm.begin() + num_det, perm.end(),
+                      [&all_scores](int lhs, int rhs) {
+                        return all_scores[lhs] > all_scores[rhs];
+                      });
+
+    for (size_t i = 0; i < num_det; i++) {
+      auto p = perm[i];
+      auto idx = all_indices[p];
+      auto cls = all_classes[p];
+      auto score = all_scores[p];
+      auto bbox = bboxes.data<T>() + idx * bboxes.dims()[1];
+      (*indices).push_back(start + idx);
+      (*out).push_back(cls);
+      (*out).push_back(score);
+      for (int j = 0; j < bboxes.dims()[1]; j++) {
+        (*out).push_back(bbox[j]);
+      }
+    }
+
+    return num_det;
+  }
+
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* boxes = ctx.Input<LoDTensor>("BBoxes");
+    auto* scores = ctx.Input<LoDTensor>("Scores");
+    auto* outs = ctx.Output<LoDTensor>("Out");
+    auto* index = ctx.Output<LoDTensor>("Index");
+
+    auto background_label = ctx.Attr<int>("background_label");
+    auto nms_top_k = ctx.Attr<int>("nms_top_k");
+    auto keep_top_k = ctx.Attr<int>("keep_top_k");
+    auto normalized = ctx.Attr<bool>("normalized");
+    auto score_threshold = ctx.Attr<float>("score_threshold");
+    auto post_threshold = ctx.Attr<float>("post_threshold");
+    auto use_gaussian = ctx.Attr<bool>("use_gaussian");
+    auto gaussian_sigma = ctx.Attr<float>("gaussian_sigma");
+
+    auto score_dims = scores->dims();
+    auto batch_size = score_dims[0];
+    auto num_boxes = score_dims[2];
+    auto box_dim = boxes->dims()[2];
+    auto out_dim = box_dim + 2;
+
+    Tensor boxes_slice, scores_slice;
+    size_t num_out = 0;
+    std::vector<size_t> offsets = {0};
+    std::vector<T> detections;
+    std::vector<int> indices;
+    detections.reserve(out_dim * num_boxes * batch_size);
+    indices.reserve(num_boxes * batch_size);
+    for (int i = 0; i < batch_size; ++i) {
+      scores_slice = scores->Slice(i, i + 1);
+      scores_slice.Resize({score_dims[1], score_dims[2]});
+      boxes_slice = boxes->Slice(i, i + 1);
+      boxes_slice.Resize({score_dims[2], box_dim});
+      int start = i * score_dims[2];
+      num_out = MultiClassMatrixNMS(
+          scores_slice, boxes_slice, &detections, &indices, start,
+          background_label, nms_top_k, keep_top_k, normalized, score_threshold,
+          post_threshold, use_gaussian, gaussian_sigma);
+      offsets.push_back(offsets.back() + num_out);
+    }
+
+    int64_t num_kept = offsets.back();
+    if (num_kept == 0) {
+      outs->mutable_data<T>({0, out_dim}, ctx.GetPlace());
+      index->mutable_data<int>({0, 1}, ctx.GetPlace());
+    } else {
+      outs->mutable_data<T>({num_kept, out_dim}, ctx.GetPlace());
+      index->mutable_data<int>({num_kept, 1}, ctx.GetPlace());
+      std::copy(detections.begin(), detections.end(), outs->data<T>());
+      std::copy(indices.begin(), indices.end(), index->data<int>());
+    }
+
+    framework::LoD lod;
+    lod.emplace_back(offsets);
+    outs->set_lod(lod);
+    index->set_lod(lod);
+  }
+};
+
+class MatrixNMSOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("BBoxes",
+             "(Tensor) A 3-D Tensor with shape "
+             "[N, M, 4] represents the predicted locations of M bounding boxes"
+             ", N is the batch size. "
+             "Each bounding box has four coordinate values and the layout is "
+             "[xmin, ymin, xmax, ymax], when box size equals to 4.");
+    AddInput("Scores",
+             "(Tensor) A 3-D Tensor with shape [N, C, M] represents the "
+             "predicted confidence predictions. N is the batch size, C is the "
+             "class number, M is number of bounding boxes. For each category "
+             "there are total M scores which corresponding M bounding boxes. "
+             " Please note, M is equal to the 2nd dimension of BBoxes. ");
+    AddAttr<int>(
+        "background_label",
+        "(int, default: 0) "
+        "The index of background label, the background label will be ignored. "
+        "If set to -1, then all categories will be considered.")
+        .SetDefault(0);
+    AddAttr<float>("score_threshold",
+                   "(float) "
+                   "Threshold to filter out bounding boxes with low "
+                   "confidence score.");
+    AddAttr<float>("post_threshold",
+                   "(float, default 0.) "
+                   "Threshold to filter out bounding boxes with low "
+                   "confidence score AFTER decaying.")
+        .SetDefault(0.);
+    AddAttr<int>("nms_top_k",
+                 "(int64_t) "
+                 "Maximum number of detections to be kept according to the "
+                 "confidences after the filtering detections based on "
+                 "score_threshold");
+    AddAttr<int>("keep_top_k",
+                 "(int64_t) "
+                 "Number of total bboxes to be kept per image after NMS "
+                 "step. -1 means keeping all bboxes after NMS step.");
+    AddAttr<bool>("normalized",
+                  "(bool, default true) "
+                  "Whether detections are normalized.")
+        .SetDefault(true);
+    AddAttr<bool>("use_gaussian",
+                  "(bool, default false) "
+                  "Whether to use Gaussian as decreasing function.")
+        .SetDefault(false);
+    AddAttr<float>("gaussian_sigma",
+                   "(float) "
+                   "Sigma for Gaussian decreasing function, only takes effect ",
+                   "when 'use_gaussian' is enabled.")
+        .SetDefault(2.);
+    AddOutput("Out",
+              "(LoDTensor) A 2-D LoDTensor with shape [No, 6] represents the "
+              "detections. Each row has 6 values: "
+              "[label, confidence, xmin, ymin, xmax, ymax]. "
+              "the offsets in first dimension are called LoD, the number of "
+              "offset is N + 1, if LoD[i + 1] - LoD[i] == 0, means there is "
+              "no detected bbox.");
+    AddOutput("Index",
+              "(LoDTensor) A 2-D LoDTensor with shape [No, 1] represents the "
+              "index of selected bbox. The index is the absolute index cross "
+              "batches.");
+    AddComment(R"DOC(
+This operator does multi-class matrix non maximum suppression (NMS) on batched
+boxes and scores.
+In the NMS step, this operator greedily selects a subset of detection bounding
+boxes that have high scores larger than score_threshold, if providing this
+threshold, then selects the largest nms_top_k confidences scores if nms_top_k
+is larger than -1. Then this operator decays boxes score according to the
+Matrix NMS scheme.
+Aftern NMS step, at most keep_top_k number of total bboxes are to be kept
+per image if keep_top_k is larger than -1.
+This operator support multi-class and batched inputs. It applying NMS
+independently for each class. The outputs is a 2-D LoDTenosr, for each
+image, the offsets in first dimension of LoDTensor are called LoD, the number
+of offset is N + 1, where N is the batch size. If LoD[i + 1] - LoD[i] == 0,
+means there is no detected bbox for this image.
+
+For more information on Matrix NMS, please refer to:
+https://arxiv.org/abs/2003.10152
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(
+    matrix_nms, ops::MatrixNMSOp, ops::MatrixNMSOpMaker,
+    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
+REGISTER_OP_CPU_KERNEL(matrix_nms, ops::MatrixNMSKernel<float>,
+                       ops::MatrixNMSKernel<double>);
diff --git a/paddle/fluid/operators/detection/prior_box_op.cu b/paddle/fluid/operators/detection/prior_box_op.cu
index 1ea8cfc1d2af8cc6c332768a467cdcd4c0166319..1ef37e8719883c091733b47a290466b6895317d4 100644
--- a/paddle/fluid/operators/detection/prior_box_op.cu
+++ b/paddle/fluid/operators/detection/prior_box_op.cu
@@ -32,8 +32,7 @@ __global__ void GenPriorBox(T* out, const T* aspect_ratios, const int height,
                             bool min_max_aspect_ratios_order) {
   int num_priors = max_sizes ? as_num * min_num + min_num : as_num * min_num;
   int box_num = height * width * num_priors;
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < box_num;
-       i += blockDim.x * gridDim.x) {
+  CUDA_KERNEL_LOOP(i, box_num) {
     int h = i / (num_priors * width);
     int w = (i / num_priors) % width;
     int p = i % num_priors;
@@ -87,10 +86,7 @@ __global__ void GenPriorBox(T* out, const T* aspect_ratios, const int height,
 template <typename T>
 __global__ void SetVariance(T* out, const T* var, const int vnum,
                             const int num) {
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
-       i += blockDim.x * gridDim.x) {
-    out[i] = var[i % vnum];
-  }
+  CUDA_KERNEL_LOOP(i, num) { out[i] = var[i % vnum]; }
 }
 
 template <typename T>
diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu
index fe65162353eb860e90de85499186f82ee72c1a6e..7b34e197ffe214c80af85003600e05e0a392962d 100644
--- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu
+++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu
@@ -30,10 +30,6 @@ namespace operators {
 #define idx4_2(index, d1, d2, d3, d4) ((index / d4 / d3) % d2)
 #define idx4_1(index, d1, d2, d3, d4) ((index / d4 / d3 / d2) % d1)
 
-#define CUDA_1D_KERNEL_LOOP(i, n)                              \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
-
 template <typename T>
 __device__ bool GT_E(T a, T b) {
   return (a > b) || Eigen::numext::abs(a - b) < 1e-4;
@@ -284,7 +280,7 @@ __global__ void RoiTransformKernel(const float* input_data,
                                    int* mask, T* transform_matrix) {
   int output_size =
       num_rois * transformed_height * transformed_width * channels;
-  CUDA_1D_KERNEL_LOOP(index, output_size) {
+  CUDA_KERNEL_LOOP(index, output_size) {
     // (n, c, out_h, out_w) is an element in the transformed output
     int out_w = idx4_4(index, num_rois, channels, transformed_height,
                        transformed_width);
@@ -463,7 +459,7 @@ __global__ void RoiTransformGradKernel(int out_size, const int* out2in_idx_data,
                                        const T* out2in_w_data,
                                        const T* out_grad_data,
                                        T* in_grad_data) {
-  CUDA_1D_KERNEL_LOOP(index, out_size * 4) {
+  CUDA_KERNEL_LOOP(index, out_size * 4) {
     int in_idx = out2in_idx_data[index];
     if (in_idx >= 0) {
       int out_idx = index / 4;
diff --git a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu
index 4031554aa72b51a82feaaacc894af7c1dbf6e382..f12d60c8b0fc00742f6fba86aaf55cf12eab82d5 100644
--- a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu
+++ b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu
@@ -30,10 +30,6 @@ static inline int NumBlocks(const int N) {
                   kNumMaxinumNumBlocks);
 }
 
-#define CUDA_1D_KERNEL_LOOP(i, n)                              \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
-
 template <typename T>
 __global__ void GPUSigmoidFocalLossForward(const T *x_data,
                                            const int *label_data,
@@ -41,7 +37,7 @@ __global__ void GPUSigmoidFocalLossForward(const T *x_data,
                                            const T gamma, const T alpha,
                                            const int num_classes,
                                            const int limit, T *out_data) {
-  CUDA_1D_KERNEL_LOOP(i, limit) {
+  CUDA_KERNEL_LOOP(i, limit) {
     T x = x_data[i];
     int a = i / num_classes;  // current sample
     int d = i % num_classes;  // current class
@@ -79,7 +75,7 @@ __global__ void GPUSigmoidFocalLossBackward(
     const T *x_data, const int *label_data, const int *fg_num_data,
     const T gamma, const T alpha, const int num_classes, const T *dout_data,
     const int limit, T *dx_data) {
-  CUDA_1D_KERNEL_LOOP(i, limit) {
+  CUDA_KERNEL_LOOP(i, limit) {
     T x = x_data[i];
     T dout = dout_data[i];
 
diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt
index 3cf15fe94168590f31c488648c0e67a82b7d1102..5aa91733fe3ed1bfc51b47b331488ce2211be2fb 100644
--- a/paddle/fluid/operators/distributed/CMakeLists.txt
+++ b/paddle/fluid/operators/distributed/CMakeLists.txt
@@ -15,8 +15,6 @@ cc_test(async_sparse_param_update_recorder_test SRCS async_sparse_param_update_r
 cc_library(heart_beat_monitor SRCS heart_beat_monitor.cc DEPS enforce simple_threadpool)
 cc_test(heart_beat_monitor_test SRCS heart_beat_monitor_test.cc DEPS heart_beat_monitor)
 
-cc_library(barrier_monitor SRCS barrier_monitor.cc DEPS enforce simple_threadpool trainer_desc_proto device_context)
-
 # FIXME(typhoonzero): use add_subdirectory once we clean the dependency of these files
 set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
 if(WITH_GRPC)
@@ -28,7 +26,7 @@ if(WITH_GRPC)
         collective_client.cc collective_server.cc
         ${GRPC_SRCS}
       PROTO send_recv.proto 
-      DEPS lod_tensor selected_rows_functor memory scope ${GRPC_DEPS} async_sparse_param_update_recorder heart_beat_monitor barrier_monitor)
+      DEPS lod_tensor selected_rows_functor memory scope ${GRPC_DEPS} async_sparse_param_update_recorder heart_beat_monitor)
 
   set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
   set(RPC_DEPS sendrecvop_rpc ${GRPC_DEPS})
diff --git a/paddle/fluid/operators/distributed/barrier_monitor.cc b/paddle/fluid/operators/distributed/barrier_monitor.cc
deleted file mode 100644
index f6d82f5d8c3daea9b629c1937bfcbc5159cda461..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/barrier_monitor.cc
+++ /dev/null
@@ -1,166 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/distributed/barrier_monitor.h"
-#include <gflags/gflags.h>
-
-#include <functional>
-#include <future>  // NOLINT
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-
-#include <thread>  // NOLINT
-
-#include <ThreadPool.h>
-
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-bool BarrierMonitor::IncreaseBarrier(const int worker_id,
-                                     const std::string &barrier) {
-  release_ = false;
-
-  if (barrier == BATCH_BARRIER_MESSAGE) {
-    VLOG(4) << "BarrierMonitor send queue recv trainer: " << worker_id;
-    send_barrier_queue->Push(worker_id);
-  } else if (barrier == FETCH_BARRIER_MESSAGE) {
-    VLOG(4) << "BarrierMonitor recv queue recv trainer: " << worker_id;
-    recv_barrier_queue->Push(worker_id);
-  } else {
-    PADDLE_THROW(platform::errors::Unavailable(
-        "unknown Message status %s, only "
-        "BATCH_BARRIER_MESSAGE/FETCH_BARRIER_MESSAGE",
-        barrier));
-  }
-  return Wait();
-}
-
-void BarrierMonitor::DecreaseWorker() {
-  std::unique_lock<std::mutex> lck(mutex_);
-  workers_--;
-  VLOG(1) << "decrement worker num to " << workers_;
-}
-
-void BarrierMonitor::Reset(int workers, BarrierType type) {
-  std::unique_lock<std::mutex> lk(server_mutex_);
-
-  workers_ = workers;
-  barrier_type = type;
-
-  send_barrier_queue->Clear();
-  recv_barrier_queue->Clear();
-  VLOG(2) << "reset monitor workers: " << workers_ << " type: " << barrier_type;
-}
-
-void BarrierMonitor::Monitor() {
-  while (!IsReady() && running_) {
-    std::this_thread::sleep_for(std::chrono::milliseconds(1000));
-    VLOG(3) << "sync at first time, wait all trainer ready";
-  }
-
-  while (running_) {
-    int timer = 0;
-
-    if (IsReady()) {
-      Swap(true);
-    } else {
-      VLOG(4) << "running timer: " << timer << " barrier: " << barrier_type
-              << " sendQ:" << send_barrier_queue->Size()
-              << " recvQ: " << recv_barrier_queue->Size();
-
-      timer++;
-      if (max_wait_ms == -1 || timer < max_wait_ms) {
-        std::this_thread::sleep_for(std::chrono::milliseconds(1));
-      } else {
-        VLOG(1) << "time out of " << max_wait_ms
-                << ", need barreir: " << barrier_type << " retry";
-        Swap(false);
-      }
-    }
-  }
-}
-
-bool BarrierMonitor::IsReady() {
-  if (barrier_type == BarrierType::kSendBarrier) {
-    return static_cast<int>(send_barrier_queue->Size()) == workers_;
-  } else {
-    return static_cast<int>(recv_barrier_queue->Size()) == workers_;
-  }
-}
-
-void BarrierMonitor::Swap(bool is_valid) {
-  std::unique_lock<std::mutex> lck(mutex_);
-
-  valid_ = is_valid;
-  release_ = true;
-
-  if (barrier_type == BarrierType::kSendBarrier) {
-    barrier_type = BarrierType::kRecvBarrier;
-    send_barrier_queue->Clear();
-    VLOG(4) << "barrier monitor server clean up queue and barrier";
-    ServerWeakup();
-    VLOG(4) << "barrier monitor server weak up sync to do";
-    WaitServerWeakup();
-    VLOG(4) << "barrier monitor server weak up sync done";
-
-  } else {
-    barrier_type = BarrierType::kSendBarrier;
-    recv_barrier_queue->Clear();
-    VLOG(4) << "barrier monitor server switch to send barrier";
-  }
-
-  worker_cv_.notify_all();
-}
-
-void BarrierMonitor::Stop() {
-  valid_ = true;
-  release_ = true;
-  running_ = false;
-
-  barrier_type = BarrierType::kRecvBarrier;
-  send_barrier_queue->Clear();
-  recv_barrier_queue->Clear();
-
-  worker_cv_.notify_all();
-  server_cv_.notify_all();
-
-  if (monitor_thread_) monitor_thread_->join();
-  monitor_thread_ = nullptr;
-}
-
-bool BarrierMonitor::Wait() {
-  std::unique_lock<std::mutex> lk(mutex_);
-  worker_cv_.wait(lk, [this] { return (release_); });
-  return valid_;
-}
-
-void BarrierMonitor::WaitServerWeakup() {
-  std::unique_lock<std::mutex> lk(server_mutex_);
-  server_cv_.wait(lk);
-}
-
-void BarrierMonitor::ServerWeakup() { server_cv_.notify_all(); }
-
-std::once_flag BarrierMonitor::init_flag_;
-std::unique_ptr<BarrierMonitor> BarrierMonitor::monitor_(nullptr);
-
-}  // namespace distributed
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/barrier_monitor.h b/paddle/fluid/operators/distributed/barrier_monitor.h
deleted file mode 100644
index f9556d7720f7a7ebcadcc1f86ad6051786777041..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/distributed/barrier_monitor.h
+++ /dev/null
@@ -1,186 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <gflags/gflags.h>
-
-#include <chrono>  // NOLINT
-#include <deque>
-#include <functional>
-#include <future>  // NOLINT
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-
-#include <thread>  // NOLINT
-
-#include <ThreadPool.h>
-
-#include "paddle/fluid/operators/distributed/rpc_server.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-enum BarrierType { kSendBarrier, kRecvBarrier };
-
-constexpr int64_t kMaxWaitMS = 120000;
-
-template <typename T>
-class BlockingQueueForBarrier {
- public:
-  explicit BlockingQueueForBarrier(size_t capacity) : capacity_(capacity) {
-    PADDLE_ENFORCE_GT(capacity_, 0,
-                      platform::errors::InvalidArgument(
-                          "The capacity must be greater than 0."));
-  }
-
-  bool Push(const T &elem) {
-    {
-      std::unique_lock<std::mutex> lock(mutex_);
-      worker_cv_.wait(lock, [&] { return queue_.size() < capacity_; });
-      queue_.push_back(elem);
-    }
-    worker_cv_.notify_one();
-    return true;
-  }
-
-  bool Push(T &&elem) {
-    {
-      std::unique_lock<std::mutex> lock(mutex_);
-      worker_cv_.wait(lock, [&] { return queue_.size() < capacity_; });
-      queue_.emplace_back(std::move(elem));
-    }
-    worker_cv_.notify_one();
-    return true;
-  }
-
-  T Pop() {
-    std::unique_lock<std::mutex> lock(mutex_);
-    worker_cv_.wait(lock, [=] { return !queue_.empty(); });
-    T rc(std::move(queue_.front()));
-    queue_.pop_front();
-    worker_cv_.notify_one();
-    return rc;
-  }
-
-  size_t Cap() const {
-    std::lock_guard<std::mutex> lock(mutex_);
-    return capacity_;
-  }
-
-  size_t Size() const {
-    std::lock_guard<std::mutex> lock(mutex_);
-    return queue_.size();
-  }
-
-  void Clear() {
-    std::lock_guard<std::mutex> lock(mutex_);
-    std::deque<T>().swap(queue_);
-  }
-
- private:
-  const size_t capacity_;
-  std::deque<T> queue_;
-
-  mutable std::mutex mutex_;
-  std::condition_variable worker_cv_;
-};
-
-class BarrierMonitor {
- public:
-  explicit BarrierMonitor(int workers)
-      : BarrierMonitor(workers, BarrierType::kRecvBarrier, kMaxWaitMS) {}
-
-  explicit BarrierMonitor(int workers, BarrierType type, int64_t max_wait_times)
-      : workers_(workers), barrier_type(type), max_wait_ms(max_wait_times) {
-    PADDLE_ENFORCE_GT(workers, 0, platform::errors::InvalidArgument(
-                                      "trainers must have one or more"));
-
-    send_barrier_queue =
-        std::make_shared<BlockingQueueForBarrier<int>>(workers);
-    recv_barrier_queue =
-        std::make_shared<BlockingQueueForBarrier<int>>(workers);
-
-    running_ = true;
-    monitor_thread_.reset(
-        new std::thread(std::bind(&BarrierMonitor::Monitor, this)));
-  }
-
-  static BarrierMonitor *Init(int workers) {
-    InitImpl(workers);
-    return GetInstance();
-  }
-
-  static BarrierMonitor *GetInstance() { return monitor_.get(); }
-
-  bool IncreaseBarrier(const int worker_id, const std::string &barrier);
-
-  void DecreaseWorker();
-
-  int GetWorkerNum() { return workers_; }
-
-  void Monitor();
-
-  void Swap(bool is_valid);
-
-  void Stop();
-
-  bool IsReady();
-
-  bool Wait();
-
-  void WaitServerWeakup();
-
-  void ServerWeakup();
-
-  void WorkerWeakup();
-
-  void Reset(int workers, BarrierType type);
-
- private:
-  // Init is called by GetInstance.
-  static void InitImpl(int workers) {
-    monitor_.reset(new BarrierMonitor(workers));
-  }
-
-  static std::once_flag init_flag_;
-  static std::unique_ptr<BarrierMonitor> monitor_;
-
-  int workers_;
-  bool running_ = false;
-  bool valid_ = false;
-  bool release_ = false;
-
-  std::condition_variable worker_cv_;
-  std::condition_variable server_cv_;
-
-  std::mutex server_mutex_;
-  std::mutex mutex_;
-
-  BarrierType barrier_type;
-  int64_t max_wait_ms;
-  std::unique_ptr<std::thread> monitor_thread_{nullptr};
-  std::shared_ptr<BlockingQueueForBarrier<int>> send_barrier_queue;
-  std::shared_ptr<BlockingQueueForBarrier<int>> recv_barrier_queue;
-};
-
-}  // namespace distributed
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_client.cc b/paddle/fluid/operators/distributed/grpc/grpc_client.cc
index ca93f7eb958cde66b933612f05bdfc2965cd2a75..0652f8691218dc688732bd4243315b188cd0b053 100644
--- a/paddle/fluid/operators/distributed/grpc/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc/grpc_client.cc
@@ -260,7 +260,7 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep,
   while (true) {
     GetProcessor* s = new GetProcessor(ch);
     VarHandlePtr h(new VarHandle(ep, method, out_var_name_val, p_ctx, p_scope));
-    s->Prepare(h, time_out);
+    s->Prepare(h, kPrefetchTimeout);
 
     framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope,
                         p_ctx, s, method, h, table_name_val, this] {
@@ -306,19 +306,52 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep,
 
 VarHandlePtr GRPCClient::AsyncSendBatchBarrier(const std::string& ep,
                                                int64_t time_out) {
-  platform::CPUDeviceContext ctx;
-  auto* scope = new framework::Scope();
-  auto h = AsyncDistributeNotify(ep, ctx, *scope, BATCH_BARRIER_MESSAGE);
-  delete scope;
+  const auto ch = GetChannel(ep);
+
+  BatchBarrierProcessor* s = new BatchBarrierProcessor(ch);
+  const std::string method = kBatchBarrierRPC;
+  VarHandlePtr h(
+      new VarHandle(ep, method, BATCH_BARRIER_MESSAGE, nullptr, nullptr));
+  s->Prepare(h, time_out);
+
+  sendrecv::VariableMessage req;
+  req.set_varname(BATCH_BARRIER_MESSAGE);
+
+  platform::RecordRPCEvent record_event(method);
+
+  auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
+  rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
+  req_count_++;
+
+  if (UNLIKELY(platform::IsProfileEnabled())) {
+    h->Wait();
+  }
+
   return h;
 }
 
 VarHandlePtr GRPCClient::AsyncSendFetchBarrier(const std::string& ep,
                                                int64_t time_out) {
-  platform::CPUDeviceContext ctx;
-  auto* scope = new framework::Scope();
-  auto h = AsyncDistributeNotify(ep, ctx, *scope, FETCH_BARRIER_MESSAGE);
-  delete scope;
+  const auto ch = GetChannel(ep);
+  FetchBarrierProcessor* s = new FetchBarrierProcessor(ch);
+  const std::string method = kFetchBarrierRPC;
+  VarHandlePtr h(
+      new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr));
+  s->Prepare(h, time_out);
+
+  sendrecv::VariableMessage req;
+  req.set_varname(FETCH_BARRIER_MESSAGE);
+
+  platform::RecordRPCEvent record_event(method);
+
+  auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_);
+  rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
+  req_count_++;
+
+  if (UNLIKELY(platform::IsProfileEnabled())) {
+    h->Wait();
+  }
+
   return h;
 }
 
@@ -351,10 +384,27 @@ VarHandlePtr GRPCClient::AsyncGetMonomerBarrier(const std::string& ep,
 
 VarHandlePtr GRPCClient::AsyncSendComplete(const std::string& ep,
                                            int64_t time_out) {
-  platform::CPUDeviceContext ctx;
-  auto* scope = new framework::Scope();
-  auto h = AsyncDistributeNotify(ep, ctx, *scope, COMPLETE_MESSAGE);
-  delete scope;
+  const auto ch = GetChannel(ep);
+
+  BatchBarrierProcessor* s = new BatchBarrierProcessor(ch);
+  const std::string method = kSendCompleteRPC;
+  VarHandlePtr h(new VarHandle(ep, method, COMPLETE_MESSAGE, nullptr, nullptr));
+  s->Prepare(h, time_out);
+
+  sendrecv::VariableMessage req;
+  req.set_trainer_id(trainer_id_);
+  req.set_varname(COMPLETE_MESSAGE);
+
+  platform::RecordRPCEvent record_event(method);
+
+  auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
+  rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
+  req_count_++;
+
+  if (UNLIKELY(platform::IsProfileEnabled())) {
+    h->Wait();
+  }
+
   return h;
 }
 
@@ -404,21 +454,10 @@ VarHandlePtr GRPCClient::AsyncDistributeNotify(
   s->Prepare(h, time_out);
 
   framework::AsyncIO([var_name_val, p_scope, p_ctx, s, method, h, this] {
-    ::grpc::ByteBuffer buf;
+    auto* var = p_scope->FindVar(var_name_val);
 
-    if (var_name_val == BATCH_BARRIER_MESSAGE ||
-        var_name_val == FETCH_BARRIER_MESSAGE ||
-        var_name_val == COMPLETE_MESSAGE) {
-      // prepare input
-      sendrecv::VariableMessage req;
-      req.set_varname(var_name_val);
-      req.set_out_varname(var_name_val);
-      req.set_trainer_id(trainer_id_);
-      RequestToByteBuffer<sendrecv::VariableMessage>(req, &buf);
-    } else {
-      auto* var = p_scope->FindVar(var_name_val);
-      SerializeToByteBuffer(var_name_val, var, *p_ctx, &buf, "", trainer_id_);
-    }
+    ::grpc::ByteBuffer req;
+    SerializeToByteBuffer(var_name_val, var, *p_ctx, &req, "", trainer_id_);
 
     VLOG(3) << s->GetVarHandlePtr()->String() << " begin";
 
@@ -428,7 +467,7 @@ VarHandlePtr GRPCClient::AsyncDistributeNotify(
     platform::RecordRPCEvent record_event(method);
 
     auto call = s->stub_g_.PrepareUnaryCall(
-        s->context_.get(), "/sendrecv.SendRecvService/DistributeNotify", buf,
+        s->context_.get(), "/sendrecv.SendRecvService/DistributeNotify", req,
         &cq_);
     call->StartCall();
     call->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
@@ -448,6 +487,18 @@ bool GRPCClient::Wait() {
   return ok_;
 }
 
+inline bool ShouldRetry(const std::string& method, int error_code) {
+  if (method == kPrefetchRPC) {
+    return true;
+  }
+
+  if (error_code == grpc::StatusCode::DEADLINE_EXCEEDED) {
+    return true;
+  }
+
+  return false;
+}
+
 void GRPCClient::Proceed() {
   void* tag = nullptr;
   bool ok = false;
@@ -461,19 +512,9 @@ void GRPCClient::Proceed() {
     if (c->status_.ok()) {
       VLOG(3) << c->GetVarHandlePtr()->String() << " process";
       c->Process();
-    } else if (c->status_.error_code() == grpc::StatusCode::DEADLINE_EXCEEDED) {
-      PADDLE_THROW(platform::errors::External(
-          "%s meets grpc error, error_code is %d, error message is %s, error "
-          "details is %s.",
-          c->GetVarHandlePtr()->String(), c->status_.error_code(),
-          c->status_.error_message(), c->status_.error_details()));
-      {
-        std::lock_guard<std::mutex> lk(sync_mutex_);
-        ok_ = false;
-      }
-      c->Finish(false);
-    } else if (c->status_.error_code() == grpc::StatusCode::UNAVAILABLE) {
-      VLOG(3) << c->GetVarHandlePtr()->String()
+    } else if (ShouldRetry(c->GetVarHandlePtr()->method(),
+                           c->status_.error_code())) {
+      VLOG(0) << c->GetVarHandlePtr()->String()
               << " meets grpc error, error_code:" << c->status_.error_code()
               << " error_message:" << c->status_.error_message()
               << " error_details:" << c->status_.error_details()
diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc
index c9bbc3c193558a6e07e118a482c63f7f8427a27e..428ee6ee1843deb46267e877e847f4b31df3e41f 100644
--- a/paddle/fluid/operators/distributed/parameter_prefetch.cc
+++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc
@@ -209,16 +209,20 @@ void prefetchs(const std::vector<std::string>& id_var_names,
   TableAndEndpoints tables;
 
   for (auto& id_name : id_var_names) {
-    auto& id_tensor = scope.FindVar(id_name)->Get<framework::LoDTensor>();
-    auto* id_data = id_tensor.data<int64_t>();
+    auto* id_tensor =
+        scope.FindVar(id_name)->GetMutable<framework::LoDTensor>();
+    auto id_dims = id_tensor->dims();
+    id_tensor->Resize(framework::make_ddim(
+        {static_cast<int64_t>(id_dims[0] * id_dims[1]), 1}));
+    auto* id_data = id_tensor->data<int64_t>();
     std::vector<int64_t> ids;
 
-    for (int64_t i = 0; i < id_tensor.numel(); ++i) {
+    for (int64_t i = 0; i < id_tensor->numel(); ++i) {
       ids.push_back(id_data[i]);
       ids_union.push_back(id_data[i]);
     }
     ids_group.push_back(ids);
-    ids_lods.push_back(id_tensor.lod());
+    ids_lods.push_back(id_tensor->lod());
   }
 
   std::unordered_set<int64_t> s(ids_union.begin(), ids_union.end());
diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h
index 74d9fc78cedc25ea64f684b6aed830021fbbd5cc..7cccf259b596f2116d14b23d19dba6df229d3cd7 100644
--- a/paddle/fluid/operators/distributed/request_handler.h
+++ b/paddle/fluid/operators/distributed/request_handler.h
@@ -57,6 +57,7 @@ constexpr char kFetchBarrierRPC[] = "FetchBarrierRPC";
 constexpr char kSendMonomerFetchBarrierRPC[] = "SendMonomerFetchBarrierRPC";
 constexpr char kSendCompleteRPC[] = "SendCompleteRPC";
 constexpr char kCheckPointNotifyRPC[] = "CheckPointNotifyRPC";
+constexpr int64_t kPrefetchTimeout = 60000;
 
 #define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV"
 #define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV"
diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc
index 5871bd14fc8033ea50c829e99b40fc2322033b16..0205bab0504d75df4e2b8bf15326a8aec9127544 100644
--- a/paddle/fluid/operators/distributed/request_handler_impl.cc
+++ b/paddle/fluid/operators/distributed/request_handler_impl.cc
@@ -28,7 +28,6 @@
 #include "paddle/fluid/string/split.h"
 
 #include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h"
-#include "paddle/fluid/operators/distributed/barrier_monitor.h"
 #include "paddle/fluid/operators/distributed/heart_beat_monitor.h"
 
 namespace paddle {
@@ -39,130 +38,161 @@ namespace distributed {
 // to directory specified.
 constexpr char LOOKUP_TABLE_PATH[] = "kLookupTablePath";
 
-bool RequestSendHandler::Handle(const std::string &varname,
-                                framework::Scope *scope,
-                                framework::Variable *invar,
-                                framework::Variable **outvar,
+bool RequestSendHandler::Handle(const std::string& varname,
+                                framework::Scope* scope,
+                                framework::Variable* invar,
+                                framework::Variable** outvar,
                                 const int trainer_id,
-                                const std::string &out_var_name,
-                                const std::string &table_name) {
+                                const std::string& out_var_name,
+                                const std::string& table_name) {
   VLOG(4) << "RequestSendHandler:" << varname;
 
-  if (invar == nullptr) {
-    PADDLE_THROW(platform::errors::NotFound(
-        "sync: Can not find server side var: %s", varname));
-    return false;
-  }
+  // Sync
+  if (varname == BATCH_BARRIER_MESSAGE) {
+    VLOG(3) << "sync: recv BATCH_BARRIER_MESSAGE";
+    rpc_server_->IncreaseBatchBarrier(kRequestSend);
+  } else if (varname == COMPLETE_MESSAGE) {
+    VLOG(3) << "sync: recv complete message";
 
-  if (distributed_mode_ == DistributedMode::kSync) {
-    return true;
-  }
+    if (HeartBeatMonitor::GetInstance() != nullptr) {
+      HeartBeatMonitor::GetInstance()->Update(trainer_id, "", COMPLETED);
+    }
 
-  HeartBeatMonitor::GetInstance()->Update(trainer_id, varname, RUNNING);
+    rpc_server_->Complete();
+  } else {
+    // Async
+    if (distributed_mode_ != DistributedMode::kSync) {
+      VLOG(3) << "async process var: " << varname;
+      if (varname == BATCH_BARRIER_MESSAGE) {
+        PADDLE_THROW(
+            "async mode should not recv BATCH_BARRIER_MESSAGE or "
+            "COMPLETE_MESSAGE");
+      }
+      HeartBeatMonitor::GetInstance()->Update(trainer_id, varname, RUNNING);
 
-  std::string run_varname = varname;
-  string::Piece part_piece("@PIECE");
-  string::Piece var_name_piece = string::Piece(varname);
+      std::string run_varname = varname;
 
-  if (string::Contains(var_name_piece, part_piece)) {
-    auto varname_splits = paddle::string::Split(varname, '@');
-    run_varname = varname_splits[0];
-    scope->Rename(varname, run_varname);
-  }
+      string::Piece part_piece("@PIECE");
+      string::Piece var_name_piece = string::Piece(varname);
 
-  if (distributed_mode_ == DistributedMode::kGeo &&
-      AsyncSparseParamUpdateRecorder::GetInstance()->HasGrad(run_varname)) {
-    auto &grad_slr =
-        scope->FindVar(run_varname)->Get<framework::SelectedRows>();
-    AsyncSparseParamUpdateRecorder::GetInstance()->Update(run_varname,
-                                                          grad_slr.rows());
-  }
+      if (string::Contains(var_name_piece, part_piece)) {
+        auto varname_splits = paddle::string::Split(varname, '@');
+        PADDLE_ENFORCE_EQ(varname_splits.size(), 3);
+        run_varname = varname_splits[0];
+        scope->Rename(varname, run_varname);
+      }
 
-  executor_->RunPreparedContext((*grad_to_prepared_ctx_)[run_varname].get(),
-                                scope);
+      if (distributed_mode_ == DistributedMode::kGeo &&
+          AsyncSparseParamUpdateRecorder::GetInstance()->HasGrad(run_varname)) {
+        auto& grad_slr =
+            scope->FindVar(run_varname)->Get<framework::SelectedRows>();
+        AsyncSparseParamUpdateRecorder::GetInstance()->Update(run_varname,
+                                                              grad_slr.rows());
+      }
+      executor_->RunPreparedContext((*grad_to_prepared_ctx_)[run_varname].get(),
+                                    scope);
 
+      return true;
+    } else {  // sync
+      rpc_server_->WaitCond(kRequestSend);
+      VLOG(3) << "sync: processing received var: " << varname;
+      PADDLE_ENFORCE_NOT_NULL(
+          invar, platform::errors::NotFound(
+                     "sync: Can not find server side var %s.", varname));
+    }
+  }
   return true;
 }
 
-bool RequestGetHandler::Handle(const std::string &varname,
-                               framework::Scope *scope,
-                               framework::Variable *invar,
-                               framework::Variable **outvar,
+bool RequestGetHandler::Handle(const std::string& varname,
+                               framework::Scope* scope,
+                               framework::Variable* invar,
+                               framework::Variable** outvar,
                                const int trainer_id,
-                               const std::string &out_var_name,
-                               const std::string &table_name) {
+                               const std::string& out_var_name,
+                               const std::string& table_name) {
   VLOG(3) << "RequestGetHandler:" << varname
           << " out_var_name: " << out_var_name << " trainer_id: " << trainer_id
           << " table_name: " << table_name;
 
   if (distributed_mode_ == DistributedMode::kSync) {
-    *outvar = scope_->FindVar(varname);
-  } else {
-    if (enable_dc_asgd_) {
-      // NOTE: the format is determined by distribute_transpiler.py
-      std::string param_bak_name =
-          string::Sprintf("%s.trainer_%d_bak", varname, trainer_id);
-      VLOG(3) << "getting " << param_bak_name << " trainer_id " << trainer_id;
-      auto var = scope_->FindVar(varname);
-      auto t_orig = var->Get<framework::LoDTensor>();
-      auto param_bak = scope_->Var(param_bak_name);
-      auto t = param_bak->GetMutable<framework::LoDTensor>();
-      t->mutable_data(dev_ctx_->GetPlace(), t_orig.type());
-      VLOG(3) << "copying " << varname << " to " << param_bak_name;
-      framework::TensorCopy(t_orig, dev_ctx_->GetPlace(), t);
+    if (varname == FETCH_BARRIER_MESSAGE) {
+      VLOG(3) << "sync: recv fetch barrier message";
+      rpc_server_->IncreaseBatchBarrier(kRequestGet);
+    } else {
+      rpc_server_->WaitCond(kRequestGet);
+      *outvar = scope_->FindVar(varname);
     }
-
-    if (distributed_mode_ == DistributedMode::kGeo &&
-        AsyncSparseParamUpdateRecorder::GetInstance()->HasParam(varname) &&
-        !table_name.empty()) {
-      std::vector<int64_t> updated_rows;
-      AsyncSparseParamUpdateRecorder::GetInstance()->GetAndClear(
-          varname, trainer_id, &updated_rows);
-      if (VLOG_IS_ON(3)) {
-        std::ostringstream sstream;
-        sstream << "[";
-        for (auto &row_id : updated_rows) {
-          sstream << row_id << ", ";
-        }
-        sstream << "]";
-        VLOG(3) << "updated_rows size: " << updated_rows.size() << " "
-                << sstream.str();
+  } else {
+    if (varname != FETCH_BARRIER_MESSAGE && varname != COMPLETE_MESSAGE) {
+      if (enable_dc_asgd_) {
+        // NOTE: the format is determined by distribute_transpiler.py
+        std::string param_bak_name =
+            string::Sprintf("%s.trainer_%d_bak", varname, trainer_id);
+        VLOG(3) << "getting " << param_bak_name << " trainer_id " << trainer_id;
+        auto var = scope_->FindVar(varname);
+        auto t_orig = var->Get<framework::LoDTensor>();
+        auto param_bak = scope_->Var(param_bak_name);
+        auto t = param_bak->GetMutable<framework::LoDTensor>();
+        t->mutable_data(dev_ctx_->GetPlace(), t_orig.type());
+        VLOG(3) << "copying " << varname << " to " << param_bak_name;
+        framework::TensorCopy(t_orig, dev_ctx_->GetPlace(), t);
       }
-      auto &origin_tensor =
-          scope_->FindVar(varname)->Get<framework::LoDTensor>();
-      auto *origin_tensor_data = origin_tensor.data<float>();
-      auto &dims = origin_tensor.dims();
-      *outvar = scope->Var();
-      auto *out_slr = (*outvar)->GetMutable<framework::SelectedRows>();
-      out_slr->set_rows(updated_rows);
-      out_slr->set_height(dims[0]);
-      auto out_dims = framework::make_ddim(
-          {static_cast<int64_t>(updated_rows.size()), dims[1]});
-      auto *data = out_slr->mutable_value()->mutable_data<float>(
-          out_dims, origin_tensor.place());
-      auto width = dims[1];
-      for (size_t i = 0; i < updated_rows.size(); ++i) {
-        PADDLE_ENFORCE_LT(updated_rows[i], dims[0],
-                          platform::errors::OutOfRange(
-                              "expected >= 0 and < %ld, but got %ld.", dims[0],
-                              updated_rows[i]));
-        memcpy(data + i * width, origin_tensor_data + updated_rows[i] * width,
-               sizeof(float) * width);
+      VLOG(1) << "Table name empty? " << table_name.empty();
+      if (distributed_mode_ == DistributedMode::kGeo) {
+        VLOG(1) << "AsyncSparseParamUpdateRecorder " << varname << " exist "
+                << AsyncSparseParamUpdateRecorder::GetInstance()->HasParam(
+                       varname);
+      }
+      if (distributed_mode_ == DistributedMode::kGeo &&
+          AsyncSparseParamUpdateRecorder::GetInstance()->HasParam(varname) &&
+          !table_name.empty()) {
+        std::vector<int64_t> updated_rows;
+        AsyncSparseParamUpdateRecorder::GetInstance()->GetAndClear(
+            varname, trainer_id, &updated_rows);
+        if (VLOG_IS_ON(3)) {
+          std::ostringstream sstream;
+          sstream << "[";
+          for (auto& row_id : updated_rows) {
+            sstream << row_id << ", ";
+          }
+          sstream << "]";
+          VLOG(3) << "updated_rows size: " << updated_rows.size() << " "
+                  << sstream.str();
+        }
+        auto& origin_tensor =
+            scope_->FindVar(varname)->Get<framework::LoDTensor>();
+        auto* origin_tensor_data = origin_tensor.data<float>();
+        auto& dims = origin_tensor.dims();
+        *outvar = scope->Var();
+        auto* out_slr = (*outvar)->GetMutable<framework::SelectedRows>();
+        out_slr->set_rows(updated_rows);
+        out_slr->set_height(dims[0]);
+        auto out_dims = framework::make_ddim(
+            {static_cast<int64_t>(updated_rows.size()), dims[1]});
+        auto* data = out_slr->mutable_value()->mutable_data<float>(
+            out_dims, origin_tensor.place());
+        auto width = dims[1];
+        for (size_t i = 0; i < updated_rows.size(); ++i) {
+          PADDLE_ENFORCE_LT(updated_rows[i], dims[0]);
+          memcpy(data + i * width, origin_tensor_data + updated_rows[i] * width,
+                 sizeof(float) * width);
+        }
+      } else {
+        *outvar = scope_->FindVar(varname);
       }
-    } else {
-      *outvar = scope_->FindVar(varname);
     }
   }
   return true;
 }
 
-bool RequestGetNoBarrierHandler::Handle(const std::string &varname,
-                                        framework::Scope *scope,
-                                        framework::Variable *invar,
-                                        framework::Variable **outvar,
+bool RequestGetNoBarrierHandler::Handle(const std::string& varname,
+                                        framework::Scope* scope,
+                                        framework::Variable* invar,
+                                        framework::Variable** outvar,
                                         const int trainer_id,
-                                        const std::string &out_var_name,
-                                        const std::string &table_name) {
+                                        const std::string& out_var_name,
+                                        const std::string& table_name) {
   VLOG(4) << "RequestGetNoBarrierHandler:" << varname
           << " out_var_name: " << out_var_name;
 
@@ -177,19 +207,18 @@ bool RequestGetNoBarrierHandler::Handle(const std::string &varname,
     *outvar = scope_->FindVar(var_name_piece.ToString());
     return true;
   } else {
-    PADDLE_THROW(platform::errors::InvalidArgument(
-        "GetNoBarrier must contain %s", WITHOUT_BARRIER_MESSAGE));
+    PADDLE_THROW("GetNoBarrier must contain %s", WITHOUT_BARRIER_MESSAGE);
   }
   return true;
 }
 
-bool RequestPrefetchHandler::Handle(const std::string &varname,
-                                    framework::Scope *scope,
-                                    framework::Variable *invar,
-                                    framework::Variable **outvar,
+bool RequestPrefetchHandler::Handle(const std::string& varname,
+                                    framework::Scope* scope,
+                                    framework::Variable* invar,
+                                    framework::Variable** outvar,
                                     const int trainer_id,
-                                    const std::string &out_var_name,
-                                    const std::string &table_name) {
+                                    const std::string& out_var_name,
+                                    const std::string& table_name) {
   VLOG(4) << "RequestPrefetchHandler " << varname;
 
   if (table_name.empty()) {
@@ -207,20 +236,19 @@ bool RequestPrefetchHandler::Handle(const std::string &varname,
   return true;
 }
 
-bool RequestCheckpointHandler::Handle(const std::string &varname,
-                                      framework::Scope *scope,
-                                      framework::Variable *invar,
-                                      framework::Variable **outvar,
+bool RequestCheckpointHandler::Handle(const std::string& varname,
+                                      framework::Scope* scope,
+                                      framework::Variable* invar,
+                                      framework::Variable** outvar,
                                       const int trainer_id,
-                                      const std::string &out_var_name,
-                                      const std::string &table_name) {
-  PADDLE_ENFORCE_NE(
-      checkpoint_notify_id, -1,
-      platform::errors::Unavailable(
-          "when checkpoint_notify_id = -1, there should be no RPC invoke."));
+                                      const std::string& out_var_name,
+                                      const std::string& table_name) {
+  PADDLE_ENFORCE(
+      checkpoint_notify_id != -1,
+      "when checkpoint_notify_id = -1, there should be no RPC invoke.");
 
   // TODO(tangwei12): find out why scope will be error.
-  auto *lt_var = scope_->FindVar(LOOKUP_TABLE_PATH)->GetMutable<std::string>();
+  auto* lt_var = scope_->FindVar(LOOKUP_TABLE_PATH)->GetMutable<std::string>();
   lt_var->clear();
   lt_var->append(out_var_name);
   VLOG(4) << "RequestCheckpointHandler update var kLookupTablePath to: "
@@ -229,56 +257,33 @@ bool RequestCheckpointHandler::Handle(const std::string &varname,
   return true;
 }
 
-bool RequestNotifyHandler::Handle(const std::string &varname,
-                                  framework::Scope *scope,
-                                  framework::Variable *invar,
-                                  framework::Variable **outvar,
+bool RequestNotifyHandler::Handle(const std::string& varname,
+                                  framework::Scope* scope,
+                                  framework::Variable* invar,
+                                  framework::Variable** outvar,
                                   const int trainer_id,
-                                  const std::string &out_var_name,
-                                  const std::string &table_name) {
+                                  const std::string& out_var_name,
+                                  const std::string& table_name) {
+  VLOG(4) << "RequestNotifyHandler: " << varname;
   VLOG(3) << "async process var: " << varname << ", trainer_id: " << trainer_id;
 
   string::Piece decay_piece(LEARNING_RATE_DECAY_COUNTER);
-  string::Piece batch_piece(BATCH_BARRIER_MESSAGE);
-  string::Piece fetch_piece(FETCH_BARRIER_MESSAGE);
-  string::Piece complete_piece(COMPLETE_MESSAGE);
-
   string::Piece var_name_piece = string::Piece(varname);
-
-  if (string::Contains(var_name_piece, batch_piece)) {
-    return BarrierMonitor::GetInstance()->IncreaseBarrier(
-        trainer_id, BATCH_BARRIER_MESSAGE);
-  } else if (string::Contains(var_name_piece, fetch_piece)) {
-    return BarrierMonitor::GetInstance()->IncreaseBarrier(
-        trainer_id, FETCH_BARRIER_MESSAGE);
-  } else if (string::Contains(var_name_piece, complete_piece)) {
-    if (HeartBeatMonitor::GetInstance() != nullptr) {
-      HeartBeatMonitor::GetInstance()->Update(trainer_id, "", COMPLETED);
-    }
-    rpc_server_->Complete();
-    BarrierMonitor::GetInstance()->DecreaseWorker();
-    return true;
-  } else if (string::Contains(var_name_piece, decay_piece)) {
+  if (string::Contains(var_name_piece, decay_piece)) {
     VLOG(3) << "LearningRate Decay Counter Update";
     PADDLE_ENFORCE_NE(
         lr_decay_block_id, -1,
-        platform::errors::InvalidArgument(
-            "when lr_decay_block_id = -1, there should be no RPC invoke."));
-    auto *origin_var = scope_->FindVar(varname);
+        "when lr_decay_block_id = -1, there should be no RPC invoke.");
+    auto* origin_var = scope_->FindVar(varname);
     auto origin_var_tensor = origin_var->Get<framework::LoDTensor>();
-    auto *send_var = scope->FindVar(varname);
+    auto* send_var = scope->FindVar(varname);
     auto send_var_tensor = send_var->Get<framework::LoDTensor>();
-    int64_t *origin_value =
+    int64_t* origin_value =
         origin_var_tensor.mutable_data<int64_t>(origin_var_tensor.place());
-    int64_t *send_value =
+    int64_t* send_value =
         send_var_tensor.mutable_data<int64_t>(send_var_tensor.place());
     origin_value[0] += send_value[0];
     executor_->RunPreparedContext(lr_decay_prepared_ctx_.get(), scope_);
-
-    return true;
-  } else {
-    PADDLE_THROW(platform::errors::InvalidArgument(
-        "unkown varname %s with RequestNotifyHandler", varname));
   }
   return true;
 }
diff --git a/paddle/fluid/operators/distributed/rpc_server_test.cc b/paddle/fluid/operators/distributed/rpc_server_test.cc
index bc17c84645116df7868107a6acf3de620dd9f798..d36a433db7dda89b5a9edb6fb8db8552ecce7854 100644
--- a/paddle/fluid/operators/distributed/rpc_server_test.cc
+++ b/paddle/fluid/operators/distributed/rpc_server_test.cc
@@ -1,8 +1,11 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
+
     http://www.apache.org/licenses/LICENSE-2.0
+
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -21,7 +24,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 
-#include "paddle/fluid/operators/distributed/barrier_monitor.h"
 #include "paddle/fluid/operators/distributed/distributed.h"
 #include "paddle/fluid/operators/distributed/heart_beat_monitor.h"
 #include "paddle/fluid/operators/distributed/request_handler_impl.h"
@@ -117,7 +119,6 @@ void StartServer(const std::string& rpc_name) {
   g_rpc_service->RegisterRPC(rpc_name, g_req_handler.get());
 
   distributed::HeartBeatMonitor::Init(2, true, "w@grad");
-  distributed::BarrierMonitor::Init(2);
 
   g_req_handler->SetRPCServer(g_rpc_service.get());
 
@@ -163,9 +164,6 @@ TEST(PREFETCH, CPU) {
     }
   }
 
-  auto* barrier = distributed::BarrierMonitor::GetInstance();
-  barrier->Stop();
-
   g_rpc_service->ShutDown();
   server_thread.join();
   LOG(INFO) << "begin reset";
@@ -176,24 +174,20 @@ TEST(PREFETCH, CPU) {
 TEST(COMPLETE, CPU) {
   setenv("http_proxy", "", 1);
   setenv("https_proxy", "", 1);
-  g_req_handler.reset(new distributed::RequestNotifyHandler(
-      distributed::DistributedMode::kSync, -1));
+  g_req_handler.reset(
+      new distributed::RequestSendHandler(distributed::DistributedMode::kSync));
   g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 2));
-
   distributed::RPCClient* client =
       distributed::RPCClient::GetInstance<RPCCLIENT_T>(0);
   PADDLE_ENFORCE(client != nullptr);
-  std::thread server_thread(StartServer, distributed::kRequestNotify);
+  std::thread server_thread(StartServer, distributed::kRequestSend);
   g_rpc_service->WaitServerReady();
   int port = g_rpc_service->GetSelectedPort();
   std::string ep = paddle::string::Sprintf("127.0.0.1:%d", port);
   client->AsyncSendComplete(ep);
   client->Wait();
 
-  auto* barrier = distributed::BarrierMonitor::GetInstance();
-  EXPECT_EQ(barrier->GetWorkerNum(), 1);
-
-  barrier->Stop();
+  EXPECT_EQ(g_rpc_service->GetClientNum(), 1);
 
   g_rpc_service->ShutDown();
   server_thread.join();
diff --git a/paddle/fluid/operators/distributed_ops/CMakeLists.txt b/paddle/fluid/operators/distributed_ops/CMakeLists.txt
index 244d3ece48ecc201465a6badeb5cd44bbf71f4a8..79f14d75d279d0ae1a68bf857ab9f46d6b71c42f 100644
--- a/paddle/fluid/operators/distributed_ops/CMakeLists.txt
+++ b/paddle/fluid/operators/distributed_ops/CMakeLists.txt
@@ -2,9 +2,9 @@ include(operators)
 
 set(DISTRIBUTE_DEPS "")
 if(WITH_GRPC)
-    set(DISTRIBUTE_DEPS sendrecvop_rpc parameter_send parameter_recv barrier_monitor communicator async_sparse_param_update_recorder grpc++_unsecure grpc_unsecure gpr zlib protobuf node)
+    set(DISTRIBUTE_DEPS sendrecvop_rpc parameter_send parameter_recv communicator async_sparse_param_update_recorder grpc++_unsecure grpc_unsecure gpr zlib protobuf node)
 else()
-    set(DISTRIBUTE_DEPS sendrecvop_rpc parameter_send parameter_recv barrier_monitor communicator async_sparse_param_update_recorder brpc leveldb protobuf ssl crypto zlib node)
+    set(DISTRIBUTE_DEPS sendrecvop_rpc parameter_send parameter_recv communicator async_sparse_param_update_recorder brpc leveldb protobuf ssl crypto zlib node)
     if(WITH_BRPC_RDMA)
         find_library(IBVERBS_LIBRARY NAMES ibverbs)
         ADD_LIBRARY(ibverbs SHARED IMPORTED GLOBAL)
diff --git a/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.cc b/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.cc
index f37d2b1eee108d835a8f06035f13ca1abbaf0d69..77150c4e48ea26e457c234b19193008a019f67b8 100644
--- a/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.cc
+++ b/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.cc
@@ -26,7 +26,7 @@ class DistributedLookupTableOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext *ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInputs("Ids"),
                    "Input(Ids) of LookupTableOp should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("W"),
@@ -40,11 +40,9 @@ class DistributedLookupTableOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(table_dims.size(), 2,
                       "Only 2 dimensions of the 'Embedding' is supported.");
 
-    for (auto &ids_dim : ids_dims) {
+    for (auto& ids_dim : ids_dims) {
       PADDLE_ENFORCE_EQ(ids_dim.size(), 2,
                         "The dimension of the 'Ids' tensor must be 2.");
-      PADDLE_ENFORCE_EQ(ids_dim[1], 1,
-                        "The last dimension of the 'Ids' tensor must be 1.");
     }
 
     auto lookup_tables =
@@ -52,6 +50,8 @@ class DistributedLookupTableOp : public framework::OperatorWithKernel {
     auto height_sections =
         ctx->Attrs().Get<std::vector<int64_t>>("height_sections");
     auto endpoints = ctx->Attrs().Get<std::vector<std::string>>("endpoints");
+    auto lookup_table_version =
+        ctx->Attrs().Get<std::string>("lookup_table_version");
 
     PADDLE_ENFORCE(lookup_tables.size() == height_sections.size() &&
                        lookup_tables.size() == endpoints.size() &&
@@ -61,8 +61,15 @@ class DistributedLookupTableOp : public framework::OperatorWithKernel {
 
     auto outputs_dims = std::vector<framework::DDim>();
 
-    for (auto &ids_dim : ids_dims) {
-      outputs_dims.push_back(framework::make_ddim({ids_dim[0], table_dims[1]}));
+    for (auto& ids_dim : ids_dims) {
+      if (lookup_table_version == "lookup_table") {
+        outputs_dims.push_back(
+            framework::make_ddim({ids_dim[0], table_dims[1]}));
+      } else if (lookup_table_version == "lookup_table_v2") {
+        outputs_dims.push_back(framework::make_ddim(
+            {static_cast<int64_t>(ids_dim[0]), static_cast<int64_t>(ids_dim[1]),
+             static_cast<int64_t>(table_dims[1])}));
+      }
     }
 
     ctx->SetOutputsDim("Outputs", outputs_dims);
@@ -71,7 +78,7 @@ class DistributedLookupTableOp : public framework::OperatorWithKernel {
 
  protected:
   framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
+      const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::proto::VarType::Type(ctx.Attr<int>("dtype")),
         ctx.GetPlace());
@@ -81,7 +88,7 @@ class DistributedLookupTableOp : public framework::OperatorWithKernel {
 template <typename T>
 class DistributedLookupTableKernel : public framework::OpKernel<T> {
  public:
-  void Compute(const framework::ExecutionContext &context) const override {
+  void Compute(const framework::ExecutionContext& context) const override {
     auto ids_vars = context.MultiInputVar("Ids");
     auto emb_vars = context.MultiOutput<framework::Tensor>("Embeddings");
 
@@ -93,10 +100,30 @@ class DistributedLookupTableKernel : public framework::OpKernel<T> {
     auto height_sections =
         context.Attr<std::vector<int64_t>>("height_sections");
     auto endpoints = context.Attr<std::vector<std::string>>("endpoints");
+    auto lookup_table_version =
+        context.Attr<std::string>("lookup_table_version");
 
     operators::distributed::prefetchs(
         id_names, out_names, embedding_name, false, lookup_tables, endpoints,
         height_sections, context, context.scope());
+
+    if (lookup_table_version == "lookup_table_v2") {
+      auto& scope = context.scope();
+      auto emb_dim =
+          scope.FindVar(embedding_name)->Get<framework::LoDTensor>().dims()[1];
+
+      for (size_t i = 0; i < id_names.size(); ++i) {
+        auto* id_var = scope.FindVar(id_names[i]);
+        auto* out_var = scope.FindVar(out_names[i]);
+        auto* id_tensor = id_var->GetMutable<framework::LoDTensor>();
+        auto* out_tensor = out_var->GetMutable<framework::LoDTensor>();
+
+        auto id_dims = id_tensor->dims();
+        out_tensor->Resize(framework::make_ddim(
+            {static_cast<int64_t>(id_dims[0]), static_cast<int64_t>(id_dims[1]),
+             static_cast<int64_t>(emb_dim)}));
+      }
+    }
   }
 };
 
@@ -134,6 +161,12 @@ class DistributedLookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
 
     AddAttr<int>("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0);
 
+    AddAttr<std::string>(
+        "lookup_table_version",
+        "(string, default lookup_table) "
+        "To distinguish between different versions of embedding OP")
+        .SetDefault(std::string("lookup_table"));
+
     AddAttr<int64_t>("padding_idx",
                      "(int64, default -1) "
                      "If the value is -1, it makes no effect to lookup. "
diff --git a/paddle/fluid/operators/distributed_ops/gen_nccl_id_op.cc b/paddle/fluid/operators/distributed_ops/gen_nccl_id_op.cc
index cf8322905297156ba5e36c5b21e009739daa194f..e63f882478351cde16bde969b86e020181d6d4e5 100644
--- a/paddle/fluid/operators/distributed_ops/gen_nccl_id_op.cc
+++ b/paddle/fluid/operators/distributed_ops/gen_nccl_id_op.cc
@@ -1,8 +1,11 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
+
     http://www.apache.org/licenses/LICENSE-2.0
+
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -18,7 +21,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/threadpool.h"
-#include "paddle/fluid/operators/distributed/barrier_monitor.h"
 #include "paddle/fluid/operators/distributed/distributed.h"
 #include "paddle/fluid/operators/distributed/request_handler_impl.h"
 #include "paddle/fluid/platform/nccl_helper.h"
@@ -28,16 +30,16 @@ namespace operators {
 
 class GenNCCLIdOp : public framework::OperatorBase {
  public:
-  GenNCCLIdOp(const std::string &type, const framework::VariableNameMap &inputs,
-              const framework::VariableNameMap &outputs,
-              const framework::AttributeMap &attrs)
+  GenNCCLIdOp(const std::string& type, const framework::VariableNameMap& inputs,
+              const framework::VariableNameMap& outputs,
+              const framework::AttributeMap& attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}
 
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &dev_place) const override {
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override {
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
     // put nccl id in CPUPlace
-    auto &dev_ctx = *pool.Get(platform::CPUPlace());
+    auto& dev_ctx = *pool.Get(platform::CPUPlace());
     int trainer_id = Attr<int>("trainer_id");
 
     std::vector<std::string> trainers =
@@ -53,7 +55,7 @@ class GenNCCLIdOp : public framework::OperatorBase {
 
     std::string endpoint = trainers[trainer_id];
 
-    framework::Scope &local_scope = scope.NewScope();
+    framework::Scope& local_scope = scope.NewScope();
 
     int nccl_comm_num = Attr<int>("nccl_comm_num");
     int use_hierarchical_allreduce = Attr<bool>("use_hierarchical_allreduce");
@@ -169,10 +171,10 @@ class GenNCCLIdOp : public framework::OperatorBase {
   }
 
  private:
-  void GenerateAndSend(framework::Scope *scope,
-                       const platform::DeviceContext &dev_ctx,
-                       const std::string &nccl_id_name,
-                       const std::vector<std::string> &endpoint_list) const {
+  void GenerateAndSend(framework::Scope* scope,
+                       const platform::DeviceContext& dev_ctx,
+                       const std::string& nccl_id_name,
+                       const std::vector<std::string>& endpoint_list) const {
     auto var = scope->FindVar(nccl_id_name);
     PADDLE_ENFORCE_NOT_NULL(
         var, platform::errors::NotFound("Variable with name %s is not found",
@@ -180,96 +182,76 @@ class GenNCCLIdOp : public framework::OperatorBase {
     auto id = var->GetMutable<ncclUniqueId>();
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGetUniqueId(id));
 
-    distributed::RPCClient *client =
+    distributed::RPCClient* client =
         distributed::RPCClient::GetInstance<RPCCLIENT_T>(0);
 
-    for (auto &ep : endpoint_list) {
+    for (auto& ep : endpoint_list) {
       VLOG(3) << "sending nccl_id_var:" << nccl_id_name << " to " << ep;
       client->AsyncSendVar(ep, dev_ctx, *scope, nccl_id_name);
     }
     client->Wait();
-    for (auto &ep : endpoint_list) {
+    for (auto& ep : endpoint_list) {
       client->AsyncSendBatchBarrier(ep);
     }
     client->Wait();
     VLOG(3) << "sending completed...";
   }
 
-  void GetIdByServer(const std::string &endpoint, framework::Scope *scope,
-                     const platform::DeviceContext &dev_ctx, int nccl_comm_num,
+  void GetIdByServer(const std::string& endpoint, framework::Scope* scope,
+                     const platform::DeviceContext& dev_ctx, int nccl_comm_num,
                      bool use_hierarchical_allreduce, int trainer_id,
                      int inter_trainer_id, int exter_trainer_id) const {
     // std::string endpoint = Attr<std::string>("endpoint");
     // NOTE: Can not use unique_ptr here because the default
     // deleter will call GRPC Server's base class's dtor and
     // that will cause a wired crash.
-
+    distributed::RequestSendHandler rpc_h(distributed::DistributedMode::kSync);
     std::unique_ptr<distributed::RPCServer> rpc_service(
         new RPCSERVER_T(endpoint, 1));
 
-    distributed::RequestSendHandler rpc_h(distributed::DistributedMode::kSync);
-
-    distributed::RequestNotifyHandler notify_h(
-        distributed::DistributedMode::kSync, -1);
-
     rpc_service->RegisterRPC(distributed::kRequestSend, &rpc_h);
-    rpc_service->RegisterRPC(distributed::kRequestNotify, &notify_h);
+    rpc_h.SetRPCServer(rpc_service.get());
 
     framework::ProgramDesc empty_program;
     framework::Executor executor(dev_ctx.GetPlace());
-
-    rpc_h.SetRPCServer(rpc_service.get());
     rpc_h.SetScope(scope);
     rpc_h.SetDevCtx(&dev_ctx);
     rpc_h.SetProgram(&empty_program);
     rpc_h.SetExecutor(&executor);
 
-    notify_h.SetRPCServer(rpc_service.get());
-    notify_h.SetScope(scope);
-    notify_h.SetDevCtx(&dev_ctx);
-    notify_h.SetProgram(&empty_program);
-    notify_h.SetExecutor(&executor);
-
-    distributed::BarrierMonitor::Init(1);
-    auto *barrier = distributed::BarrierMonitor::GetInstance();
-    barrier->Reset(1, distributed::BarrierType::kSendBarrier);
-
     std::thread server_thread(
         std::bind(&distributed::RPCServer::StartServer, rpc_service.get()));
 
     for (int i = 0; i < nccl_comm_num; i++) {
-      barrier->WaitServerWeakup();
-      barrier->Reset(1, distributed::BarrierType::kSendBarrier);
-      barrier->ServerWeakup();
-
+      rpc_service->SetCond(distributed::kRequestSend);
       VLOG(3) << "trainer_id:" << trainer_id
               << " start getting nccl id from trainer 0, nccl_comm_no:" << i;
+      rpc_service->WaitBarrier(distributed::kRequestSend);
+      rpc_service->ResetBarrierCounter();
     }
 
     if (use_hierarchical_allreduce) {
       if (inter_trainer_id > 0) {
         for (int i = 0; i < nccl_comm_num; i++) {
-          barrier->WaitServerWeakup();
-          barrier->Reset(1, distributed::BarrierType::kSendBarrier);
-          barrier->ServerWeakup();
-
+          rpc_service->SetCond(distributed::kRequestSend);
           VLOG(3) << "trainer_id:" << trainer_id
                   << ", inter_trainer_id:" << inter_trainer_id
                   << " start getting nccl id from inter_trainer:" << i;
+          rpc_service->WaitBarrier(distributed::kRequestSend);
+          rpc_service->ResetBarrierCounter();
         }
       }
 
       if (exter_trainer_id > 0) {
         for (int i = 0; i < nccl_comm_num; i++) {
-          barrier->WaitServerWeakup();
-          barrier->Reset(1, distributed::BarrierType::kSendBarrier);
-          barrier->ServerWeakup();
-
+          rpc_service->SetCond(distributed::kRequestSend);
           VLOG(3)
               << "trainer_id:" << trainer_id
               << ", exter_trainer_id:" << exter_trainer_id
               << " start getting nccl id from exter_trainer 0, nccl_comm_no:"
               << i;
+          rpc_service->WaitBarrier(distributed::kRequestSend);
+          rpc_service->ResetBarrierCounter();
         }
       }
     }
@@ -278,7 +260,6 @@ class GenNCCLIdOp : public framework::OperatorBase {
             << ", inter_trainer_id:" << inter_trainer_id
             << ", exter_trainer_id:" << exter_trainer_id
             << " got nccl id and stop server...";
-    barrier->Stop();
     rpc_service->ShutDown();
     VLOG(3) << "rpc server stopped";
     server_thread.join();
@@ -291,6 +272,7 @@ class GenNCCLIdOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("NCCLID", "Raw variable contains a NCCL UniqueId instaces.");
     AddComment(R"DOC(
 GenNCCLId operator
+
 For trainer 0: generate a new UniqueId and send it to all the other trainers.
 For trainer 1~n: start a gRPC server to get the UniqueId, once got, stop the server.
 )DOC");
diff --git a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc
index c8c0316e74739622f46cb577ae051fc88dd39bb7..d40df6f9de0c1e22ea892993d66a2cdfa808b1c7 100644
--- a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc
@@ -1,8 +1,11 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
+
     http://www.apache.org/licenses/LICENSE-2.0
+
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,7 +25,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/math_function.h"
 
 #include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h"
-#include "paddle/fluid/operators/distributed/barrier_monitor.h"
 #include "paddle/fluid/operators/distributed/heart_beat_monitor.h"
 #include "paddle/fluid/operators/distributed/request_handler_impl.h"
 #include "paddle/fluid/operators/distributed_ops/listen_and_serv_op.h"
@@ -36,13 +38,10 @@ DEFINE_int32(rpc_prefetch_thread_num, 12, "number of threads for rpc prefetch");
 namespace paddle {
 namespace operators {
 
-volatile sig_atomic_t gSignalStatus;
-
 void RunServer(std::shared_ptr<distributed::RPCServer> service) {
   service->StartServer();
   VLOG(4) << "RunServer thread end";
 }
-
 static void split(const std::string &str, char sep,
                   std::vector<std::string> *pieces) {
   pieces->clear();
@@ -127,7 +126,6 @@ void ListenAndServOp::RunSyncLoop(
   for (size_t i = 1; i < program->Size(); ++i) {
     optimize_blocks_list.push_back(i);
   }
-
   auto optimize_prepared = executor->Prepare(*program, optimize_blocks_list);
   // Insert placeholder for block0 which holds current op itself,
   // NOTE the first block in `optimize_prepared` should never be ran.
@@ -137,15 +135,21 @@ void ListenAndServOp::RunSyncLoop(
 
   // Trainers will get all parameters from pserver in the
   // startup program, so we will wait RequestGet first
-  auto *barrier = distributed::BarrierMonitor::GetInstance();
+  rpc_service_->SetCond(distributed::kRequestGet);
+  rpc_service_->WaitBarrier(distributed::kRequestGet);
+  rpc_service_->ResetBarrierCounter();
 
   while (true) {
     // Get from multiple trainers, we don't care about the order in which
     // the gradients arrives, just add suffix 0~n and merge the gradient.
-    barrier->WaitServerWeakup();
+    VLOG(3) << "wait all clients to send gradient";
+    rpc_service_->SetCond(distributed::kRequestSend);
+    VLOG(3) << "wait all clients to send send_barrier";
+    rpc_service_->WaitBarrier(distributed::kRequestSend);
 
-    if (gSignalStatus != 0) {
+    if (rpc_service_->IsExit()) {
       LOG(WARNING) << "get exit!rpc_processor break!";
+      rpc_service_->SetCond(distributed::kRequestGet);
       break;
     }
 
@@ -176,8 +180,12 @@ void ListenAndServOp::RunSyncLoop(
     VLOG(3) << "ResetReceivedVars";
     ResetReceivedVars(recv_scope, dev_ctx, rpc_service_->NeedResetAllVars());
 
-    barrier->ServerWeakup();
-    VLOG(3) << "kRecvBarrier to push params to trainers";
+    VLOG(3) << "wait all clients to get parameters back";
+    rpc_service_->SetCond(distributed::kRequestGet);
+    VLOG(3) << "wait all clients to send fetch_barrier";
+    rpc_service_->WaitBarrier(distributed::kRequestGet);
+    VLOG(3) << "ResetBarrierCounter";
+    rpc_service_->ResetBarrierCounter();
   }  // while(true)
 }
 
@@ -273,7 +281,7 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor,
   request_prefetch_handler_->SetGradToPreparedCtx(&grad_to_prepared_ctx);
 
   while (true) {
-    if (gSignalStatus != 0) {
+    if (rpc_service_->IsExit()) {
       VLOG(4) << "get exit!rpc_processor break!";
       break;
     }
@@ -383,7 +391,7 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
   rpc_service_->RegisterRPC(distributed::kRequestGetNoBarrier,
                             request_get_no_barrier_handler_.get());
   rpc_service_->RegisterRPC(distributed::kRequestNotify,
-                            request_notify_handler_.get(), fan_in * 2);
+                            request_notify_handler_.get(), rpc_send_thread_num);
 
   auto optimize_blocks =
       Attr<std::vector<framework::BlockDesc *>>(kOptimizeBlocks);
@@ -432,7 +440,6 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
   std::unordered_map<std::string,
                      std::shared_ptr<framework::ExecutorPrepareContext>>
       prefetch_var_name_to_prepared_ctx;
-
   for (size_t i = 0; i < prefetch_block_id_list.size(); ++i) {
     auto block_id = prefetch_block_id_list[i];
     auto prefetch_var_name = block_id_to_prefetch_var_name[block_id];
@@ -441,10 +448,8 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
 
   // parse attr of kSparseGradToParam  sparse_grad_name -> param_name
   std::unordered_map<std::string, std::string> sparse_grad_name_to_param_name;
-
   auto sparse_grad_name_to_param_name_str =
       Attr<std::vector<std::string>>(kSparseGradToParam);
-
   for (const auto &sparse_grad_name_and_param_name :
        sparse_grad_name_to_param_name_str) {
     std::vector<std::string> pieces;
@@ -472,18 +477,17 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
   signal(SIGINT, SignalHandler::StopAndExit);
   signal(SIGTERM, SignalHandler::StopAndExit);
 
-  distributed::BarrierMonitor::Init(fan_in);
-
   if (distributed_mode == distributed::DistributedMode::kSync) {
     // start the server listening after all member initialized.
     server_thread_.reset(new std::thread(RunServer, rpc_service_));
     VLOG(3) << "wait server thread to become ready...";
     rpc_service_->WaitServerReady();
-    // Write to a file of server selected port for python use.
-    SavePort();
 
     CacheVarsType(inputs, recv_scope);
 
+    // Write to a file of server selected port for python use.
+    SavePort();
+
     RunSyncLoop(&executor, program, &recv_scope, &dev_ctx,
                 prefetch_block_id_list, checkpoint_block_id);
   } else {
@@ -570,8 +574,9 @@ class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker {
 void SignalHandler::StopAndExit(int signal_num) {
   // Do not use VLOG here for the device for printing maybe already released.
   // exit will release interal allocated resoureces.
-  distributed::BarrierMonitor::GetInstance()->Stop();
-  gSignalStatus = signal_num;
+  auto file_path = string::Sprintf("/tmp/paddle.%d.port", ::getpid());
+  remove(file_path.c_str());
+  exit(0);
 }
 
 }  // namespace operators
diff --git a/paddle/fluid/operators/distributed_ops/test_send_nccl_id.cc b/paddle/fluid/operators/distributed_ops/test_send_nccl_id.cc
index 3f5fdadc22342bf17f54d86e39bdc5114915c001..b65621a0886b02fd8d3c029c979348469014cadc 100644
--- a/paddle/fluid/operators/distributed_ops/test_send_nccl_id.cc
+++ b/paddle/fluid/operators/distributed_ops/test_send_nccl_id.cc
@@ -1,8 +1,11 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
+
     http://www.apache.org/licenses/LICENSE-2.0
+
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -17,7 +20,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/distributed/barrier_monitor.h"
 #include "paddle/fluid/operators/distributed/distributed.h"
 #include "paddle/fluid/operators/distributed/request_handler_impl.h"
 #include "paddle/fluid/operators/distributed_ops/listen_and_serv_op.h"
@@ -40,7 +42,6 @@ namespace string = paddle::string;
 
 std::unique_ptr<distributed::RPCServer> g_rpc_service;
 std::unique_ptr<distributed::RequestHandler> g_req_handler;
-std::unique_ptr<distributed::RequestNotifyHandler> g_notify_handler;
 
 void StartServer() {
   f::Scope scope;
@@ -51,35 +52,21 @@ void StartServer() {
 
   f::ProgramDesc empty_program;
   f::Executor executor(dev_ctx.GetPlace());
-
   g_req_handler->SetScope(&scope);
   g_req_handler->SetDevCtx(&dev_ctx);
   g_req_handler->SetProgram(&empty_program);
   g_req_handler->SetExecutor(&executor);
-  g_req_handler->SetRPCServer(g_rpc_service.get());
-
-  g_notify_handler.SetRPCServer(rpc_service.get());
-  g_notify_handler.SetScope(scope);
-  g_notify_handler.SetDevCtx(&dev_ctx);
-  g_notify_handler.SetProgram(&empty_program);
-  g_notify_handler.SetExecutor(&executor);
 
   g_rpc_service->RegisterRPC(distributed::kRequestSend, g_req_handler.get());
-  g_rpc_service->RegisterRPC(distributed::RequestNotifyHandler,
-                             g_notify_handler.get());
-
-  distributed::BarrierMonitor::Init(1);
-  auto* barrier = distributed::BarrierMonitor::GetInstance();
-  barrier->Reset(1, distributed::BarrierType::kSendBarrier);
+  g_req_handler->SetRPCServer(g_rpc_service.get());
 
   std::thread server_thread(
       std::bind(&distributed::RPCServer::StartServer, g_rpc_service.get()));
 
-  barrier->WaitServerWeakup();
-  barrier->ServerWeakup();
+  g_rpc_service->SetCond(distributed::kRequestSend);
+  g_rpc_service->WaitBarrier(distributed::kRequestSend);
 
   LOG(INFO) << "got nccl id and stop server...";
-  barrier->Stop();
   g_rpc_service->ShutDown();
   server_thread.join();
 }
@@ -87,10 +74,6 @@ void StartServer() {
 TEST(SendNcclId, RPCServer) {
   g_req_handler.reset(
       new distributed::RequestSendHandler(distributed::DistributedMode::kSync));
-
-  g_notify_handler.reset(new distributed::RequestNotifyHandler(
-      distributed::DistributedMode::kSync, -1));
-
   g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1));
 
   std::thread server_thread(StartServer);
@@ -121,5 +104,4 @@ TEST(SendNcclId, RPCServer) {
   server_thread.join();
   g_rpc_service.reset(nullptr);
   g_req_handler.reset(nullptr);
-  g_notify_handler.reset(nullptr);
 }
diff --git a/paddle/fluid/operators/fake_dequantize_op.cc b/paddle/fluid/operators/fake_dequantize_op.cc
index a1cf30ca7f5fe689a67c5afc2c08a9a2e0b9d859..0d2b951ee1c544151e99af8216db7809e2a77852 100644
--- a/paddle/fluid/operators/fake_dequantize_op.cc
+++ b/paddle/fluid/operators/fake_dequantize_op.cc
@@ -29,7 +29,7 @@ struct DequantizeFunctor<platform::CPUDeviceContext, T> {
     auto out_e = framework::EigenVector<T>::Flatten(*out);
 
     auto& dev = *dev_ctx.eigen_device();
-    out_e.device(dev) = scale_factor[0] * in_e / max_range;
+    out_e.device(dev) = in_e * scale_factor[0] / max_range;
   }
 };
 
@@ -48,7 +48,7 @@ struct ChannelDequantizeFunctor<platform::CPUDeviceContext, T> {
         auto in_e = framework::EigenVector<T>::Flatten(one_channel_in);
         auto out_e = framework::EigenVector<T>::Flatten(one_channel_out);
         auto& dev = *dev_ctx.eigen_device();
-        out_e.device(dev) = s * in_e / max_range;
+        out_e.device(dev) = in_e * s / max_range;
       }
     } else if (scale_num == 2) {
       int batch_size = in->dims()[0];
@@ -67,7 +67,7 @@ struct ChannelDequantizeFunctor<platform::CPUDeviceContext, T> {
           auto in_e = framework::EigenVector<T>::Flatten(one_channel_in);
           auto out_e = framework::EigenVector<T>::Flatten(one_channel_out);
           auto& dev = *dev_ctx.eigen_device();
-          out_e.device(dev) = (s * scale_two[0]) * in_e / max_range;
+          out_e.device(dev) = in_e * s * scale_two[0] / max_range;
         }
       }
     }
diff --git a/paddle/fluid/operators/fake_quantize_op.cc b/paddle/fluid/operators/fake_quantize_op.cc
index 8c07e445a6f7a3ff54a0919dd653d6d3615e30fc..16a32a3f6cfb12e5e0674219dc5e532d7875199c 100644
--- a/paddle/fluid/operators/fake_quantize_op.cc
+++ b/paddle/fluid/operators/fake_quantize_op.cc
@@ -82,7 +82,7 @@ struct ClipAndFakeQuantDequantFunctor<platform::CPUDeviceContext, T> {
           out->mutable_data<T>(ctx.GetPlace()), ClipFunctor<T>(-s, s));
     auto out_e = framework::EigenVector<T>::Flatten(*out);
     out_e.device(*ctx.eigen_device()) =
-        (s / bin_cnt) * (bin_cnt * inv_s * out_e).round();
+        (bin_cnt * inv_s * out_e).round() * s / static_cast<T>(bin_cnt);
   }
 };
 template struct ClipAndFakeQuantDequantFunctor<platform::CPUDeviceContext,
@@ -171,20 +171,21 @@ struct FindMovingAverageAbsMaxFunctor<platform::CPUDeviceContext, T> {
 template struct FindMovingAverageAbsMaxFunctor<platform::CPUDeviceContext,
                                                float>;
 
-class FakeQuantizeAbsMaxOp : public framework::OperatorWithKernel {
+class FakeQuantOrWithDequantAbsMaxOp : public framework::OperatorWithKernel {
  public:
-  FakeQuantizeAbsMaxOp(const std::string& type,
-                       const framework::VariableNameMap& inputs,
-                       const framework::VariableNameMap& outputs,
-                       const framework::AttributeMap& attrs)
+  FakeQuantOrWithDequantAbsMaxOp(const std::string& type,
+                                 const framework::VariableNameMap& inputs,
+                                 const framework::VariableNameMap& outputs,
+                                 const framework::AttributeMap& attrs)
       : OperatorWithKernel(type, inputs, outputs, attrs) {}
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "FakeQuantizeAbsMax");
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X",
+                   "FakeQuantOrWithDequantAbsMaxOp");
     OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out",
-                   "FakeQuantizeAbsMax");
+                   "FakeQuantOrWithDequantAbsMaxOp");
     OP_INOUT_CHECK(ctx->HasOutput("OutScale"), "Output", "OutScale",
-                   "FakeQuantizeAbsMax");
+                   "FakeQuantOrWithDequantAbsMaxOp");
     ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
     ctx->SetOutputDim("OutScale", {1});
     ctx->ShareLoD("X", /*->*/ "Out");
@@ -199,7 +200,8 @@ class FakeQuantizeAbsMaxOp : public framework::OperatorWithKernel {
   }
 };
 
-class FakeQuantizeAbsMaxOpMaker : public framework::OpProtoAndCheckerMaker {
+class FakeQuantOrWithDequantAbsMaxOpMaker
+    : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
     AddInput("X", "(Tensor) Input is float data type.");
@@ -217,12 +219,19 @@ class FakeQuantizeAbsMaxOpMaker : public framework::OpProtoAndCheckerMaker {
                                 bit_length));
         });
     AddComment(R"DOC(
-FakeQuantize operator
+This is a Base Op which supports FakeQuantAbsMaxOpMaker and FakeQuantDequantAbsMaxOpMaker.
+FakeQuantAbsMaxOp operator is used in the dynamic quantization.
 
 $$scale = max(abs(X))$$
 $$range = 2^{bit_length - 1} - 1$$
 $$Out = round(X/scale * range)$$
 
+FakeQuantDequantAbsMaxOp operator does the abs_max quantization and then dequantization.
+
+$$scale = max(abs(X))$$
+$$range = 2^{bit\_length - 1} - 1$$
+$$Out = round(X/scale * range) * scale / range$$
+
 )DOC");
   }
 };
@@ -414,14 +423,14 @@ class FakeQuantOrWithDequantMovingAverageAbsMaxOpMaker
                   "for training. Some layers may run faster when this is true.")
         .SetDefault(false);
     AddComment(R"DOC(
-This is a Base Op which support FakeQuantMovingAverageAbsMaxOp and FakeQuantDequantMovingAverageAbsMaxOp
-FakeQuantMovingAverageAbsMaxOp operator is used in static quantization.
+This is a Base Op which supports FakeQuantMovingAverageAbsMaxOp and FakeQuantDequantMovingAverageAbsMaxOp.
+FakeQuantMovingAverageAbsMaxOp operator is used in the static quantization.
 
 $$scale = (moving\_rate*accum+max(abs(x)))/(moving\_rate*state+1)$$
 $$range = 2^{bit\_length - 1} - 1$$
 $$Out = round(X/scale * range)$$
 
-FakeQuantDequantMovingAverageAbsMaxOp operator do the moving_average_abs_max op quant and then dequant.
+FakeQuantDequantMovingAverageAbsMaxOp operator does the moving_average_abs_max quant and then dequant.
 
 $$scale = (moving\_rate*accum+max(abs(x)))/(moving\_rate*state+1)$$
 $$range = 2^{bit\_length - 1} - 1$$
@@ -490,6 +499,43 @@ $$Out = X$$
   }
 };
 
+class FakeQuantDequantGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    auto out_grad_name = framework::GradVarName("Out");
+    auto x_grad_name = framework::GradVarName("X");
+    OP_INOUT_CHECK(ctx->HasInput(out_grad_name), "Input", out_grad_name,
+                   "FakeQuantDequantGradOp");
+    OP_INOUT_CHECK(ctx->HasOutput(x_grad_name), "Output", x_grad_name,
+                   "FakeQuantDequantGradOp");
+
+    ctx->SetOutputDim(x_grad_name, ctx->GetInputDim(out_grad_name));
+  }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    auto input_data_type = OperatorWithKernel::IndicateVarDataType(
+        ctx, framework::GradVarName("Out"));
+    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+  }
+};
+
+template <typename T>
+class FakeQuantDequantGradMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> grad_op) const override {
+    grad_op->SetType("fake_quantize_dequantize_grad");
+    grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
+    grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
+    grad_op->SetAttrMap(this->Attrs());
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
@@ -497,13 +543,21 @@ namespace ops = paddle::operators;
 using CPU = paddle::platform::CPUDeviceContext;
 
 REGISTER_OPERATOR(
-    fake_quantize_abs_max, ops::FakeQuantizeAbsMaxOp,
-    ops::FakeQuantizeAbsMaxOpMaker,
+    fake_quantize_abs_max, ops::FakeQuantOrWithDequantAbsMaxOp,
+    ops::FakeQuantOrWithDequantAbsMaxOpMaker,
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
 REGISTER_OP_CPU_KERNEL(fake_quantize_abs_max,
                        ops::FakeQuantizeAbsMaxKernel<CPU, float>);
 
+REGISTER_OPERATOR(fake_quantize_dequantize_abs_max,
+                  ops::FakeQuantOrWithDequantAbsMaxOp,
+                  ops::FakeQuantOrWithDequantAbsMaxOpMaker,
+                  ops::FakeQuantDequantGradMaker<paddle::framework::OpDesc>,
+                  ops::FakeQuantDequantGradMaker<paddle::imperative::OpBase>);
+REGISTER_OP_CPU_KERNEL(fake_quantize_dequantize_abs_max,
+                       ops::FakeQuantizeDequantizeAbsMaxKernel<CPU, float>);
+
 REGISTER_OPERATOR(
     fake_quantize_range_abs_max, ops::FakeQuantizeRangeAbsMaxOp,
     ops::FakeQuantizeRangeAbsMaxOpMaker,
@@ -518,16 +572,14 @@ REGISTER_OPERATOR(
     ops::FakeQuantOrWithDequantMovingAverageAbsMaxOpMaker,
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
-
 REGISTER_OP_CPU_KERNEL(fake_quantize_moving_average_abs_max,
                        ops::FakeQuantizeMovingAverageAbsMaxKernel<CPU, float>);
 
-REGISTER_OPERATOR(
-    fake_quantize_dequantize_moving_average_abs_max,
-    ops::FakeQuantOrWithDequantMovingAverageAbsMaxOp,
-    ops::FakeQuantOrWithDequantMovingAverageAbsMaxOpMaker,
-    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
-    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
+REGISTER_OPERATOR(fake_quantize_dequantize_moving_average_abs_max,
+                  ops::FakeQuantOrWithDequantMovingAverageAbsMaxOp,
+                  ops::FakeQuantOrWithDequantMovingAverageAbsMaxOpMaker,
+                  ops::FakeQuantDequantGradMaker<paddle::framework::OpDesc>,
+                  ops::FakeQuantDequantGradMaker<paddle::imperative::OpBase>);
 REGISTER_OP_CPU_KERNEL(
     fake_quantize_dequantize_moving_average_abs_max,
     ops::FakeQuantizeDequantizeMovingAverageAbsMaxKernel<CPU, float>);
@@ -547,3 +599,7 @@ REGISTER_OPERATOR(
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
 REGISTER_OP_CPU_KERNEL(moving_average_abs_max_scale,
                        ops::MovingAverageAbsMaxScaleKernel<CPU, float>);
+
+REGISTER_OPERATOR(fake_quantize_dequantize_grad, ops::FakeQuantDequantGradOp);
+REGISTER_OP_CPU_KERNEL(fake_quantize_dequantize_grad,
+                       ops::FakeQuantDequantGradKernel<CPU, float>);
diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu
index 6813c03933fd64575318488be02adcc1fddfa9e2..75a55fa821f0af664ad18cc20c90cd2f3d61d5d0 100644
--- a/paddle/fluid/operators/fake_quantize_op.cu
+++ b/paddle/fluid/operators/fake_quantize_op.cu
@@ -138,9 +138,9 @@ __global__ void ClipAndQuantDequantKernel(const T* in, const T* scale,
   int tid = threadIdx.x;
 
   T s = scale[0];
+  T inv_s = inverse(s);
   for (int i = bid; i < n; i += blockDim.x * gridDim.x) {
     T x = in[i];
-    T inv_s = inverse(s);
     T v = x > s ? s : x;
     v = v < -s ? -s : v;
     v = bin_cnt * inv_s * v;
@@ -335,6 +335,8 @@ namespace ops = paddle::operators;
 using CUDA = paddle::platform::CUDADeviceContext;
 REGISTER_OP_CUDA_KERNEL(fake_quantize_abs_max,
                         ops::FakeQuantizeAbsMaxKernel<CUDA, float>);
+REGISTER_OP_CUDA_KERNEL(fake_quantize_dequantize_abs_max,
+                        ops::FakeQuantizeDequantizeAbsMaxKernel<CUDA, float>);
 REGISTER_OP_CUDA_KERNEL(fake_channel_wise_quantize_abs_max,
                         ops::FakeChannelWiseQuantizeAbsMaxKernel<CUDA, float>);
 REGISTER_OP_CUDA_KERNEL(fake_quantize_range_abs_max,
@@ -347,3 +349,5 @@ REGISTER_OP_CUDA_KERNEL(moving_average_abs_max_scale,
 REGISTER_OP_CUDA_KERNEL(
     fake_quantize_dequantize_moving_average_abs_max,
     ops::FakeQuantizeDequantizeMovingAverageAbsMaxKernel<CUDA, float>);
+REGISTER_OP_CUDA_KERNEL(fake_quantize_dequantize_grad,
+                        ops::FakeQuantDequantGradKernel<CUDA, float>);
diff --git a/paddle/fluid/operators/fake_quantize_op.h b/paddle/fluid/operators/fake_quantize_op.h
index 5c27ee87481bf4fa51e6c4cc62ba2e8784753e37..fa5048852e7532d36c712b31109243bcce8abd33 100644
--- a/paddle/fluid/operators/fake_quantize_op.h
+++ b/paddle/fluid/operators/fake_quantize_op.h
@@ -21,6 +21,7 @@ limitations under the License. */
 #include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/fluid/platform/transform.h"
 
 namespace paddle {
 namespace operators {
@@ -81,7 +82,7 @@ struct FindMovingAverageAbsMaxFunctor {
 };
 
 template <typename DeviceContext, typename T>
-class FakeQuantizeAbsMaxKernel : public framework::OpKernel<T> {
+class FakeAbsMaxKernelBase : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* in = context.Input<framework::Tensor>("X");
@@ -95,8 +96,38 @@ class FakeQuantizeAbsMaxKernel : public framework::OpKernel<T> {
     auto& dev_ctx = context.template device_context<DeviceContext>();
     const T* in_data = in->data<T>();
     FindAbsMaxFunctor<DeviceContext, T>()(dev_ctx, in_data, in->numel(), out_s);
-    ClipAndFakeQuantFunctor<DeviceContext, T>()(dev_ctx, *in, *out_scale,
-                                                bin_cnt, out);
+    RunClipFunctor(dev_ctx, *in, *out_scale, bin_cnt, out);
+  }
+
+  virtual ~FakeAbsMaxKernelBase() = default;
+
+ protected:
+  virtual void RunClipFunctor(const DeviceContext& dev_ctx,
+                              const framework::Tensor& in,
+                              const framework::Tensor& scale, int bin_cnt,
+                              framework::Tensor* out) const = 0;
+};
+
+template <typename DeviceContext, typename T>
+class FakeQuantizeAbsMaxKernel : public FakeAbsMaxKernelBase<DeviceContext, T> {
+ protected:
+  void RunClipFunctor(const DeviceContext& dev_ctx, const framework::Tensor& in,
+                      const framework::Tensor& scale, int bin_cnt,
+                      framework::Tensor* out) const override {
+    ClipAndFakeQuantFunctor<DeviceContext, T>()(dev_ctx, in, scale, bin_cnt,
+                                                out);
+  }
+};
+
+template <typename DeviceContext, typename T>
+class FakeQuantizeDequantizeAbsMaxKernel
+    : public FakeAbsMaxKernelBase<DeviceContext, T> {
+ protected:
+  void RunClipFunctor(const DeviceContext& dev_ctx, const framework::Tensor& in,
+                      const framework::Tensor& scale, int bin_cnt,
+                      framework::Tensor* out) const override {
+    ClipAndFakeQuantDequantFunctor<DeviceContext, T>()(dev_ctx, in, scale,
+                                                       bin_cnt, out);
   }
 };
 
@@ -167,11 +198,6 @@ class FakeQuantizeRangeAbsMaxKernel : public framework::OpKernel<T> {
 template <typename DeviceContext, typename T>
 class FakeMovingAverageAbsMaxKernelBase : public framework::OpKernel<T> {
  public:
-  ~FakeMovingAverageAbsMaxKernelBase() {}
-  virtual void RunClipFunctor(const DeviceContext& dev_ctx,
-                              const framework::Tensor& in,
-                              const framework::Tensor& in_scale, int bin_cnt,
-                              framework::Tensor* out) const = 0;
   void Compute(const framework::ExecutionContext& context) const override {
     auto* in = context.Input<framework::Tensor>("X");
     auto* in_scale = context.Input<framework::Tensor>("InScale");
@@ -212,12 +238,20 @@ class FakeMovingAverageAbsMaxKernelBase : public framework::OpKernel<T> {
 
     RunClipFunctor(dev_ctx, *in, *out_scale, bin_cnt, out);
   }
+
+  virtual ~FakeMovingAverageAbsMaxKernelBase() = default;
+
+ protected:
+  virtual void RunClipFunctor(const DeviceContext& dev_ctx,
+                              const framework::Tensor& in,
+                              const framework::Tensor& in_scale, int bin_cnt,
+                              framework::Tensor* out) const = 0;
 };
 
 template <typename DeviceContext, typename T>
 class FakeQuantizeMovingAverageAbsMaxKernel
     : public FakeMovingAverageAbsMaxKernelBase<DeviceContext, T> {
- public:
+ protected:
   void RunClipFunctor(const DeviceContext& dev_ctx, const framework::Tensor& in,
                       const framework::Tensor& in_scale, int bin_cnt,
                       framework::Tensor* out) const override {
@@ -229,7 +263,7 @@ class FakeQuantizeMovingAverageAbsMaxKernel
 template <typename DeviceContext, typename T>
 class FakeQuantizeDequantizeMovingAverageAbsMaxKernel
     : public FakeMovingAverageAbsMaxKernelBase<DeviceContext, T> {
- public:
+ protected:
   void RunClipFunctor(const DeviceContext& dev_ctx, const framework::Tensor& in,
                       const framework::Tensor& in_scale, int bin_cnt,
                       framework::Tensor* out) const override {
@@ -277,5 +311,24 @@ class MovingAverageAbsMaxScaleKernel : public framework::OpKernel<T> {
   }
 };
 
+template <typename DeviceContext, typename T>
+class FakeQuantDequantGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* d_out =
+        context.Input<framework::LoDTensor>(framework::GradVarName("Out"));
+    auto x_grad_name = framework::GradVarName("X");
+    auto* d_x = context.Output<framework::LoDTensor>(x_grad_name);
+    PADDLE_ENFORCE_NOT_NULL(
+        d_x, platform::errors::PreconditionNotMet(
+                 "FakeQuantDequantGradOp doesn't have the output named %s.",
+                 x_grad_name));
+
+    // Initialize dx as same as d_out
+    d_x->mutable_data<T>(context.GetPlace());
+    framework::TensorCopy(*d_out, context.GetPlace(), d_x);
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc
index 36873f16808e60730425c4ef4383d3159e5c8854..35d54577bfef88000cbd75607e68d35e72ac2a18 100644
--- a/paddle/fluid/operators/fill_constant_op.cc
+++ b/paddle/fluid/operators/fill_constant_op.cc
@@ -51,6 +51,17 @@ class FillConstantOp : public framework::OperatorWithKernel {
   }
 
  protected:
+  framework::OpKernelType GetKernelTypeForVar(
+      const std::string& var_name, const framework::Tensor& tensor,
+      const framework::OpKernelType& expected_kernel_type) const override {
+    if (var_name == "ShapeTensor" || var_name == "ShapeTensorList") {
+      return expected_kernel_type;
+    } else {
+      return framework::OpKernelType(expected_kernel_type.data_type_,
+                                     tensor.place(), tensor.layout());
+    }
+  }
+
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
diff --git a/paddle/fluid/operators/flip_op.cc b/paddle/fluid/operators/flip_op.cc
index 7d0df5ffbd8945ca054fe24088b5fd7b6f5ef167..fc17657594b7a88d15bd2d9f184bc1bf71a71bc2 100644
--- a/paddle/fluid/operators/flip_op.cc
+++ b/paddle/fluid/operators/flip_op.cc
@@ -36,46 +36,52 @@ class FlipOp : public framework::OperatorWithKernel {
                       platform::errors::NotFound(
                           "Output(Out) of FlipOp should not be null."));
     auto x_dims = ctx->GetInputDim("X");
-    auto flip_dims = ctx->Attrs().Get<std::vector<int>>("dims");
+    auto flip_dims = ctx->Attrs().Get<std::vector<int>>("axis");
     size_t flip_dims_size = flip_dims.size();
 
-    // check if dims axis within range
-    auto min_max_d = std::minmax_element(flip_dims.begin(), flip_dims.end());
-    PADDLE_ENFORCE_LT(*min_max_d.first, x_dims.size(),
-                      platform::errors::InvalidArgument(
-                          "min(dims) should be less than the input tensor X's "
-                          "dimensions of FlipOp. But received min(dims) = %d,  "
-                          "X's dimensions = %d, X's shape = [%s]",
-                          *min_max_d.first, x_dims.size(), x_dims));
-    PADDLE_ENFORCE_GE(
-        *min_max_d.first, x_dims.size() * -1,
-        platform::errors::InvalidArgument(
-            "min(dims) should be greater than or equal to the input tensor X's "
-            "dimensions of FlipOp times -1. But received min(dims) = %d,  X's "
-            "dimensions = %d, X's shape = [%s]",
-            *min_max_d.first, x_dims.size() * -1, x_dims));
-    PADDLE_ENFORCE_LT(*min_max_d.second, x_dims.size(),
-                      platform::errors::InvalidArgument(
-                          "max(dims) should be less than the input tensor X's "
-                          "dimensions of FlipOp. But received max(dims) = %d,  "
-                          "X's dimensions = %d, X's shape = [%s]",
-                          *min_max_d.second, x_dims.size(), x_dims));
-    PADDLE_ENFORCE_GE(
-        *min_max_d.second, x_dims.size() * -1,
-        platform::errors::InvalidArgument(
-            "max(dims) should be greater than or equal to the input tensor X's "
-            "dimensions of FlipOp times -1. But received max(dims) = %d,  X's "
-            "dimensions = %d, X's shape = [%s]",
-            *min_max_d.second, x_dims.size() * -1, x_dims));
-
-    // check duplicates in dims
-    flip_dims.erase(std::unique(flip_dims.begin(), flip_dims.end()),
-                    flip_dims.end());
-    PADDLE_ENFORCE_EQ(flip_dims.size(), flip_dims_size,
-                      platform::errors::InvalidArgument(
-                          "dims has duplicates, original flip dims size=%d, "
-                          "but unique flip dims size=%d.)",
-                          flip_dims_size, flip_dims.size()));
+    if (flip_dims_size > 0) {
+      // check if dims axis within range
+      auto min_max_d = std::minmax_element(flip_dims.begin(), flip_dims.end());
+      PADDLE_ENFORCE_LT(
+          *min_max_d.first, x_dims.size(),
+          platform::errors::InvalidArgument(
+              "min(axes) should be less than the input tensor X's "
+              "axes of FlipOp. But received min(axes) = %d,  "
+              "X's axes = %d, X's shape = [%s]",
+              *min_max_d.first, x_dims.size(), x_dims));
+      PADDLE_ENFORCE_GE(*min_max_d.first, x_dims.size() * -1,
+                        platform::errors::InvalidArgument(
+                            "min(axes) should be greater than or equal to the "
+                            "input tensor X's "
+                            "axes of FlipOp times -1. But received "
+                            "min(axes) = %d,  X's "
+                            "axes = %d, X's shape = [%s]",
+                            *min_max_d.first, x_dims.size() * -1, x_dims));
+      PADDLE_ENFORCE_LT(
+          *min_max_d.second, x_dims.size(),
+          platform::errors::InvalidArgument(
+              "max(axes) should be less than the input tensor X's "
+              "axes of FlipOp. But received max(axes) = %d,  "
+              "X's axes = %d, X's shape = [%s]",
+              *min_max_d.second, x_dims.size(), x_dims));
+      PADDLE_ENFORCE_GE(*min_max_d.second, x_dims.size() * -1,
+                        platform::errors::InvalidArgument(
+                            "max(axes) should be greater than or equal to the "
+                            "input tensor X's "
+                            "axes of FlipOp times -1. But received "
+                            "max(axes) = %d,  X's "
+                            "axes = %d, X's shape = [%s]",
+                            *min_max_d.second, x_dims.size() * -1, x_dims));
+
+      // check duplicates in dims
+      flip_dims.erase(std::unique(flip_dims.begin(), flip_dims.end()),
+                      flip_dims.end());
+      PADDLE_ENFORCE_EQ(flip_dims.size(), flip_dims_size,
+                        platform::errors::InvalidArgument(
+                            "axes has duplicates, original flip axes size=%d, "
+                            "but unique flip axes size=%d.)",
+                            flip_dims_size, flip_dims.size()));
+    }
 
     VLOG(3) << "flip operator x.shape=" << x_dims;
 
@@ -104,10 +110,10 @@ class FlipOpMaker : public framework::OpProtoAndCheckerMaker {
   void Make() override {
     AddInput("X", "(Tensor), The input tensor of flip op.");
     AddOutput("Out", "(Tensor), The output tensor of flip op.");
-    AddAttr<std::vector<int>>("dims", "The axes to flip on.");
+    AddAttr<std::vector<int>>("axis", "The axes to flip on.");
     AddComment(R"DOC(
           Flip Operator.
-          Reverse the order of a n-D tensor along given axis in dims.
+          Reverse the order of a n-D tensor along given axis in axes.
       )DOC");
   }
 };
diff --git a/paddle/fluid/operators/flip_op.cu b/paddle/fluid/operators/flip_op.cu
index 41aae1e1f35a6bda1d926dec711b4ce01ea65f4b..581a994ba84b5e288690ed8f9fb07bc092b67569 100644
--- a/paddle/fluid/operators/flip_op.cu
+++ b/paddle/fluid/operators/flip_op.cu
@@ -81,7 +81,7 @@ class FlipKernel<platform::CUDADeviceContext, T>
     Tensor* out = ctx.Output<Tensor>("Out");
     auto* in_data = x->data<T>();
     auto* out_data = out->mutable_data<T>(ctx.GetPlace());
-    auto flip_dims = ctx.template Attr<std::vector<int>>("dims");
+    auto flip_dims = ctx.template Attr<std::vector<int>>("axis");
 
     const int flip_dims_size = static_cast<int>(flip_dims.size());
     auto x_dims = x->dims();
diff --git a/paddle/fluid/operators/flip_op.h b/paddle/fluid/operators/flip_op.h
index 73d73f5d0f2e06dc4049f4b10ea7a12d63193c40..b77827b782b1aa6999b447c8b64bb2339af7b8e3 100644
--- a/paddle/fluid/operators/flip_op.h
+++ b/paddle/fluid/operators/flip_op.h
@@ -41,7 +41,7 @@ class FlipKernel<platform::CPUDeviceContext, T>
   void Compute(const framework::ExecutionContext& ctx) const override {
     const Tensor* x = ctx.Input<Tensor>("X");
     Tensor* out = ctx.Output<Tensor>("Out");
-    auto flip_dims = ctx.template Attr<std::vector<int>>("dims");
+    auto flip_dims = ctx.template Attr<std::vector<int>>("axis");
 
     auto x_dims = x->dims();
     const int total_dims = x_dims.size();
diff --git a/paddle/fluid/operators/gather.cu.h b/paddle/fluid/operators/gather.cu.h
index 979deb8919ed6ec583248c8f084e83b805f75d87..f59d46ec79bd0960392ed1b8b3c8ee27b2317e39 100644
--- a/paddle/fluid/operators/gather.cu.h
+++ b/paddle/fluid/operators/gather.cu.h
@@ -27,15 +27,11 @@ namespace operators {
 using framework::Tensor;
 using platform::DeviceContext;
 
-#define CUDA_1D_KERNEL_LOOP(i, n)                              \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
-
 template <typename T, typename IndexT = int>
 __global__ void GatherCUDAKernel(const T* params, const IndexT* indices,
                                  T* output, size_t index_size,
                                  size_t slice_size) {
-  CUDA_1D_KERNEL_LOOP(i, index_size * slice_size) {
+  CUDA_KERNEL_LOOP(i, index_size * slice_size) {
     int indices_i = i / slice_size;
     int slice_i = i - indices_i * slice_size;  // offset inside the slice
     IndexT gather_i = indices[indices_i];
@@ -49,7 +45,7 @@ __global__ void GatherNdCUDAKernel(const T* input, const int* input_dims,
                                    const IndexT* indices, T* output,
                                    size_t remain_size, size_t slice_size,
                                    size_t end_size) {
-  CUDA_1D_KERNEL_LOOP(i, remain_size * slice_size) {
+  CUDA_KERNEL_LOOP(i, remain_size * slice_size) {
     int indices_i = i / slice_size;
     int slice_i = i - indices_i * slice_size;  // offset inside the slice
     IndexT gather_i = 0;
diff --git a/paddle/fluid/operators/gather_tree_op.cu b/paddle/fluid/operators/gather_tree_op.cu
index 7ea3641b99f1a824b133c04029354b3a2f59578b..c53f1e81cef54e266ce36147baa89d104d2ec99d 100644
--- a/paddle/fluid/operators/gather_tree_op.cu
+++ b/paddle/fluid/operators/gather_tree_op.cu
@@ -19,15 +19,11 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-#define CUDA_1D_KERNEL_LOOP(i, n)                              \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
-
 template <typename T>
 __global__ void GatherTree(const T *ids_data, const T *parents_data,
                            T *out_data, const int64_t max_length,
                            const int64_t batch_size, const int64_t beam_size) {
-  CUDA_1D_KERNEL_LOOP(i, batch_size * beam_size) {
+  CUDA_KERNEL_LOOP(i, batch_size * beam_size) {
     int batch = i / beam_size;
     int beam = i % beam_size;
     auto idx =
diff --git a/paddle/fluid/operators/gaussian_random_op.cc b/paddle/fluid/operators/gaussian_random_op.cc
index 0f1e4de5cbb88f10bc77791a385c73b5e8df24f0..253078751ce66dd2a6d52dbdd5fe6b5c0ed21849 100644
--- a/paddle/fluid/operators/gaussian_random_op.cc
+++ b/paddle/fluid/operators/gaussian_random_op.cc
@@ -98,7 +98,7 @@ class GaussianRandomOp : public framework::OperatorWithKernel {
 
       return;
     }
-    if (!(ctx->HasInput("ShapeTensor") && !ctx->HasInputs("ShapeTensorList"))) {
+    if (!ctx->HasInput("ShapeTensor") && !ctx->HasInputs("ShapeTensorList")) {
       PADDLE_ENFORCE_GT(
           shape.size(), 0UL,
           platform::errors::InvalidArgument(
diff --git a/paddle/fluid/operators/histogram_op.cu b/paddle/fluid/operators/histogram_op.cu
index 359e90bfc3ac59620c6634fad6607d78aec53ec8..3de24ead0de36245f96af4bb7b6c72209b37f885 100644
--- a/paddle/fluid/operators/histogram_op.cu
+++ b/paddle/fluid/operators/histogram_op.cu
@@ -27,10 +27,6 @@ using IndexType = int64_t;
 using Tensor = framework::Tensor;
 using platform::PADDLE_CUDA_NUM_THREADS;
 
-#define CUDA_KERNEL_LOOP(i, n)                                 \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
-
 inline int GET_BLOCKS(const int N) {
   return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS;
 }
diff --git a/paddle/fluid/operators/index_select_op.h b/paddle/fluid/operators/index_select_op.h
index 96ec18d9a0d219bfd5fa10f45ca63e8c6fb7bce3..70714b7f3a0644e55c9bac27a88edb0b8a9921a4 100644
--- a/paddle/fluid/operators/index_select_op.h
+++ b/paddle/fluid/operators/index_select_op.h
@@ -52,6 +52,23 @@ void IndexSelectInner(const framework::ExecutionContext& context,
   TensorToVector(index, context.device_context(), &index_vec);
   std::vector<T> out_vec(output->numel());
 
+  for (int i = 0; i < index_size; i++) {
+    PADDLE_ENFORCE_GE(
+        index_vec[i], 0,
+        platform::errors::InvalidArgument(
+            "Variable value (index) of OP(index_select) "
+            "expected >= 0 and < %ld, but got %ld. Please check input "
+            "value.",
+            input_dim[dim], index_vec[i]));
+    PADDLE_ENFORCE_LT(
+        index_vec[i], input_dim[dim],
+        platform::errors::InvalidArgument(
+            "Variable value (index) of OP(index_select) "
+            "expected >= 0 and < %ld, but got %ld. Please check input "
+            "value.",
+            input_dim[dim], index_vec[i]));
+  }
+
   VLOG(3) << "Index_Select_Debug; outer_nums: " << outer_nums
           << "; slice_size: " << slice_size << "; input_width: " << input_width
           << "; output_width: " << output_width
diff --git a/paddle/fluid/operators/instance_norm_op.cc b/paddle/fluid/operators/instance_norm_op.cc
index d2b59a239a2e338012de01434b79d657dd96053a..f72f7e8b85b873d9be57c8ff348e6adb2251d65d 100644
--- a/paddle/fluid/operators/instance_norm_op.cc
+++ b/paddle/fluid/operators/instance_norm_op.cc
@@ -24,8 +24,6 @@ namespace operators {
 
 void InstanceNormOp::InferShape(framework::InferShapeContext *ctx) const {
   OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "InstanceNorm");
-  OP_INOUT_CHECK(ctx->HasInput("Scale"), "Input", "Scale", "InstanceNorm");
-  OP_INOUT_CHECK(ctx->HasInput("Bias"), "Input", "Bias", "InstanceNorm");
   OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Y", "InstanceNorm");
   OP_INOUT_CHECK(ctx->HasOutput("SavedMean"), "Output", "SavedMean",
                  "InstanceNorm");
@@ -51,37 +49,45 @@ void InstanceNormOp::InferShape(framework::InferShapeContext *ctx) const {
   auto C = x_dims[1];
   auto NxC = N * C;
 
-  auto scale_dim = ctx->GetInputDim("Scale");
-  auto bias_dim = ctx->GetInputDim("Bias");
-
-  PADDLE_ENFORCE_EQ(
-      scale_dim.size(), 1UL,
-      platform::errors::InvalidArgument(
-          "ShapeError: the dimension of scale must equal to 1."
-          "But received: the shape of scale is [%s], the dimension "
-          "of scale is [%d]",
-          scale_dim, scale_dim.size()));
-  PADDLE_ENFORCE_EQ(bias_dim.size(), 1UL,
-                    platform::errors::InvalidArgument(
-                        "ShapeError: the dimension of bias must equal to 1."
-                        "But received: the shape of bias is [%s],the dimension "
-                        "of bias is [%d]",
-                        bias_dim, bias_dim.size()));
-
-  bool check = !((!ctx->IsRuntime()) && (framework::product(scale_dim) <= 0 ||
-                                         framework::product(bias_dim) <= 0));
-
-  if (check) {
-    PADDLE_ENFORCE_EQ(scale_dim[0], C,
-                      platform::errors::InvalidArgument(
-                          "ShapeError: the shape of scale must equal to [%d]"
-                          "But received: the shape of scale is [%d]",
-                          C, scale_dim[0]));
-    PADDLE_ENFORCE_EQ(bias_dim[0], C,
-                      platform::errors::InvalidArgument(
-                          "ShapeError: the shape of bias must equal to [%d]"
-                          "But received: the shape of bias is [%d]",
-                          C, bias_dim[0]));
+  if (ctx->HasInput("Scale")) {
+    auto scale_dim = ctx->GetInputDim("Scale");
+
+    PADDLE_ENFORCE_EQ(
+        scale_dim.size(), 1UL,
+        platform::errors::InvalidArgument(
+            "ShapeError: the dimension of scale must equal to 1."
+            "But received: the shape of scale is [%s], the dimension "
+            "of scale is [%d]",
+            scale_dim, scale_dim.size()));
+
+    bool check = !((!ctx->IsRuntime()) && (framework::product(scale_dim) <= 0));
+
+    if (check) {
+      PADDLE_ENFORCE_EQ(scale_dim[0], C,
+                        platform::errors::InvalidArgument(
+                            "ShapeError: the shape of scale must equal to [%d]"
+                            "But received: the shape of scale is [%d]",
+                            C, scale_dim[0]));
+    }
+  }
+  if (ctx->HasInput("Bias")) {
+    auto bias_dim = ctx->GetInputDim("Bias");
+    PADDLE_ENFORCE_EQ(
+        bias_dim.size(), 1UL,
+        platform::errors::InvalidArgument(
+            "ShapeError: the dimension of bias must equal to 1."
+            "But received: the shape of bias is [%s],the dimension "
+            "of bias is [%d]",
+            bias_dim, bias_dim.size()));
+
+    bool check = !((!ctx->IsRuntime()) && (framework::product(bias_dim) <= 0));
+    if (check) {
+      PADDLE_ENFORCE_EQ(bias_dim[0], C,
+                        platform::errors::InvalidArgument(
+                            "ShapeError: the shape of bias must equal to [%d]"
+                            "But received: the shape of bias is [%d]",
+                            C, bias_dim[0]));
+    }
   }
 
   ctx->SetOutputDim("Y", x_dims);
@@ -100,12 +106,16 @@ framework::OpKernelType InstanceNormOp::GetExpectedKernelType(
   if (input_data_type == framework::proto::VarType::FP64) {
     in_param_type = framework::proto::VarType::FP64;
   }
-  PADDLE_ENFORCE_EQ(
-      in_param_type, ctx.Input<Tensor>("Scale")->type(),
-      platform::errors::InvalidArgument("Scale input should be of float type"));
-  PADDLE_ENFORCE_EQ(
-      in_param_type, ctx.Input<Tensor>("Bias")->type(),
-      platform::errors::InvalidArgument("Bias input should be of float type"));
+  if (ctx.HasInput("Scale")) {
+    PADDLE_ENFORCE_EQ(in_param_type, ctx.Input<Tensor>("Scale")->type(),
+                      platform::errors::InvalidArgument(
+                          "Scale input should be of float type"));
+  }
+  if (ctx.HasInput("Bias")) {
+    PADDLE_ENFORCE_EQ(in_param_type, ctx.Input<Tensor>("Bias")->type(),
+                      platform::errors::InvalidArgument(
+                          "Bias input should be of float type"));
+  }
 
   return framework::OpKernelType(input_data_type, ctx.GetPlace());
 }
@@ -121,10 +131,12 @@ void InstanceNormOpMaker::Make() {
   AddInput("X", "The input tensor");
   AddInput("Scale",
            "Scale is a 1-dimensional tensor of size C "
-           "that is applied to the output");
+           "that is applied to the output")
+      .AsDispensable();
   AddInput("Bias",
            "Bias is a 1-dimensional tensor of size C "
-           "that is applied to the output");
+           "that is applied to the output")
+      .AsDispensable();
   AddOutput("Y", "result after normalization");
   AddOutput("SavedMean",
             "Mean of the current mini batch, "
@@ -199,9 +211,26 @@ class InstanceNormKernel<platform::CPUDeviceContext, T>
 
     const auto *scale = ctx.Input<Tensor>("Scale");
     const auto *bias = ctx.Input<Tensor>("Bias");
-    auto scale_e = framework::EigenVector<T>::Flatten(*scale);
+
+    Tensor scale_data;
+    Tensor bias_data;
+    if (!scale) {
+      scale_data.mutable_data<T>({C}, ctx.GetPlace());
+      set_constant(dev_ctx, &scale_data, static_cast<T>(1));
+    }
+
+    if (!bias) {
+      bias_data.mutable_data<T>({C}, ctx.GetPlace());
+      set_constant(dev_ctx, &bias_data, static_cast<T>(0));
+    }
+    auto scale_e = scale
+                       ? framework::EigenVector<T>::Flatten(*scale)
+                       : framework::EigenVector<T>::Flatten(
+                             const_cast<const framework::Tensor &>(scale_data));
     auto scale_arr = scale_e.reshape(C_shape);
-    auto bias_e = framework::EigenVector<T>::Flatten(*bias);
+    auto bias_e = bias ? framework::EigenVector<T>::Flatten(*bias)
+                       : framework::EigenVector<T>::Flatten(
+                             const_cast<const framework::Tensor &>(bias_data));
     auto bias_arr = bias_e.reshape(C_shape);
 
     y->mutable_data<T>(ctx.GetPlace());
@@ -219,7 +248,6 @@ class InstanceNormKernel<platform::CPUDeviceContext, T>
 
 void InstanceNormGradOp::InferShape(framework::InferShapeContext *ctx) const {
   OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "InstanceNormGrad");
-  OP_INOUT_CHECK(ctx->HasInput("Scale"), "Input", "Scale", "InstanceNormGrad");
   OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Y")), "Input",
                  framework::GradVarName("Y"), "InstanceNormGrad");
   OP_INOUT_CHECK(ctx->HasInput("SavedMean"), "Input", "SavedMean",
@@ -230,15 +258,13 @@ void InstanceNormGradOp::InferShape(framework::InferShapeContext *ctx) const {
   // check output
   OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output",
                  framework::GradVarName("X"), "InstanceNormGrad");
-  if (ctx->HasOutput(framework::GradVarName("Scale"))) {
-    OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Bias")), "Output",
-                   framework::GradVarName("Bias"), "InstanceNormGrad");
-  }
   const auto x_dims = ctx->GetInputDim("X");
   const int C = x_dims[1];
   ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
   if (ctx->HasOutput(framework::GradVarName("Scale"))) {
     ctx->SetOutputDim(framework::GradVarName("Scale"), {C});
+  }
+  if (ctx->HasOutput(framework::GradVarName("Bias"))) {
     ctx->SetOutputDim(framework::GradVarName("Bias"), {C});
   }
 }
@@ -299,7 +325,18 @@ class InstanceNormGradKernel<platform::CPUDeviceContext, T>
     Eigen::DSizes<int, 2> param_shape(N, C);
     Eigen::DSizes<int, 2> shape(NxC, sample_size);
 
-    auto scale_e = framework::EigenVector<T>::Flatten(*scale);
+    math::SetConstant<platform::CPUDeviceContext, T> set_constant;
+
+    Tensor scale_data;
+    if (!scale) {
+      scale_data.mutable_data<T>({C}, ctx.GetPlace());
+      set_constant(dev_ctx, &scale_data, static_cast<T>(1));
+    }
+
+    auto scale_e = scale
+                       ? framework::EigenVector<T>::Flatten(*scale)
+                       : framework::EigenVector<T>::Flatten(
+                             const_cast<const framework::Tensor &>(scale_data));
     auto mean_e = framework::EigenVector<T>::Flatten(*saved_mean);
     auto inv_var_e = framework::EigenVector<T>::Flatten(*saved_inv_variance);
     auto dy_e = framework::EigenVector<T>::Flatten(*d_y);
@@ -314,7 +351,6 @@ class InstanceNormGradKernel<platform::CPUDeviceContext, T>
     auto tmp = (x_arr - mean_arr.eval().broadcast(bcast)) *
                inv_var_arr.eval().broadcast(bcast);
 
-    math::SetConstant<platform::CPUDeviceContext, T> set_constant;
     // math: d_bias = np.sum(d_y, axis=(n,h,w))
     // math: d_scale = np.sum((X-mean) / inv_std * dy, axis=(n, h,w))
     if (d_scale && d_bias) {
@@ -324,8 +360,8 @@ class InstanceNormGradKernel<platform::CPUDeviceContext, T>
       set_constant(dev_ctx, d_bias, static_cast<T>(0));
 
       auto d_scale_e = framework::EigenVector<T>::Flatten(*d_scale);
-      auto d_bias_e = framework::EigenVector<T>::Flatten(*d_bias);
       auto d_scale_data = d_scale_e.reshape(C_shape);
+      auto d_bias_e = framework::EigenVector<T>::Flatten(*d_bias);
       auto d_bias_data = d_bias_e.reshape(C_shape);
       d_bias_data.device(*place) =
           dy_arr.sum(mean_rdims).reshape(param_shape).sum(rdims);
@@ -360,8 +396,6 @@ class InstanceNormGradKernel<platform::CPUDeviceContext, T>
 void InstanceNormDoubleGradOp::InferShape(
     framework::InferShapeContext *ctx) const {
   OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "InstanceNormDoubleGrad");
-  OP_INOUT_CHECK(ctx->HasInput("Scale"), "Input", "Scale",
-                 "InstanceNormDoubleGrad");
   OP_INOUT_CHECK(ctx->HasInput("SavedMean"), "Input", "SavedMean",
                  "InstanceNormDoubleGrad");
   OP_INOUT_CHECK(ctx->HasInput("SavedVariance"), "Input", "SavedVariance",
@@ -426,6 +460,9 @@ class InstanceNormDoubleGradKernel<platform::CPUDeviceContext, T>
     auto *dScale = ctx.Output<Tensor>("DScale");
     auto *ddY = ctx.Output<Tensor>("DDY");
 
+    auto &dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
+    math::SetConstant<platform::CPUDeviceContext, T> set_constant;
+
     const auto &x_dims = X->dims();
     int N, C, H, W, D;
     ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D);
@@ -455,7 +492,13 @@ class InstanceNormDoubleGradKernel<platform::CPUDeviceContext, T>
     mean_tile_data = mean_arr.transpose().replicate(sample_size, 1);
     inv_var_tile_data = inv_var_arr.transpose().replicate(sample_size, 1);
 
-    ConstEigenVectorArrayMap<T> scale_arr(Scale->data<T>(), C);
+    Tensor Scale_data;
+    if (!Scale) {
+      Scale_data.mutable_data<T>({C}, ctx.GetPlace());
+      set_constant(dev_ctx, &Scale_data, static_cast<T>(1));
+    }
+    ConstEigenVectorArrayMap<T> scale_arr(
+        Scale ? Scale->data<T>() : Scale_data.data<T>(), C);
 
     Tensor scale_tile;
     scale_tile.Resize({sample_size, NxC});
@@ -483,9 +526,6 @@ class InstanceNormDoubleGradKernel<platform::CPUDeviceContext, T>
     //          inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean),
     //          axis=(h,w))))
 
-    auto &dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
-    math::SetConstant<platform::CPUDeviceContext, T> set_constant;
-
     Tensor x_sub_mean_mul_invstd;
     x_sub_mean_mul_invstd.Resize({sample_size, NxC});
     x_sub_mean_mul_invstd.mutable_data<T>(ctx.GetPlace());
diff --git a/paddle/fluid/operators/instance_norm_op.cu b/paddle/fluid/operators/instance_norm_op.cu
index 1567c229cdc2d587479323d12a81bedcf02a8a5b..51313835ebad4b269e7cd2348d50e9b436b22bdd 100644
--- a/paddle/fluid/operators/instance_norm_op.cu
+++ b/paddle/fluid/operators/instance_norm_op.cu
@@ -35,8 +35,7 @@ using BatchNormParamType = typename CudnnDataType<T>::BatchNormParamType;
 template <typename T>
 static __global__ void repeat_param(const T *input, T *output,
                                     const int repeat_num, const int C) {
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < repeat_num * C;
-       i += blockDim.x * gridDim.x) {
+  CUDA_KERNEL_LOOP(i, repeat_num * C) {
     int index = i % C;
     output[i] = input[index];
   }
@@ -146,10 +145,19 @@ class InstanceNormKernel<platform::CUDADeviceContext, T>
     const int max_blocks = std::max(max_threads / block, 1);
     const int grid = std::min((NxC + block - 1) / block, max_blocks);
 
-    repeat_param<T><<<grid, block, 0, dev_ctx.stream()>>>(
-        scale->data<T>(), scale_tmp.data<T>(), N, C);
-    repeat_param<T><<<grid, block, 0, dev_ctx.stream()>>>(
-        bias->data<T>(), bias_tmp.data<T>(), N, C);
+    math::SetConstant<platform::CUDADeviceContext, T> set_constant;
+    if (scale) {
+      repeat_param<T><<<grid, block, 0, dev_ctx.stream()>>>(
+          scale->data<T>(), scale_tmp.data<T>(), N, C);
+    } else {
+      set_constant(dev_ctx, &scale_tmp, static_cast<T>(1));
+    }
+    if (bias) {
+      repeat_param<T><<<grid, block, 0, dev_ctx.stream()>>>(
+          bias->data<T>(), bias_tmp.data<T>(), N, C);
+    } else {
+      set_constant(dev_ctx, &bias_tmp, static_cast<T>(0));
+    }
 
     auto handle = dev_ctx.cudnn_handle();
 
@@ -267,24 +275,27 @@ class InstanceNormGradKernel<platform::CUDADeviceContext, T>
       d_scale->mutable_data<T>(ctx.GetPlace());
       d_bias->mutable_data<T>(ctx.GetPlace());
     }
-    PADDLE_ENFORCE_EQ(
-        scale->dims().size(), 1UL,
-        platform::errors::InvalidArgument(
-            "The `shape` in InstanceNormOp is invalid: "
-            "the size of scale's dimensions must be equal to 1. But "
-            "received: the size of scale's dimensions"
-            "is [%d]",
-            scale->dims().size()));
-    PADDLE_ENFORCE_EQ(scale->dims()[0], C,
-                      platform::errors::InvalidArgument(
-                          "The `shape` in InstanceNormOp is invalid: "
-                          "the first dimension of scale must be equal to "
-                          "Channels([%d]). But received: "
-                          "the first dimension of scale is [%d],"
-                          "the dimensions of scale is [%s], ",
-                          C, scale->dims()[0], scale->dims()));
+    if (scale) {
+      PADDLE_ENFORCE_EQ(
+          scale->dims().size(), 1UL,
+          platform::errors::InvalidArgument(
+              "The `shape` in InstanceNormOp is invalid: "
+              "the size of scale's dimensions must be equal to 1. But "
+              "received: the size of scale's dimensions"
+              "is [%d]",
+              scale->dims().size()));
+      PADDLE_ENFORCE_EQ(scale->dims()[0], C,
+                        platform::errors::InvalidArgument(
+                            "The `shape` in InstanceNormOp is invalid: "
+                            "the first dimension of scale must be equal to "
+                            "Channels([%d]). But received: "
+                            "the first dimension of scale is [%d],"
+                            "the dimensions of scale is [%s], ",
+                            C, scale->dims()[0], scale->dims()));
+    }
 
     auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    math::SetConstant<platform::CUDADeviceContext, T> set_constant;
 
     const int n = x->numel();
     const int block = 512;
@@ -300,8 +311,12 @@ class InstanceNormGradKernel<platform::CUDADeviceContext, T>
         ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({NxC}, dev_ctx);
     Tensor d_bias_tmp =
         ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({NxC}, dev_ctx);
-    repeat_param<T><<<grid, block, 0, dev_ctx.stream()>>>(
-        scale->data<T>(), scale_tmp.data<T>(), N, C);
+    if (scale) {
+      repeat_param<T><<<grid, block, 0, dev_ctx.stream()>>>(
+          scale->data<T>(), scale_tmp.data<T>(), N, C);
+    } else {
+      set_constant(dev_ctx, &scale_tmp, static_cast<T>(1));
+    }
 
     std::vector<int> dims;
     std::vector<int> strides;
@@ -361,7 +376,7 @@ class InstanceNormGradKernel<platform::CUDADeviceContext, T>
     } else {
       if (d_x) {
         GradComputeDX<T, block><<<NxC, block, 0, dev_ctx.stream()>>>(
-            d_y->data<T>(), scale->data<BatchNormParamType<T>>(),
+            d_y->data<T>(), scale_tmp.data<BatchNormParamType<T>>(),
             saved_mean_data, x->data<T>(), saved_var_data, C, H * W * D,
             d_x->data<T>());
       }
@@ -610,7 +625,6 @@ class InstanceNormDoubleGradKernel<platform::CUDADeviceContext, T>
     auto *ddY = ctx.Output<Tensor>("DDY");
 
     const T *x_data = X->data<T>();
-    const T *scale_data = Scale->data<T>();
     const T *dy_data = dY->data<T>();
     const T *ddx_data = (ddX == nullptr ? nullptr : ddX->data<T>());
 
@@ -620,6 +634,9 @@ class InstanceNormDoubleGradKernel<platform::CUDADeviceContext, T>
     const T *mean_data = Saved_mean->data<T>();
     const T *variance_data = Saved_variance->data<T>();
 
+    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    math::SetConstant<platform::CUDADeviceContext, T> set_zero;
+
     auto &x_dims = X->dims();
     int N, C, H, W, D;
     ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D);
@@ -627,15 +644,19 @@ class InstanceNormDoubleGradKernel<platform::CUDADeviceContext, T>
     const int n = X->numel();
     int sample_size = n / N / C;
 
-    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    Tensor scale_tmp;
+    if (!Scale) {
+      scale_tmp.mutable_data<T>({C}, ctx.GetPlace());
+      set_zero(dev_ctx, &scale_tmp, static_cast<T>(1));
+    }
+    const T *scale_data = Scale ? Scale->data<T>() : scale_tmp.data<T>();
+
     const int block = 512;
     int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
     const int max_blocks = std::max(max_threads / block, 1);
     const int grid = NxC;
     const int grid1 = (C + block - 1) / block;
 
-    math::SetConstant<platform::CUDADeviceContext, T> set_zero;
-
     if (dX) {
       T *dx_data = dX->mutable_data<T>(ctx.GetPlace());
       set_zero(dev_ctx, dX, static_cast<T>(0));
diff --git a/paddle/fluid/operators/linspace_op.cu b/paddle/fluid/operators/linspace_op.cu
index 90bd17cda0e0d1f78810233537bb502f9115fbd0..47d4536dcfe2a0ab43b3584196a138214e438e3e 100644
--- a/paddle/fluid/operators/linspace_op.cu
+++ b/paddle/fluid/operators/linspace_op.cu
@@ -19,13 +19,9 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-#define CUDA_1D_KERNEL_LOOP(i, n)                              \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
-
 template <typename T>
 __global__ void LinspaceKernel(T start, T step, int64_t size, T* out) {
-  CUDA_1D_KERNEL_LOOP(index, size) { out[index] = start + step * index; }
+  CUDA_KERNEL_LOOP(index, size) { out[index] = start + step * index; }
 }
 
 template <typename T>
diff --git a/paddle/fluid/operators/lite/ut_helper.h b/paddle/fluid/operators/lite/ut_helper.h
index 02a1a4150d01aa2edd95bf980ec3c73f8379a1f1..f83b2a1a85c4fbda3383c5723fd00fb5ef0f1fc7 100644
--- a/paddle/fluid/operators/lite/ut_helper.h
+++ b/paddle/fluid/operators/lite/ut_helper.h
@@ -23,6 +23,7 @@
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/inference/analysis/helper.h"
+#include "paddle/fluid/platform/errors.h"
 
 namespace paddle {
 namespace inference {
@@ -98,7 +99,7 @@ void CreateTensor(framework::Scope* scope, const std::string& name,
 #ifdef PADDLE_WITH_CUDA
     place = platform::CUDAPlace(0);
 #else
-    PADDLE_THROW(platform::errors::PreconditionNetMet(
+    PADDLE_THROW(platform::errors::PreconditionNotMet(
         "You must define PADDLE_WITH_CUDA for using CUDAPlace."));
 #endif
   } else {
diff --git a/paddle/fluid/operators/lstm_unit_op.cu b/paddle/fluid/operators/lstm_unit_op.cu
index 7d2279f16d35c0d39bbeb59fe4bf4450eb8dd13b..810b83cb535fecc02bb7ac2e2360217229614d8b 100644
--- a/paddle/fluid/operators/lstm_unit_op.cu
+++ b/paddle/fluid/operators/lstm_unit_op.cu
@@ -24,10 +24,6 @@ https://github.com/caffe2/caffe2/blob/master/caffe2/operators/lstm_unit_op_gpu.c
 namespace paddle {
 namespace operators {
 
-#define CUDA_1D_KERNEL_LOOP(i, n)                              \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
-
 template <typename Dtype>
 __device__ Dtype cuda_sigmoid(const Dtype x) {
   return Dtype(1) / (Dtype(1) + exp(-x));
@@ -42,7 +38,7 @@ template <typename T>
 __global__ void LSTMUnitKernel(const int nthreads, const int dim,
                                const T* C_prev, const T* X, T* C, T* H,
                                const T forget_bias) {
-  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
     const int n = index / dim;
     const int d = index % dim;
 
@@ -65,7 +61,7 @@ __global__ void LSTMUnitGradientKernel(const int nthreads, const int dim,
                                        const T* C_diff, const T* H_diff,
                                        T* C_prev_diff, T* X_diff,
                                        const T forget_bias) {
-  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
     const int n = index / dim;
     const int d = index % dim;
     const T* X_offset = X + 4 * dim * n;
diff --git a/paddle/fluid/operators/math/bert_encoder_functor.cu b/paddle/fluid/operators/math/bert_encoder_functor.cu
index 9274146290d5f3be7cf1a67a53267d2e82c82ee8..59a79bcb699307b1be81a8cb54006f3daebe7fb9 100644
--- a/paddle/fluid/operators/math/bert_encoder_functor.cu
+++ b/paddle/fluid/operators/math/bert_encoder_functor.cu
@@ -75,6 +75,34 @@ __device__ inline void LayerNorm(const kvp<T> &thread_data, const int ld,
   }
 }
 
+template <typename T, typename T2, int TPB>
+__device__ inline void LayerNorm2(const kvp<T> &thread_data, const int ld,
+                                  const int offset, const float2 *bias,
+                                  const float2 *scale, T2 *output, T eps) {
+  using BlockReduce = cub::BlockReduce<kvp<T>, TPB>;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+  __shared__ T mu;      // mean
+  __shared__ T rsigma;  // 1 / std.dev.
+
+  const auto sum_kv = BlockReduce(temp_storage).Reduce(thread_data, cub::Sum());
+
+  if (threadIdx.x == 0) {
+    mu = sum_kv.key;
+    rsigma = rsqrt(sum_kv.value - mu * mu + eps);
+  }
+  __syncthreads();
+
+  for (int i = threadIdx.x; i < ld; i += TPB) {
+    const int idx = offset + i;
+    T2 val = output[idx];
+    const float2 g = scale[i];
+    const float2 b = bias[i];
+    val.x = T(g.x) * (val.x - mu) * rsigma + T(b.x);
+    val.y = T(g.y) * (val.y - mu) * rsigma + T(b.y);
+    output[idx] = val;
+  }
+}
+
 template <typename T, unsigned TPB>
 __global__ void EmbEltwiseLayernormKernel(int hidden, const int64_t *ids,
                                           const float *scale, const float *bias,
@@ -323,6 +351,27 @@ __global__ void SkipLayerNormKernel(int num, int hidden, const T *input1,
   LayerNorm<T, TPB>(thread_data, hidden, offset, bias, scale, output, eps);
 }
 
+template <typename T, typename T2, unsigned TPB>
+__global__ void SkipLayerNormKernel2(int num, int hidden, const T2 *input1,
+                                     const T2 *input2, T2 *output,
+                                     const float2 *scale, const float2 *bias,
+                                     float eps) {
+  const T rld = T(0.5f / hidden);  // because hidden is hidden/2
+  const int offset = blockIdx.x * hidden;
+  cub::Sum pair_sum;
+  kvp<T> thread_data(0, 0);
+
+  for (int it = threadIdx.x; it < hidden; it += TPB) {
+    const int idx = offset + it;
+    const T2 val2 = input1[idx] + input2[idx];
+    thread_data = pair_sum(
+        thread_data, kvp<T>(rld * (val2.x + val2.y),
+                            rld * val2.x * val2.x + rld * val2.y * val2.y));
+    output[idx] = val2;
+  }
+  LayerNorm2<T, T2, TPB>(thread_data, hidden, offset, bias, scale, output, eps);
+}
+
 template <typename T>
 void SkipLayerNormFunctor<T>::operator()(const int num, const int hidden,
                                          const T *input1, const T *input2,
@@ -344,8 +393,35 @@ void SkipLayerNormFunctor<T>::operator()(const int num, const int hidden,
         num, hidden, input1, input2, output, scale, bias, eps);
   } else {
     const int threads = 256;
-    SkipLayerNormKernel<T, threads><<<block, threads, 0, stream>>>(
-        num, hidden, input1, input2, output, scale, bias, eps);
+    if (hidden % 2 == 0) {
+#ifdef SUPPORTS_CUDA_FP16
+      if (std::is_same<T, float>::value) {
+#endif
+        SkipLayerNormKernel2<float, float2,
+                             threads><<<block, threads, 0, stream>>>(
+            num, hidden / 2, reinterpret_cast<const float2 *>(input1),
+            reinterpret_cast<const float2 *>(input2),
+            reinterpret_cast<float2 *>(output),
+            reinterpret_cast<const float2 *>(scale),
+            reinterpret_cast<const float2 *>(bias), eps);
+#ifdef SUPPORTS_CUDA_FP16
+      } else if (std::is_same<T, __half>::value) {
+        SkipLayerNormKernel2<__half, __half2,
+                             threads><<<block, threads, 0, stream>>>(
+            num, hidden / 2, reinterpret_cast<const __half2 *>(input1),
+            reinterpret_cast<const __half2 *>(input2),
+            reinterpret_cast<__half2 *>(output),
+            reinterpret_cast<const float2 *>(scale),
+            reinterpret_cast<const float2 *>(bias), eps);
+      } else {
+        assert(false);
+        // should not be here
+      }
+#endif
+    } else {
+      SkipLayerNormKernel<T, threads><<<block, threads, 0, stream>>>(
+          num, hidden, input1, input2, output, scale, bias, eps);
+    }
   }
 }
 
diff --git a/paddle/fluid/operators/math/cross_entropy.cu b/paddle/fluid/operators/math/cross_entropy.cu
index 2d871c6e14b855c01b7783bd90103a1e49c71ac2..c7fac60dd3e663088813f795352e4d751059de39 100644
--- a/paddle/fluid/operators/math/cross_entropy.cu
+++ b/paddle/fluid/operators/math/cross_entropy.cu
@@ -25,8 +25,7 @@ template <typename T>
 __global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label,
                                    const int N, const int D,
                                    const int ignore_index) {
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
-       i += blockDim.x * gridDim.x) {
+  CUDA_KERNEL_LOOP(i, N) {
     PADDLE_ENFORCE(label[i] >= 0 && label[i] < D || label[i] == ignore_index,
                    "label[%d] expected >= 0 and < %ld, or == %ld, but got "
                    "%ld. Please check input value.",
diff --git a/paddle/fluid/operators/math/math_cuda_utils.h b/paddle/fluid/operators/math/math_cuda_utils.h
index 0325717b4d3714e8eae260beb89df7f2addda88f..1149914efbca4613757b3402624dd9ce3f62625f 100644
--- a/paddle/fluid/operators/math/math_cuda_utils.h
+++ b/paddle/fluid/operators/math/math_cuda_utils.h
@@ -66,7 +66,8 @@ __device__ __forceinline__ float2 ToFloat2<float2>(float2 a) {
 }
 
 template <>
-__device__ __forceinline__ float2 FloatsToPair(const float a, const float b) {
+__device__ __forceinline__ float2 FloatsToPair<float2>(const float a,
+                                                       const float b) {
   return make_float2(a, b);
 }
 
@@ -86,7 +87,8 @@ __device__ __forceinline__ float2 ToFloat2<__half2>(__half2 a) {
 }
 
 template <>
-__device__ __forceinline__ __half2 FloatsToPair(const float a, const float b) {
+__device__ __forceinline__ __half2 FloatsToPair<__half2>(const float a,
+                                                         const float b) {
   return __floats2half2_rn(a, b);
 }
 #endif
diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu
index 235bbb57ed6f7914d568171ab55e0b9a002a3e78..fba143d017deb4b4814ad8b10e614357a7ebee23 100644
--- a/paddle/fluid/operators/math/math_function.cu
+++ b/paddle/fluid/operators/math/math_function.cu
@@ -75,8 +75,7 @@ template <typename T>
 __global__ void RowwiseAddKernel(const T* a, const T* b, T* c, int width,
                                  int num) {
   T tmp = 1.0 / width;
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
-       i += blockDim.x * gridDim.x) {
+  CUDA_KERNEL_LOOP(i, num) {
     int h = i * tmp;
     int w = i - h * width;
     c[i] = a[i] + b[w];
diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h
index 9d8d0de53a72332f25fb7f949cd2bb9cb3055fc5..7f507999fda0eb576d6d1da69da6c2e4d8a7459a 100644
--- a/paddle/fluid/operators/math/matrix_bit_code.h
+++ b/paddle/fluid/operators/math/matrix_bit_code.h
@@ -98,7 +98,7 @@ inline int clz(const T& value) {
   }
 }
 
-inline size_t FindLastSet(size_t x) { return sizeof(size_t) * 8 - clz(x); }
+inline size_t FindLastSet(size_t x) { return 1 + sizeof(size_t) * 8 - clz(x); }
 #endif  // !_WIN32
 class SimpleCode {
  public:
diff --git a/paddle/fluid/operators/mean_iou_op.cu b/paddle/fluid/operators/mean_iou_op.cu
index ada1892f43dcf33cf4db64215732189947f03579..7098a720cc3a03d1dc033d810aa2e36d6552adce 100644
--- a/paddle/fluid/operators/mean_iou_op.cu
+++ b/paddle/fluid/operators/mean_iou_op.cu
@@ -23,10 +23,6 @@ namespace operators {
 
 using platform::PADDLE_CUDA_NUM_THREADS;
 
-#define CUDA_1D_KERNEL_LOOP(i, n)                              \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
-
 template <typename T>
 __global__ void CountCUDAKernel(const int num_classes, const int count,
                                 const T* predictions, const T* labels,
@@ -42,7 +38,7 @@ __global__ void CountCUDAKernel(const int num_classes, const int count,
 
   T pred;
   T label;
-  CUDA_1D_KERNEL_LOOP(i, count) {
+  CUDA_KERNEL_LOOP(i, count) {
     pred = predictions[i];
     label = labels[i];
     if (pred == label) {
@@ -68,7 +64,7 @@ __global__ void ComputeIoUCUDAKernel(const int num_classes, int* wrong,
     valid_count_c = 0;
   }
   __syncthreads();
-  CUDA_1D_KERNEL_LOOP(i, num_classes) {
+  CUDA_KERNEL_LOOP(i, num_classes) {
     int wrong_n = wrong[i];
     int correct_n = correct[i];
     int denominator = wrong_n + correct_n;
diff --git a/paddle/fluid/operators/metrics/auc_op.cu b/paddle/fluid/operators/metrics/auc_op.cu
index 04af6c51c73a2563040d5e5ed358f592784f1221..13da4ff0857d97d61ae8d4da9b05b0f27128d94e 100644
--- a/paddle/fluid/operators/metrics/auc_op.cu
+++ b/paddle/fluid/operators/metrics/auc_op.cu
@@ -23,9 +23,6 @@ namespace operators {
 using platform::PADDLE_CUDA_NUM_THREADS;
 using Tensor = framework::Tensor;
 using LoDTensor = framework::LoDTensor;
-#define CUDA_KERNEL_LOOP(i, n)                                 \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
 
 __global__ void ClearObsoleteDataKernel(int64_t *pos, int64_t *neg,
                                         const int bucket_length,
diff --git a/paddle/fluid/operators/mish_op.cc b/paddle/fluid/operators/mish_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ea754b5b1e9413fbd28b351c13fe1da549ccfafb
--- /dev/null
+++ b/paddle/fluid/operators/mish_op.cc
@@ -0,0 +1,121 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/mish_op.h"
+#include <memory>
+#include <string>
+
+namespace paddle {
+namespace operators {
+
+class MishOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "mish");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "mish");
+
+    ctx->ShareDim("X", /*->*/ "Out");
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "X"),
+        ctx.device_context());
+  }
+};
+
+class MishOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "Input of Mish operator");
+    AddOutput("Out", "Output of Mish operator");
+    AddAttr<float>(
+        "threshold",
+        "Constant threshold of softplus in Mish operator. Approximate value "
+        "of softplus will be used if absolute value of input is greater than "
+        ":attr:`threshold`")
+        .SetDefault(20.f);
+    AddComment(R"DOC(
+Mish Activation Operator.
+
+..  math::
+    softplus = \begin{cases}
+            x, \text{if } x > \text{threshold} \\
+            e^{x}, \text{if } x < -\text{threshold} \\
+            \ln(1 + e^{x}),  \text{otherwise}
+          \end{cases}
+
+    out = x * \tanh(softplus)
+
+)DOC");
+  }
+};
+
+// The operator to calculate gradients of a prelu operator.
+class MishGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "mish");
+    OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input",
+                   "Out@GRAD", "mish");
+
+    auto x_grad_name = framework::GradVarName("X");
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, ctx->GetInputDim("X"));
+    }
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "X"),
+        ctx.device_context());
+  }
+};
+
+template <typename T>
+class MishGradOpMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> op) const override {
+    op->SetType("mish_grad");
+    op->SetInput("X", this->Input("X"));
+    op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
+    op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
+    op->SetAttrMap(this->Attrs());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(mish, ops::MishOp, ops::MishOpMaker,
+                  ops::MishGradOpMaker<paddle::framework::OpDesc>,
+                  ops::MishGradOpMaker<paddle::imperative::OpBase>);
+REGISTER_OPERATOR(mish_grad, ops::MishGradOp);
+REGISTER_OP_CPU_KERNEL(
+    mish, ops::MishFP32CPUKernel<paddle::platform::CPUDeviceContext>,
+    ops::MishCPUKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    mish_grad, ops::MishGradFP32CPUKernel<paddle::platform::CPUDeviceContext>,
+    ops::MishGradCPUKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/mish_op.cu b/paddle/fluid/operators/mish_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..77817e526e13d0618fbbfea313fe1d4c28cd582d
--- /dev/null
+++ b/paddle/fluid/operators/mish_op.cu
@@ -0,0 +1,173 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/mish_op.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+#include "paddle/fluid/platform/gpu_launch_config.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename T>
+__global__ void KeMishFw(const T* in, T* out, const int numel,
+                         const float threshold) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  for (; tid < numel; tid += stride) {
+    T x = in[tid];
+    T sp = CalcSoftplus<T>(x, threshold);
+    out[tid] = x * tanh(sp);
+  }
+}
+
+// expf instead of exp should be used for float type, complement
+// and register float kernel separatelly
+__global__ void KeMishFwFP32(const float* in, float* out, const int numel,
+                             const float threshold) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  for (; tid < numel; tid += stride) {
+    float x = in[tid];
+    float sp = CalcSoftplusFP32(x, threshold);
+    out[tid] = x * tanhf(sp);
+  }
+}
+
+template <typename T>
+__global__ void KeMishBw(const T* in, const T* dout, T* din, const int numel,
+                         const float threshold) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  for (; tid < numel; tid += stride) {
+    T x = in[tid];
+    T sp = CalcSoftplus<T>(x, threshold);
+    T tsp = tanh(sp);
+    T grad_sp = -expm1(-sp);
+    T grad_tsp = (static_cast<T>(1) - tsp * tsp) * grad_sp;
+    din[tid] = dout[tid] * (x * grad_tsp + tsp);
+  }
+}
+
+__global__ void KeMishBwFP32(const float* in, const float* dout, float* din,
+                             const int numel, const float threshold) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  for (; tid < numel; tid += stride) {
+    float x = in[tid];
+    float sp = CalcSoftplusFP32(x, threshold);
+    float tsp = tanhf(sp);
+    float grad_sp = -expm1f(-sp);
+    float grad_tsp = (static_cast<float>(1) - tsp * tsp) * grad_sp;
+    din[tid] = dout[tid] * (x * grad_tsp + tsp);
+  }
+}
+
+template <typename DeviceContext, typename T>
+class MishCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<Tensor>("X");
+    auto* out = ctx.Output<Tensor>("Out");
+
+    const float threshold = ctx.Attr<float>("threshold");
+
+    const T* x_data = x->data<T>();
+    T* out_data = out->mutable_data<T>(ctx.GetPlace());
+
+    const int numel = x->numel();
+
+    platform::GpuLaunchConfig config = platform::getGpuLaunchConfig(numel, ctx);
+    KeMishFw<T><<<config.blocks, config.threads, 0,
+                  ctx.cuda_device_context().stream()>>>(x_data, out_data, numel,
+                                                        threshold);
+  }
+};
+
+template <typename DeviceContext>
+class MishFP32CUDAKernel : public framework::OpKernel<float> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<Tensor>("X");
+    auto* out = ctx.Output<Tensor>("Out");
+
+    const float threshold = ctx.Attr<float>("threshold");
+
+    const float* x_data = x->data<float>();
+    float* out_data = out->mutable_data<float>(ctx.GetPlace());
+
+    const int numel = x->numel();
+
+    platform::GpuLaunchConfig config = platform::getGpuLaunchConfig(numel, ctx);
+    KeMishFwFP32<<<config.blocks, config.threads, 0,
+                   ctx.cuda_device_context().stream()>>>(x_data, out_data,
+                                                         numel, threshold);
+  }
+};
+
+template <typename DeviceContext, typename T>
+class MishGradCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<Tensor>("X");
+    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+
+    auto threshold = ctx.Attr<float>("threshold");
+
+    const T* x_data = x->data<T>();
+    const T* dout_data = dout->data<T>();
+    T* dx_data = dx->mutable_data<T>(ctx.GetPlace());
+
+    const int numel = x->numel();
+
+    platform::GpuLaunchConfig config = platform::getGpuLaunchConfig(numel, ctx);
+    KeMishBw<T><<<config.blocks, config.threads, 0,
+                  ctx.cuda_device_context().stream()>>>(
+        x_data, dout_data, dx_data, numel, threshold);
+  }
+};
+
+template <typename DeviceContext>
+class MishGradFP32CUDAKernel : public framework::OpKernel<float> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<Tensor>("X");
+    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+
+    auto threshold = ctx.Attr<float>("threshold");
+
+    const float* x_data = x->data<float>();
+    const float* dout_data = dout->data<float>();
+    float* dx_data = dx->mutable_data<float>(ctx.GetPlace());
+
+    const int numel = x->numel();
+
+    platform::GpuLaunchConfig config = platform::getGpuLaunchConfig(numel, ctx);
+    KeMishBwFP32<<<config.blocks, config.threads, 0,
+                   ctx.cuda_device_context().stream()>>>(
+        x_data, dout_data, dx_data, numel, threshold);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    mish, ops::MishFP32CUDAKernel<paddle::platform::CUDADeviceContext>,
+    ops::MishCUDAKernel<paddle::platform::CUDADeviceContext, double>)
+REGISTER_OP_CUDA_KERNEL(
+    mish_grad, ops::MishGradFP32CUDAKernel<paddle::platform::CUDADeviceContext>,
+    ops::MishGradCUDAKernel<paddle::platform::CUDADeviceContext, double>)
diff --git a/paddle/fluid/operators/mish_op.h b/paddle/fluid/operators/mish_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..86ccb57d929e5dec72fe67185530478109e2d7f0
--- /dev/null
+++ b/paddle/fluid/operators/mish_op.h
@@ -0,0 +1,137 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/fluid/framework/op_registry.h"
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename T>
+HOSTDEVICE static T CalcSoftplus(T x, float threshold) {
+  if (threshold > 0 && x > threshold) {
+    return x;
+  } else if (threshold > 0 && x < -threshold) {
+    return exp(x);
+  } else {
+    return log1p(exp(x));
+  }
+}
+
+// expf instead of exp should be used for float type, complement
+// and register float kernel separatelly
+HOSTDEVICE static float CalcSoftplusFP32(float x, float threshold) {
+  if (threshold > 0 && x > threshold) {
+    return x;
+  } else if (threshold > 0 && x < -threshold) {
+    return expf(x);
+  } else {
+    return log1pf(expf(x));
+  }
+}
+
+template <typename DeviceContext, typename T>
+class MishCPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<Tensor>("X");
+    auto* out = ctx.Output<Tensor>("Out");
+
+    const float threshold = ctx.Attr<float>("threshold");
+
+    const T* x_data = x->data<T>();
+    T* out_data = out->mutable_data<T>(ctx.GetPlace());
+
+    int numel = x->numel();
+    for (int i = 0; i < numel; i++) {
+      T x_d = x_data[i];
+      T sp = CalcSoftplus<T>(x_d, threshold);
+      out_data[i] = x_d * std::tanh(sp);
+    }
+  }
+};
+
+template <typename DeviceContext>
+class MishFP32CPUKernel : public framework::OpKernel<float> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<Tensor>("X");
+    auto* out = ctx.Output<Tensor>("Out");
+
+    const float threshold = ctx.Attr<float>("threshold");
+
+    const float* x_data = x->data<float>();
+    float* out_data = out->mutable_data<float>(ctx.GetPlace());
+
+    int numel = x->numel();
+    for (int i = 0; i < numel; i++) {
+      float x_d = x_data[i];
+      float sp = CalcSoftplusFP32(x_d, threshold);
+      out_data[i] = x_d * std::tanh(sp);
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class MishGradCPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<Tensor>("X");
+    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+
+    auto threshold = ctx.Attr<float>("threshold");
+
+    const T* x_data = x->data<T>();
+    const T* dout_data = dout->data<T>();
+    T* dx_data = dx->mutable_data<T>(ctx.GetPlace());
+
+    int numel = x->numel();
+    for (int i = 0; i < numel; i++) {
+      T x_d = x_data[i];
+      T sp = CalcSoftplus<T>(x_d, threshold);
+      T tsp = std::tanh(sp);
+      T grad_sp = -std::expm1(-sp);
+      T grad_tsp = (static_cast<T>(1) - tsp * tsp) * grad_sp;
+      dx_data[i] = dout_data[i] * (x_d * grad_tsp + tsp);
+    }
+  }
+};
+
+template <typename DeviceContext>
+class MishGradFP32CPUKernel : public framework::OpKernel<float> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<Tensor>("X");
+    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+
+    auto threshold = ctx.Attr<float>("threshold");
+
+    const float* x_data = x->data<float>();
+    const float* dout_data = dout->data<float>();
+    float* dx_data = dx->mutable_data<float>(ctx.GetPlace());
+
+    int numel = x->numel();
+    for (int i = 0; i < numel; i++) {
+      float x_d = x_data[i];
+      float sp = CalcSoftplusFP32(x_d, threshold);
+      float tsp = std::tanh(sp);
+      float grad_sp = -std::expm1f(-sp);
+      float grad_tsp = (static_cast<float>(1) - tsp * tsp) * grad_sp;
+      dx_data[i] = dout_data[i] * (x_d * grad_tsp + tsp);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
index ac6ddebb813fab2bc5d1c1faaaa8d96bbc22dbd4..17e1e1958346155af32cf75b5e9fc25cdbdd91eb 100644
--- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
@@ -943,7 +943,7 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     const std::string key = platform::CreateKey(
         src_tz, ctx.InputName("Input") + ctx.InputName("Filter"));
 
-    const std::string key_conv_pd = key + "@forward_pd";
+    const std::string key_conv_pd = key + "@fwd_pd";
     std::vector<primitive> pipeline;
 
     // Create user memory descriptors
diff --git a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc
index 4bfaeb41ee8716baeb850009a6833eb2e4d95ff9..00c10cecbf4828d2157505abca49763b6ded1b16 100644
--- a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc
@@ -59,15 +59,6 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
                                {MKLDNN_ARG_DST, *dst_memory},
                                {MKLDNN_ARG_WORKSPACE, *workspace_memory}});
     } else {
-      // mid has to be allocated and filled
-      // k to pass LRN unit tests
-      // TODO(jczaja): Disable checking mid in unit tests (Require API change)
-      mid->mutable_data<T>(ctx.GetPlace());
-      auto e_mid = framework::EigenTensor<T, 4>::From(*mid);
-      const float k = ctx.Attr<float>("k");
-      e_mid = e_mid.constant(k);
-      mid->set_format(platform::GetMKLDNNFormat(*dst_memory));
-
       lrn_p->execute(astream, {{MKLDNN_ARG_SRC, *src_memory},
                                {MKLDNN_ARG_DST, *dst_memory}});
     }
@@ -85,7 +76,7 @@ class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     const bool is_float_type = std::is_same<T, float>::value;
     PADDLE_ENFORCE_EQ(is_float_type, true,
                       platform::errors::PreconditionNotMet(
-                          "DNNL LRN GradOpKernl must use float data."));
+                          "DNNL LRN GradOpKernel must use float data."));
     PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true,
                       paddle::platform::errors::PreconditionNotMet(
                           "Operator DNNL LRNGrad must use CPUPlace"));
diff --git a/paddle/fluid/operators/nll_loss_op.cu b/paddle/fluid/operators/nll_loss_op.cu
index 7b37239a339ecde8f1f01631c6b3f08a693e8b7f..3d618805f02aa9b6d5310bfc8a79857f522f8ac5 100644
--- a/paddle/fluid/operators/nll_loss_op.cu
+++ b/paddle/fluid/operators/nll_loss_op.cu
@@ -31,10 +31,6 @@ static inline int NumBlocks(const int N) {
                   kNumMaxinumNumBlocks);
 }
 
-#define CUDA_1D_KERNEL_LOOP(i, n)                              \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
-
 template <typename T>
 __global__ void GPUNLLLossForward1D_no_reduce(T* out_data, const T* x_data,
                                               const int64_t* label_data,
@@ -42,7 +38,7 @@ __global__ void GPUNLLLossForward1D_no_reduce(T* out_data, const T* x_data,
                                               const int64_t batch_size,
                                               const int64_t n_classes,
                                               const int64_t ignore_index) {
-  CUDA_1D_KERNEL_LOOP(i, batch_size) {
+  CUDA_KERNEL_LOOP(i, batch_size) {
     const int64_t cur_label = label_data[i];
     if (cur_label == ignore_index) {
       out_data[i] = 0;
@@ -191,7 +187,7 @@ __global__ void GPUNLLLossForward2D_no_reduce(
   const int64_t map_size = in_dim2 * in_dim3;
   const int64_t sample_size = n_classes * map_size;
   const int64_t out_numel = batch_size * map_size;
-  CUDA_1D_KERNEL_LOOP(i, out_numel) {
+  CUDA_KERNEL_LOOP(i, out_numel) {
     const int64_t b = i % batch_size;
     const int64_t h = (i / batch_size) % in_dim2;
     const int64_t w = (i / (batch_size * in_dim2)) % in_dim3;
@@ -261,7 +257,7 @@ __global__ void GPUNLLLossBackward1D_no_reduce(
     T* dx_data, const int64_t* label_data, const T* weight_data,
     const T* dout_data, const int64_t batch_size, const int64_t n_classes,
     const int64_t ignore_index) {
-  CUDA_1D_KERNEL_LOOP(i, batch_size) {
+  CUDA_KERNEL_LOOP(i, batch_size) {
     const int64_t cur_label = label_data[i];
     if (cur_label == ignore_index) {
       continue;
@@ -299,7 +295,7 @@ __global__ void GPUNLLLossBackward2D_no_reduce(
   const int64_t map_size = in_dim2 * in_dim3;
   const int64_t sample_size = n_classes * map_size;
   const int64_t out_numel = batch_size * map_size;
-  CUDA_1D_KERNEL_LOOP(i, out_numel) {
+  CUDA_KERNEL_LOOP(i, out_numel) {
     const int64_t b = i % batch_size;
     const int64_t h = (i / batch_size) % in_dim2;
     const int64_t w = (i / (batch_size * in_dim2)) % in_dim3;
diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.cu b/paddle/fluid/operators/optimizers/lars_momentum_op.cu
index a277d6ff2bea917addac8c6ea4b24b63dcbc8dba..1dace4ed6ab3e17b348035e34f6d9ea6d31edae9 100644
--- a/paddle/fluid/operators/optimizers/lars_momentum_op.cu
+++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cu
@@ -26,8 +26,7 @@ __global__ void MomentumLarsKernel(const T* p, const T* g, const T* v,
                                    const T* g_norm, T* p_out, T* v_out) {
   T lr = learning_rate[0];
   T local_lr = learning_rate[0];
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
-       i += blockDim.x * gridDim.x) {
+  CUDA_KERNEL_LOOP(i, num) {
     if (p_norm[0] > 0 && g_norm[0] > 0) {
       local_lr = lr * lars_coeff * p_norm[0] /
                  (g_norm[0] + lars_weight_decay * p_norm[0]);
diff --git a/paddle/fluid/operators/optimizers/sgd_op.cu b/paddle/fluid/operators/optimizers/sgd_op.cu
index 96eb51903f015478e02e7bd8d9dd8cfcc5d93ee2..b70f24e0e5e8f2f6c6ac974942ccd4c4c3ad41bb 100644
--- a/paddle/fluid/operators/optimizers/sgd_op.cu
+++ b/paddle/fluid/operators/optimizers/sgd_op.cu
@@ -25,8 +25,7 @@ template <typename T>
 __global__ void SGDKernel(const T* g, const T* p, const T* learning_rate,
                           const int num, T* p_out) {
   T lr = learning_rate[0];
-  int grid_size = blockDim.x * gridDim.x;
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num; i += grid_size) {
+  CUDA_KERNEL_LOOP(i, num) {
     T g_data = g[i];
     T p_data = p[i];
     p_out[i] = p_data - lr * g_data;
diff --git a/paddle/fluid/operators/pad2d_op.cu b/paddle/fluid/operators/pad2d_op.cu
index c05d778fb29c88c69ba389fdb1a9b024cf237af2..a77d0a5650ef3271ce0f5a46e0e5c6d2e1fef37d 100644
--- a/paddle/fluid/operators/pad2d_op.cu
+++ b/paddle/fluid/operators/pad2d_op.cu
@@ -23,10 +23,6 @@ namespace operators {
 
 using platform::PADDLE_CUDA_NUM_THREADS;
 
-#define CUDA_1D_KERNEL_LOOP(i, n)                              \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
-
 using framework::Tensor;
 
 template <typename T>
@@ -36,7 +32,7 @@ __global__ void Pad2DConstNCHW(const int nthreads, const T* in_data,
                                const int out_height, const int out_width,
                                const int pad_top, const int pad_left, T value,
                                T* out_data) {
-  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
     int nc = index / out_width;
     const int out_w = index % out_width;
     const int out_h = nc % out_height;
@@ -57,7 +53,7 @@ __global__ void Pad2DConstNHWC(const int nthreads, const T* in_data,
                                const int out_height, const int out_width,
                                const int pad_top, const int pad_left, T value,
                                T* out_data) {
-  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
     int n = index / channels;
     const int c = index % channels;
     const int out_w = n % out_width;
@@ -81,7 +77,7 @@ __global__ void Pad2DReflectNCHW(const int nthreads, const T* in_data,
                                  const int out_height, const int out_width,
                                  const int pad_top, const int pad_left,
                                  T* out_data) {
-  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
     int nc = index / out_width;
     const int out_w = index % out_width;
     const int out_h = nc % out_height;
@@ -103,7 +99,7 @@ __global__ void Pad2DReflectNHWC(const int nthreads, const T* in_data,
                                  const int out_height, const int out_width,
                                  const int pad_top, const int pad_left,
                                  T* out_data) {
-  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
     int n = index / channels;
     const int c = index % channels;
     const int out_w = n % out_width;
@@ -128,7 +124,7 @@ __global__ void Pad2DEdgeNCHW(const int nthreads, const T* in_data,
                               const int out_height, const int out_width,
                               const int pad_top, const int pad_left,
                               T* out_data) {
-  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
     int nc = index / out_width;
     const int out_w = index % out_width;
     const int out_h = nc % out_height;
@@ -146,7 +142,7 @@ __global__ void Pad2DEdgeNHWC(const int nthreads, const T* in_data,
                               const int out_height, const int out_width,
                               const int pad_top, const int pad_left,
                               T* out_data) {
-  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+  CUDA_KERNEL_LOOP(index, nthreads) {
     int n = index / channels;
     const int c = index % channels;
     const int out_w = n % out_width;
@@ -167,7 +163,7 @@ __global__ void Pad2DGradConstNCHW(const int in_size, T* d_in_data,
                                    const int out_height, const int out_width,
                                    const int pad_top, const int pad_left,
                                    const T* d_out_data) {
-  CUDA_1D_KERNEL_LOOP(in_index, in_size) {
+  CUDA_KERNEL_LOOP(in_index, in_size) {
     int nc = in_index / in_width;
     const int out_w = in_index % in_width + pad_left;
     const int out_h = nc % in_height + pad_top;
@@ -184,7 +180,7 @@ __global__ void Pad2DGradConstNHWC(const int in_size, T* d_in_data,
                                    const int out_height, const int out_width,
                                    const int pad_top, const int pad_left,
                                    const T* d_out_data) {
-  CUDA_1D_KERNEL_LOOP(in_index, in_size) {
+  CUDA_KERNEL_LOOP(in_index, in_size) {
     int n = in_index / channels;
     const int c = in_index % channels;
     const int out_w = n % in_width + pad_left;
@@ -204,7 +200,7 @@ __global__ void Pad2DGradReflectNCHW(const int out_size, T* d_in_data,
                                      const int out_height, const int out_width,
                                      const int pad_top, const int pad_left,
                                      const T* d_out_data) {
-  CUDA_1D_KERNEL_LOOP(out_index, out_size) {
+  CUDA_KERNEL_LOOP(out_index, out_size) {
     int nc = out_index / out_width;
     const int out_w = out_index % out_width;
     const int out_h = nc % out_height;
@@ -228,7 +224,7 @@ __global__ void Pad2DGradReflectNHWC(const int out_size, T* d_in_data,
                                      const int out_height, const int out_width,
                                      const int pad_top, const int pad_left,
                                      const T* d_out_data) {
-  CUDA_1D_KERNEL_LOOP(out_index, out_size) {
+  CUDA_KERNEL_LOOP(out_index, out_size) {
     const int c = out_index % channels;
     int n = out_index / channels;
     const int out_w = n % out_width;
@@ -254,7 +250,7 @@ __global__ void Pad2DGradEdgeNCHW(const int out_size, T* d_in_data,
                                   const int out_height, const int out_width,
                                   const int pad_top, const int pad_left,
                                   const T* d_out_data) {
-  CUDA_1D_KERNEL_LOOP(out_index, out_size) {
+  CUDA_KERNEL_LOOP(out_index, out_size) {
     int nc = out_index / out_width;
     const int out_w = out_index % out_width;
     const int out_h = nc % out_height;
@@ -274,7 +270,7 @@ __global__ void Pad2DGradEdgeNHWC(const int out_size, T* d_in_data,
                                   const int out_height, const int out_width,
                                   const int pad_top, const int pad_left,
                                   const T* d_out_data) {
-  CUDA_1D_KERNEL_LOOP(out_index, out_size) {
+  CUDA_KERNEL_LOOP(out_index, out_size) {
     const int c = out_index % channels;
     int n = out_index / channels;
     const int out_w = n % out_width;
diff --git a/paddle/fluid/operators/prelu_op.cu b/paddle/fluid/operators/prelu_op.cu
index 2e51b00b98052ccce4d56fb8c3ac9fb3e53f87b2..2f61c53f877d5fc89c89dc6d51229d127a1eb48c 100644
--- a/paddle/fluid/operators/prelu_op.cu
+++ b/paddle/fluid/operators/prelu_op.cu
@@ -25,11 +25,6 @@ using Tensor = framework::Tensor;
 
 #define CUDA_NUM_THREADS 1024
 
-// CUDA: grid stride looping
-#define CUDA_KERNEL_LOOP(i, n)                                 \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
-
 inline static int PADDLE_GET_BLOCKS(const int N) {
   return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
 }
diff --git a/paddle/fluid/operators/randperm_op.cc b/paddle/fluid/operators/randperm_op.cc
index 919a05a0d992602f442511f50502525e987c0251..deafd651e90089542d2f50eea638ca8058d09c58 100644
--- a/paddle/fluid/operators/randperm_op.cc
+++ b/paddle/fluid/operators/randperm_op.cc
@@ -92,4 +92,5 @@ template <typename T>
 using kernel =
     paddle::operators::RandpermKernel<paddle::platform::CPUDeviceContext, T>;
 
-REGISTER_OP_CPU_KERNEL(randperm, kernel<int64_t>, kernel<int>);
+REGISTER_OP_CPU_KERNEL(randperm, kernel<int64_t>, kernel<int>, kernel<float>,
+                       kernel<double>);
diff --git a/paddle/fluid/operators/randperm_op.cu b/paddle/fluid/operators/randperm_op.cu
index 21ae1a4968a7e1fd9fd8aee3a12ea71c42a74d46..7ed52a8fd25b104f50446082ff3a040e90bf44ea 100644
--- a/paddle/fluid/operators/randperm_op.cu
+++ b/paddle/fluid/operators/randperm_op.cu
@@ -20,4 +20,5 @@ template <typename T>
 using kernel =
     paddle::operators::RandpermKernel<paddle::platform::CUDADeviceContext, T>;
 
-REGISTER_OP_CUDA_KERNEL(randperm, kernel<int64_t>, kernel<int>);
+REGISTER_OP_CUDA_KERNEL(randperm, kernel<int64_t>, kernel<int>, kernel<float>,
+                        kernel<double>);
diff --git a/paddle/fluid/operators/range_op.cu b/paddle/fluid/operators/range_op.cu
index e2c03716d55ee41ce3a9053b48b5c6d4c70e391f..c527bc74eee93fe1a69ae82d8c3fc674406f35e5 100644
--- a/paddle/fluid/operators/range_op.cu
+++ b/paddle/fluid/operators/range_op.cu
@@ -19,13 +19,9 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-#define CUDA_1D_KERNEL_LOOP(i, n)                              \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
-
 template <typename T>
 __global__ void RangeKernel(T start, T step, int64_t size, T* out) {
-  CUDA_1D_KERNEL_LOOP(index, size) { out[index] = start + step * index; }
+  CUDA_KERNEL_LOOP(index, size) { out[index] = start + step * index; }
 }
 
 template <typename T>
diff --git a/paddle/fluid/operators/rank_attention.cu.h b/paddle/fluid/operators/rank_attention.cu.h
index 9de3de241dc2e2dfa48fa6c1677a0dce0cafe358..27fe67e73cde0e7811271b57d9ff9eeaabec411e 100644
--- a/paddle/fluid/operators/rank_attention.cu.h
+++ b/paddle/fluid/operators/rank_attention.cu.h
@@ -19,10 +19,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-#define CUDA_KERNEL_LOOP(i, n)                                 \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
-
 const int CUDA_NUM_THREADS = 1024;
 static inline int GET_BLOCKS(const int N) {
   return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
diff --git a/paddle/fluid/operators/roi_align_op.cu b/paddle/fluid/operators/roi_align_op.cu
index 4c868d22c78f60352e96485515fd63f43b5826ca..f7ec13e5bccd63d2f6552ed52f8d709a57320ddd 100644
--- a/paddle/fluid/operators/roi_align_op.cu
+++ b/paddle/fluid/operators/roi_align_op.cu
@@ -31,10 +31,6 @@ static inline int NumBlocks(const int N) {
                   kNumMaxinumNumBlocks);
 }
 
-#define CUDA_1D_KERNEL_LOOP(i, n)                              \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
-
 template <class T>
 __device__ T BilinearInterpolate(const T* input_data, const int height,
                                  const int width, T y, T x) {
@@ -110,7 +106,7 @@ __global__ void GPUROIAlignForward(
     const float spatial_scale, const int channels, const int height,
     const int width, const int pooled_height, const int pooled_width,
     const int sampling_ratio, int* roi_batch_id_data, T* output_data) {
-  CUDA_1D_KERNEL_LOOP(i, nthreads) {
+  CUDA_KERNEL_LOOP(i, nthreads) {
     int pw = i % pooled_width;
     int ph = (i / pooled_width) % pooled_height;
     int c = (i / pooled_width / pooled_height) % channels;
@@ -165,7 +161,7 @@ __global__ void GPUROIAlignBackward(const int nthreads, const T* input_rois,
                                     const int pooled_width,
                                     const int sampling_ratio,
                                     int* roi_batch_id_data, T* input_grad) {
-  CUDA_1D_KERNEL_LOOP(i, nthreads) {
+  CUDA_KERNEL_LOOP(i, nthreads) {
     int pw = i % pooled_width;
     int ph = (i / pooled_width) % pooled_height;
     int c = (i / pooled_width / pooled_height) % channels;
diff --git a/paddle/fluid/operators/roll_op.cc b/paddle/fluid/operators/roll_op.cc
index d28feb257ef147d3e951599f6550f37402c0cbf2..f470f41f1eb5c9d08af7802f943b3a1e54f30939 100644
--- a/paddle/fluid/operators/roll_op.cc
+++ b/paddle/fluid/operators/roll_op.cc
@@ -33,7 +33,7 @@ class RollOp : public framework::OperatorWithKernel {
                       platform::errors::InvalidArgument(
                           "Output(Out) of RollOp should not be null."));
 
-    auto dims = ctx->Attrs().Get<std::vector<int64_t>>("dims");
+    auto dims = ctx->Attrs().Get<std::vector<int64_t>>("axis");
     auto shifts = ctx->Attrs().Get<std::vector<int64_t>>("shifts");
 
     PADDLE_ENFORCE_EQ(dims.size(), shifts.size(),
@@ -92,7 +92,7 @@ class RollOpMaker : public framework::OpProtoAndCheckerMaker {
                                   "of the tensor are shifted.")
         .SetDefault({});
     AddAttr<std::vector<int64_t>>(
-        "dims",
+        "axis",
         "Axis along which to roll. It must have the same size "
         "with shifts.")
         .SetDefault({});
diff --git a/paddle/fluid/operators/roll_op.h b/paddle/fluid/operators/roll_op.h
index fbc277433fc56bb58d384f26726349a3a63c372b..74dd37ed8388fe495cf5bf6cc859dd899fdd87dd 100644
--- a/paddle/fluid/operators/roll_op.h
+++ b/paddle/fluid/operators/roll_op.h
@@ -82,7 +82,7 @@ class RollKernel : public framework::OpKernel<T> {
     auto& input = input_var->Get<LoDTensor>();
     auto* output = output_var->GetMutable<LoDTensor>();
     std::vector<int64_t> shifts = context.Attr<std::vector<int64_t>>("shifts");
-    std::vector<int64_t> dims = context.Attr<std::vector<int64_t>>("dims");
+    std::vector<int64_t> dims = context.Attr<std::vector<int64_t>>("axis");
 
     std::vector<T> out_vec;
     TensorToVector(input, context.device_context(), &out_vec);
@@ -94,8 +94,8 @@ class RollKernel : public framework::OpKernel<T> {
       PADDLE_ENFORCE_EQ(
           dims[i] < input_dim.size() && dims[i] >= (0 - input_dim.size()), true,
           platform::errors::OutOfRange(
-              "Attr(dims[%d]) is out of range, It's expected "
-              "to be in range of [-%d, %d]. But received Attr(dims[%d]) = %d.",
+              "Attr(axis[%d]) is out of range, It's expected "
+              "to be in range of [-%d, %d]. But received Attr(axis[%d]) = %d.",
               i, input_dim.size(), input_dim.size() - 1, i, dims[i]));
       shift_along_dim(out_vec.data(), input_dim, dims[i], shifts[i]);
     }
@@ -114,7 +114,7 @@ class RollGradKernel : public framework::OpKernel<T> {
     auto& input = input_var->Get<LoDTensor>();
     auto* output = output_var->GetMutable<LoDTensor>();
     std::vector<int64_t> shifts = context.Attr<std::vector<int64_t>>("shifts");
-    std::vector<int64_t> dims = context.Attr<std::vector<int64_t>>("dims");
+    std::vector<int64_t> dims = context.Attr<std::vector<int64_t>>("axis");
 
     std::vector<T> out_vec;
     TensorToVector(input, context.device_context(), &out_vec);
diff --git a/paddle/fluid/operators/run_program_op.h b/paddle/fluid/operators/run_program_op.h
index 505ce4c09681d3405227b0e2e8b8b1209a3d359f..830334043c4d703e7fafbb1565bd896da0264a16 100644
--- a/paddle/fluid/operators/run_program_op.h
+++ b/paddle/fluid/operators/run_program_op.h
@@ -17,10 +17,12 @@ limitations under the License. */
 #include <algorithm>
 #include <iterator>
 #include <string>
+#include <unordered_set>
 #include <utility>
 #include <vector>
 
 #include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
@@ -47,13 +49,13 @@ static void CheckInputVarStatus(const Variable &var,
       var.IsType<LoDTensor>(), true,
       platform::errors::InvalidArgument(
           "The input variable %s of "
-          "RunProgram(Grad)Op(StaticModelRunner) holds "
+          "RunProgram(Grad)Op holds "
           "wrong type. Expect type is LoDTensor, but receive type is %s.",
           var_name, platform::demangle(framework::ToTypeName(var.Type()))));
   PADDLE_ENFORCE_EQ(
       var.Get<LoDTensor>().IsInitialized(), true,
       platform::errors::InvalidArgument("The tensor in input variable %s of "
-                                        "RunProgram(Grad)Op(StaticModelRunner) "
+                                        "RunProgram(Grad)Op "
                                         "is not initialized.",
                                         var_name));
 }
@@ -66,14 +68,14 @@ static void CheckOutputVarStatus(const Variable &src_var,
         src_var.IsType<LoDTensor>(), true,
         platform::errors::InvalidArgument(
             "The output variable %s get from "
-            "RunProgram(Grad)Op(StaticModelRunner)'s internal scope holds "
+            "RunProgram(Grad)Op's internal scope holds "
             "wrong type. Expect type is LoDTensor, but receive type is %s.",
             var_name,
             platform::demangle(framework::ToTypeName(src_var.Type()))));
     PADDLE_ENFORCE_EQ(src_var.Get<LoDTensor>().IsInitialized(), true,
                       platform::errors::InvalidArgument(
                           "The tensor in output variable %s get from "
-                          "RunProgram(Grad)Op(StaticModelRunner)'s internal "
+                          "RunProgram(Grad)Op's internal "
                           "scope is not initialized.",
                           var_name));
   } else if (dst_var.IsType<SelectedRows>()) {
@@ -81,20 +83,20 @@ static void CheckOutputVarStatus(const Variable &src_var,
         src_var.IsType<SelectedRows>(), true,
         platform::errors::InvalidArgument(
             "The output variable %s get from "
-            "RunProgram(Grad)Op(StaticModelRunner)'s internal scope holds "
+            "RunProgram(Grad)Op's internal scope holds "
             "wrong type. Expect type is SelectedRows, but receive type is %s.",
             var_name,
             platform::demangle(framework::ToTypeName(src_var.Type()))));
     PADDLE_ENFORCE_EQ(src_var.Get<SelectedRows>().value().IsInitialized(), true,
                       platform::errors::InvalidArgument(
                           "The tensor in output variable %s get from "
-                          "RunProgram(Grad)Op(StaticModelRunner)'s "
+                          "RunProgram(Grad)Op's "
                           "internal scope is not initialized.",
                           var_name));
 
   } else {
     PADDLE_THROW(platform::errors::InvalidArgument(
-        "The RunProgram(Grad)Op(StaticModelRunner) only support output "
+        "The RunProgram(Grad)Op only support output "
         "variable of type LoDTensor or SelectedRows, "
         "but received variable %s's type is %s",
         var_name, platform::demangle(framework::ToTypeName(dst_var.Type()))));
@@ -141,7 +143,7 @@ static void ShareVarsFromScope(const std::vector<Variable *> &vars,
     auto *var = scope->FindVar(var_names[i]);
     PADDLE_ENFORCE_NOT_NULL(
         var, platform::errors::NotFound("The output variable %s is not in "
-                                        "RunProgram(Grad)Op(StaticModelRunner)'"
+                                        "RunProgram(Grad)Op'"
                                         "s internal scope.",
                                         var_names[i]));
     CheckOutputVarStatus(*var, *vars[i], var_names[i]);
@@ -149,14 +151,46 @@ static void ShareVarsFromScope(const std::vector<Variable *> &vars,
   }
 }
 
-static void AppendSkipDeletionVars(
-    std::vector<std::string> *all_vars,
-    const std::vector<std::string> &append_vars) {
+static void AppendSkipDeletionVars(const std::vector<std::string> &append_vars,
+                                   std::vector<std::string> *all_vars) {
   for (auto &var : append_vars) {
     all_vars->emplace_back(var);
   }
 }
 
+static void AppendSafeEagerDeletionSkipVars(
+    const framework::ProgramDesc &program,
+    std::vector<std::string> *skip_vars) {
+  const framework::BlockDesc &block = program.Block(0);
+  const std::vector<framework::OpDesc *> &all_ops = block.AllOps();
+
+  std::unordered_set<std::string> grad_op_output;
+  std::unordered_set<std::string> grad_op_input;
+  for (const framework::OpDesc *op : all_ops) {
+    int op_role = BOOST_GET_CONST(
+        int, op->GetAttr(framework::OpProtoAndCheckerMaker::OpRoleAttrName()));
+    if ((op_role & static_cast<int>(framework::OpRole::kBackward)) == 0) {
+      continue;
+    }
+
+    for (const std::string &in_arg_name : op->InputArgumentNames()) {
+      grad_op_input.emplace(in_arg_name);
+    }
+    for (const std::string &out_arg_name : op->OutputArgumentNames()) {
+      grad_op_output.emplace(out_arg_name);
+    }
+  }
+
+  // For the grad op input variables, if it is not output of grad_op, it may
+  // be output of forward op and we should set the variables as skip_var to
+  // prevent it being deleted when grad op is called multiple times.
+  for (const std::string &var_name : grad_op_input) {
+    if (grad_op_output.find(var_name) == grad_op_output.end()) {
+      skip_vars->emplace_back(var_name);
+    }
+  }
+}
+
 }  // namespace details
 
 template <typename DeviceContext, typename T>
@@ -192,7 +226,7 @@ class RunProgramOpKernel : public framework::OpKernel<T> {
 
     // skip delete vars
     std::vector<std::string> skip_vars;
-    details::AppendSkipDeletionVars(&skip_vars, output_var_names);
+    details::AppendSkipDeletionVars(output_var_names, &skip_vars);
     VLOG(2) << "Prepare to skip " << skip_vars.size()
             << " var(s): " << string::join_strings(skip_vars, ' ');
 
@@ -261,20 +295,21 @@ class RunProgramGradOpKernel : public framework::OpKernel<T> {
         out_scope_vec->size(), 1,
         platform::errors::InvalidArgument(
             "The OutScope of RunProgramGradOp should only hold one scope."));
+    auto &scope = *(out_scope_vec->front());
 
     // Step 2. prepare executor and scope
     framework::Executor exe(ctx.GetPlace());
 
     // skip delete vars
     std::vector<std::string> skip_vars;
-    details::AppendSkipDeletionVars(&skip_vars, input_grad_var_names);
-    details::AppendSkipDeletionVars(&skip_vars, param_grad_names);
+    details::AppendSkipDeletionVars(input_grad_var_names, &skip_vars);
+    details::AppendSkipDeletionVars(param_grad_names, &skip_vars);
+    details::AppendSafeEagerDeletionSkipVars(*program, &skip_vars);
     VLOG(2) << "Prepare to skip " << skip_vars.size()
             << " var(s): " << string::join_strings(skip_vars, ' ');
 
     auto exe_ctx = exe.Prepare(*program, 0, skip_vars);
 
-    auto &scope = *(out_scope_vec->front());
     details::ShareVarsIntoScope(output_grad_vars, output_grad_var_names,
                                 &scope);
 
diff --git a/paddle/fluid/operators/scatter.cu.h b/paddle/fluid/operators/scatter.cu.h
index 9de810154e62278ea83273d1979c51b9b2429d39..7890d50e109281214df0bcdb9ac62884eab94791 100644
--- a/paddle/fluid/operators/scatter.cu.h
+++ b/paddle/fluid/operators/scatter.cu.h
@@ -26,14 +26,11 @@ namespace operators {
 
 using Tensor = framework::Tensor;
 
-#define CUDA_1D_KERNEL_LOOP(i, n)                              \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
 template <typename T, typename IndexT = int>
 __global__ void ScatterInitCUDAKernel(const IndexT* indices, T* output,
                                       size_t index_size, size_t slice_size,
                                       bool overwrite) {
-  CUDA_1D_KERNEL_LOOP(i, index_size * slice_size) {
+  CUDA_KERNEL_LOOP(i, index_size * slice_size) {
     int indices_i = i / slice_size;
     int slice_i = i - indices_i * slice_size;  // offset inside the slice
     IndexT scatter_i = indices[indices_i];
@@ -46,7 +43,7 @@ template <typename T, typename IndexT = int>
 __global__ void ScatterCUDAKernel(const T* params, const IndexT* indices,
                                   T* output, size_t index_size,
                                   size_t slice_size, bool overwrite) {
-  CUDA_1D_KERNEL_LOOP(i, index_size * slice_size) {
+  CUDA_KERNEL_LOOP(i, index_size * slice_size) {
     int indices_i = i / slice_size;
     int slice_i = i - indices_i * slice_size;  // offset inside the slice
     IndexT scatter_i = indices[indices_i];
@@ -64,7 +61,7 @@ __global__ void ScatterNdCUDAKernel(const T* update, const IndexT* indices,
                                     T* output, const int* output_dims,
                                     size_t remain_size, size_t slice_size,
                                     size_t end_size) {
-  CUDA_1D_KERNEL_LOOP(i, remain_size * slice_size) {
+  CUDA_KERNEL_LOOP(i, remain_size * slice_size) {
     int indices_i = i / slice_size;
     int slice_i = i - indices_i * slice_size;  // offset inside the slice
     IndexT gather_i = 0;
diff --git a/paddle/fluid/operators/search_compute.h b/paddle/fluid/operators/search_compute.h
index 675f5691aaa5429f6dd17b98c722fc7d3ed8d2c0..d166b350af30f7eb1118b4b507e80a515c1a42bb 100644
--- a/paddle/fluid/operators/search_compute.h
+++ b/paddle/fluid/operators/search_compute.h
@@ -14,7 +14,9 @@ limitations under the License. */
 
 #pragma once
 
+#if !defined(PADDLE_WITH_ARM)
 #include <immintrin.h>
+#endif
 #include <cfloat>
 #include <cmath>
 #include <cstring>
@@ -72,6 +74,8 @@ void call_gemm_batched(const framework::ExecutionContext& ctx,
   }
 }
 
+#if !defined(PADDLE_WITH_ARM)
+
 #define __m256x __m256
 
 static const unsigned int AVX_STEP_SIZE = 8;
@@ -94,6 +98,8 @@ static const unsigned int SSE_CUT_LEN_MASK = 1U;
 #define _mm_store_px _mm_storeu_ps
 #define _mm_load1_px _mm_load1_ps
 
+#endif
+
 template <typename T>
 inline void axpy(const T* x, T* y, size_t len, const T alpha) {
   unsigned int jjj, lll;
@@ -108,6 +114,8 @@ inline void axpy(const T* x, T* y, size_t len, const T alpha) {
         _mm256_add_px(_mm256_load_px(y + jjj),
                       _mm256_mul_px(mm_alpha, _mm256_load_px(x + jjj))));
   }
+#elif defined(PADDLE_WITH_ARM)
+  PADDLE_THROW(platform::errors::Unimplemented("axpy is not supported"));
 #else
   lll = len & ~SSE_CUT_LEN_MASK;
   __m128x mm_alpha = _mm_load1_px(&alpha);
@@ -135,6 +143,8 @@ inline void axpy_noadd(const T* x, T* y, size_t len, const T alpha) {
   for (jjj = 0; jjj < lll; jjj += AVX_STEP_SIZE) {
     _mm256_store_px(y + jjj, _mm256_mul_px(mm_alpha, _mm256_load_px(x + jjj)));
   }
+#elif defined(PADDLE_WITH_ARM)
+  PADDLE_THROW(platform::errors::Unimplemented("axpy_noadd is not supported"));
 #else
   lll = len & ~SSE_CUT_LEN_MASK;
   __m128x mm_alpha = _mm_load1_px(&alpha);
diff --git a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc
index 8b5d859a8d315434d3946d760b00e14b5f865d72..63420ee30e446da7420a4c4a71853c28e73a403d 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc
+++ b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc
@@ -24,24 +24,30 @@ class SequenceTopkAvgPoolingOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                      "Input(X) of SequencePoolOp should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasInput("ROW"), true,
-                      "Input(ROW) of SequencePoolOp should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasInput("COLUMN"), true,
-                      "Input(COLUMN) of SequencePoolOp should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
-                      "Output(Out) of SequencePoolOp should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("pos"), true,
-                      "pos(out) should not be null");
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "SequenceTopkAvgPooling");
+    OP_INOUT_CHECK(ctx->HasInput("ROW"), "Input", "ROW",
+                   "SequenceTopkAvgPooling");
+    OP_INOUT_CHECK(ctx->HasInput("COLUMN"), "Input", "COLUMN",
+                   "SequenceTopkAvgPooling");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out",
+                   "SequenceTopkAvgPooling");
+    OP_INOUT_CHECK(ctx->HasOutput("pos"), "Output", "pos",
+                   "SequenceTopkAvgPooling");
 
     auto attr = ctx->Attrs();
     auto channel_num = attr.Get<int>("channel_num");
+    PADDLE_ENFORCE_GT(
+        channel_num, 0,
+        platform::errors::InvalidArgument(
+            "Expected channel_num > 0, but received %d.", channel_num));
+
     auto topks = attr.Get<std::vector<int>>("topks");
+    auto num_k = topks.size();
+    PADDLE_ENFORCE_GT(
+        num_k, 0, platform::errors::InvalidArgument(
+                      "Expected topks.size() > 0, but received %zu.", num_k));
 
     auto row_dim = ctx->GetInputDim("ROW");
-
-    auto num_k = topks.size();
     auto row_shape_0 = row_dim[0];
 
     std::vector<int> vec_out_shape;
@@ -49,7 +55,7 @@ class SequenceTopkAvgPoolingOp : public framework::OperatorWithKernel {
     vec_out_shape.push_back(channel_num * num_k);
 
     ctx->SetOutputDim("Out", framework::make_ddim(vec_out_shape));
-    ctx->ShareLoD("X", "Out");
+    ctx->ShareLoD("ROW", "Out");
   }
 };
 
@@ -78,10 +84,10 @@ class SequenceTopkAvgPoolingGradOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true,
-                      "Gradient of Out should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
-                      "The input X should not be null.");
+    OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input",
+                   framework::GradVarName("Out"), "SequenceTopkAvgPoolingGrad");
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X",
+                   "SequenceTopkAvgPoolingGrad");
 
     ctx->ShareDim("X", /*->*/ framework::GradVarName("X"));
     ctx->ShareLoD("X", /*->*/ framework::GradVarName("X"));
diff --git a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h
index 2cb70ee736d38c0b00dfb275ee82a90b5c3c0261..e8e0241e46ad2a33289a77d8607546b4522b69bf 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h
+++ b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h
@@ -13,52 +13,57 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <functional>
 #include <limits>
+#include <queue>
 #include <string>
+#include <utility>
 #include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
 
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+static constexpr int TopKPosPaddingId = -1;
+
+namespace details {
+
 template <typename T>
-void get_topk_pos(const T* data, int length, int k, int* pos) {
-  size_t real_k = k < length ? k : length;
-
-  std::vector<T> v(data, data + length);
-
-  std::vector<int> topk_pos;
-  T min_val = std::numeric_limits<T>::lowest();
-  while (topk_pos.size() < real_k) {
-    T max_val = min_val;
-    int max_pos = -1;
-    for (int i = 0; i < length; ++i) {
-      if (v[i] > max_val) {
-        max_pos = i;
-        max_val = v[i];
+static void get_topk_pos(const T* data, int length, int k, int* pos) {
+  VLOG(3) << "length: " << length << " , k : " << k;
+
+  std::priority_queue<std::pair<T, int>, std::vector<std::pair<T, int>>,
+                      std::greater<std::pair<T, int>>>
+      topk_queue;
+
+  for (int i = 0; i < length; ++i) {
+    T elem = data[i];
+    if (topk_queue.size() < static_cast<size_t>(k)) {
+      topk_queue.emplace(elem, i);
+    } else {
+      if (elem >= topk_queue.top().first) {
+        // replace top node if found a bigger value
+        topk_queue.pop();
+        topk_queue.emplace(elem, i);
       }
     }
-
-    assert(max_pos >= 0);
-
-    topk_pos.push_back(max_pos);
-    v[max_pos] = min_val;
   }
-
-  assert(topk_pos.size() > 0);
-  while (topk_pos.size() < (size_t)k) {
-    topk_pos.push_back(-1);
+  // reversely assign value
+  int real_k = topk_queue.size();
+  for (int i = real_k - 1; i >= 0; --i) {
+    pos[i] = topk_queue.top().second;
+    topk_queue.pop();
   }
-
-  for (size_t i = 0; i < topk_pos.size(); ++i) {
-    pos[i] = topk_pos[i];
+  // if length of data is less than k, fill TopKPosPaddingId at the end of pos.
+  for (int i = real_k; i < k; ++i) {
+    pos[i] = TopKPosPaddingId;
   }
 }
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
+}  // namespace details
 
 template <typename DeviceContext, typename T>
 class SequenceTopkAvgPoolingKernel : public framework::OpKernel<T> {
@@ -70,20 +75,29 @@ class SequenceTopkAvgPoolingKernel : public framework::OpKernel<T> {
     auto* out = context.Output<LoDTensor>("Out");
     auto* pos = context.Output<Tensor>("pos");
 
-    PADDLE_ENFORCE_EQ(in->lod().empty(), false,
-                      "Input(X) Tensor of SequenceTopkAvgPoolingOp does not "
-                      "contain LoD information.");
-    PADDLE_ENFORCE_EQ(row->lod().empty(), false,
-                      "Input(ROW) Tensor of SequenceTopkAvgPoolingOp does not "
-                      "contain LoD information.");
-    PADDLE_ENFORCE_EQ(col->lod().empty(), false,
-                      "Input(COLUMN) Tensor of SequenceTopkAvgPoolingOp does "
-                      "not contain LoD information.");
+    PADDLE_ENFORCE_EQ(
+        in->lod().empty(), false,
+        platform::errors::InvalidArgument(
+            "Input(X) Tensor of SequenceTopkAvgPoolingOp does not "
+            "contain LoD information."));
+    PADDLE_ENFORCE_EQ(
+        row->lod().empty(), false,
+        platform::errors::InvalidArgument(
+            "Input(ROW) Tensor of SequenceTopkAvgPoolingOp does not "
+            "contain LoD information."));
+    PADDLE_ENFORCE_EQ(
+        col->lod().empty(), false,
+        platform::errors::InvalidArgument(
+            "Input(COLUMN) Tensor of SequenceTopkAvgPoolingOp does "
+            "not contain LoD information."));
 
     auto channel_num = context.Attr<int>("channel_num");
     auto topks = context.Attr<std::vector<int>>("topks");
     auto k_num = topks.size();
     auto max_k = topks[topks.size() - 1];
+    PADDLE_ENFORCE_GE(max_k, 0,
+                      platform::errors::InvalidArgument(
+                          "Expected max_k >= 0, but received %d.", max_k));
     std::vector<int> vec_pos_shape;
     auto in_lod = in->lod()[0];
 
@@ -116,7 +130,10 @@ class SequenceTopkAvgPoolingKernel : public framework::OpKernel<T> {
       int row_size = row_lod[i + 1] - row_lod[i];
       int col_size = col_lod[i + 1] - col_lod[i];
       PADDLE_ENFORCE_EQ(total_size, channel_num * row_size * col_size,
-                        "size wrong in sequence_topk_avg_pooling_op!");
+                        platform::errors::PreconditionNotMet(
+                            "Expected total_size == channel_num * row_size * "
+                            "col_size, but got %d != %d.",
+                            total_size, channel_num * row_size * col_size));
 
       int feature_num = row_size * col_size;
       for (int j = 0; j < channel_num; ++j) {
@@ -130,14 +147,14 @@ class SequenceTopkAvgPoolingKernel : public framework::OpKernel<T> {
           auto out_slice_data = dout_data + row_lod[i] * channel_num * k_num +
                                 r * channel_num * k_num + j * k_num;
 
-          get_topk_pos<T>(row_data, col_size, max_k, pos_slice_data);
-          if (pos_slice_data[0] == -1) {
+          details::get_topk_pos<T>(row_data, col_size, max_k, pos_slice_data);
+          if (pos_slice_data[0] == TopKPosPaddingId) {
             sum_data[0] = 0.0;
           } else {
             sum_data[0] = row_data[pos_slice_data[0]];
           }
           for (int k = 1; k < max_k; ++k) {
-            if (pos_slice_data[k] == -1) {
+            if (pos_slice_data[k] == TopKPosPaddingId) {
               sum_data[k] = sum_data[k - 1];
             } else {
               sum_data[k] = sum_data[k - 1] + row_data[pos_slice_data[k]];
@@ -206,7 +223,7 @@ class SequenceTopkAvgPoolingGradKernel : public framework::OpKernel<T> {
 
           for (size_t m = 0; m < k_num; ++m) {
             for (int k = 0; k < topks[m]; ++k) {
-              if (pos_slice_data[k] == -1) {
+              if (pos_slice_data[k] == TopKPosPaddingId) {
                 break;
               } else {
                 in_slice_data[pos_slice_data[k]] += row_data[m] / topks[m];
diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu
index 7c3a0ecba02a5d16dcb45025284680ba933ce9d5..cdcd51904e8840772ffcd18aac3a24eea7b7fd17 100644
--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu
+++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu
@@ -31,15 +31,11 @@ static inline int NumBlocks(const int N) {
                   kNumMaxinumNumBlocks);
 }
 
-#define CUDA_1D_KERNEL_LOOP(i, n)                              \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
-
 template <typename T>
 __global__ void GPUSigmoidForward(const T *x_data, const T *label_data,
                                   const int ignore_index, const int limit,
                                   T *out_data, T *counts) {
-  CUDA_1D_KERNEL_LOOP(i, limit) {
+  CUDA_KERNEL_LOOP(i, limit) {
     T x = x_data[i];
     T label = label_data[i];
     T eps = static_cast<T>(1e-5);
@@ -77,14 +73,14 @@ __global__ void Sum(const T *counts, int num, const T eps, T *sum) {
 
 template <typename T>
 __global__ void Div(T *loss, const int num, const T *norm) {
-  CUDA_1D_KERNEL_LOOP(i, num) { loss[i] /= norm[0]; }
+  CUDA_KERNEL_LOOP(i, num) { loss[i] /= norm[0]; }
 }
 
 template <typename T>
 __global__ void GPUSigmoidBackward(const T *x_data, const T *label_data,
                                    const int ignore_index, const T *dout_data,
                                    const int limit, T *dx_data, T *counts) {
-  CUDA_1D_KERNEL_LOOP(i, limit) {
+  CUDA_KERNEL_LOOP(i, limit) {
     T x = x_data[i];
     T label = label_data[i];
     T dout = dout_data[i];
diff --git a/paddle/fluid/operators/slice_op.cu b/paddle/fluid/operators/slice_op.cu
index d6945df9e184e0582628f56eecb96139f344bf52..7493b18936492c79107d601516fc7e4f5d05194e 100644
--- a/paddle/fluid/operators/slice_op.cu
+++ b/paddle/fluid/operators/slice_op.cu
@@ -12,145 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <thrust/device_vector.h>
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/slice_op.h"
-#include "paddle/fluid/platform/cuda_device_function.h"
-#include "paddle/fluid/platform/cuda_primitives.h"
 #include "paddle/fluid/platform/float16.h"
-namespace paddle {
-namespace operators {
-
-using platform::PADDLE_CUDA_NUM_THREADS;
-
-template <size_t D>
-__global__ void Padding(const paddle::platform::float16* d_out,
-                        const int64_t* out_dims, const int64_t* in_dims,
-                        const int64_t* offsets, int64_t n,
-                        paddle::platform::float16* d_in) {
-  int64_t out_idx = threadIdx.x + blockDim.x * blockIdx.x;
-  if (out_idx < n) {
-    int64_t out_idx_tmp = out_idx;
-    int64_t coords[D] = {0};
-    for (int i = D - 1; i >= 0; --i) {
-      coords[i] = out_idx_tmp % out_dims[i];
-      out_idx_tmp /= out_dims[i];
-      coords[i] += offsets[i];
-    }
-
-    int64_t in_idx = 0;
-    for (int i = 0; i < D; ++i) {
-      in_idx = in_idx * in_dims[i] + coords[i];
-    }
-
-    d_in[in_idx] = d_out[out_idx];
-  }
-}
-
-template <>
-class SliceGradKernel<paddle::platform::CUDADeviceContext,
-                      paddle::platform::float16>
-    : public framework::OpKernel<paddle::platform::float16> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* d_in = ctx.Output<framework::Tensor>(framework::GradVarName("Input"));
-    d_in->mutable_data<paddle::platform::float16>(ctx.GetPlace());
-
-    auto out_dims = d_out->dims();
-    auto in_dims = d_in->dims();
-    int rank = out_dims.size();
-    std::vector<int64_t> offsets(rank, 0);
-    auto axes = ctx.Attr<std::vector<int>>("axes");
-    auto starts_int = ctx.Attr<std::vector<int>>("starts");
-    std::vector<int64_t> starts(starts_int.begin(), starts_int.end());
-
-    auto list_new_starts_tensor =
-        ctx.MultiInput<framework::Tensor>("StartsTensorList");
-
-    if (list_new_starts_tensor.size() > 0) {
-      starts = GetDataFromTensorList<int64_t>(list_new_starts_tensor);
-    } else if (ctx.HasInput("StartsTensor")) {
-      auto* starts_tensor = ctx.Input<framework::Tensor>("StartsTensor");
-      starts = GetDataFromTensor<int64_t>(starts_tensor);
-    }
-
-    for (size_t i = 0; i < starts.size(); ++i) {
-      if (starts[i] < 0) {
-        starts[i] += in_dims[axes[i]];
-      }
-      offsets[axes[i]] = std::max(starts[i], static_cast<int64_t>(0));
-    }
-
-    math::SetConstant<paddle::platform::CUDADeviceContext,
-                      paddle::platform::float16>
-        set_zero;
-    auto& dev_ctx =
-        ctx.template device_context<paddle::platform::CUDADeviceContext>();
-    set_zero(dev_ctx, d_in, static_cast<paddle::platform::float16>(0));
-
-    int64_t numel = d_out->numel();
-    dim3 blocks((numel - 1) / PADDLE_CUDA_NUM_THREADS + 1);
-    dim3 threads(PADDLE_CUDA_NUM_THREADS);
-    auto stream = ctx.cuda_device_context().stream();
-    const std::vector<int64_t> out_shape =
-        framework::vectorize<int64_t>(out_dims);
-    const std::vector<int64_t> in_shape =
-        framework::vectorize<int64_t>(in_dims);
-
-    framework::Tensor out_dims_tensor;
-    framework::Tensor in_dims_tensor;
-    framework::Tensor offsets_tensor;
-    framework::TensorFromVector(out_shape, ctx.device_context(),
-                                &out_dims_tensor);
-    framework::TensorFromVector(in_shape, ctx.device_context(),
-                                &in_dims_tensor);
-    framework::TensorFromVector(offsets, ctx.device_context(), &offsets_tensor);
-    const int64_t* out_dims_ptr = out_dims_tensor.data<int64_t>();
-    const int64_t* in_dims_ptr = in_dims_tensor.data<int64_t>();
-    const int64_t* offsets_ptr = offsets_tensor.data<int64_t>();
-
-    switch (rank) {
-      case 1:
-        Padding<1><<<blocks, threads, 0, stream>>>(
-            d_out->data<paddle::platform::float16>(), out_dims_ptr, in_dims_ptr,
-            offsets_ptr, numel, d_in->data<paddle::platform::float16>());
-        break;
-      case 2:
-        Padding<2><<<blocks, threads, 0, stream>>>(
-            d_out->data<paddle::platform::float16>(), out_dims_ptr, in_dims_ptr,
-            offsets_ptr, numel, d_in->data<paddle::platform::float16>());
-        break;
-      case 3:
-        Padding<3><<<blocks, threads, 0, stream>>>(
-            d_out->data<paddle::platform::float16>(), out_dims_ptr, in_dims_ptr,
-            offsets_ptr, numel, d_in->data<paddle::platform::float16>());
-        break;
-      case 4:
-        Padding<4><<<blocks, threads, 0, stream>>>(
-            d_out->data<paddle::platform::float16>(), out_dims_ptr, in_dims_ptr,
-            offsets_ptr, numel, d_in->data<paddle::platform::float16>());
-        break;
-      case 5:
-        Padding<5><<<blocks, threads, 0, stream>>>(
-            d_out->data<paddle::platform::float16>(), out_dims_ptr, in_dims_ptr,
-            offsets_ptr, numel, d_in->data<paddle::platform::float16>());
-        break;
-      case 6:
-        Padding<6><<<blocks, threads, 0, stream>>>(
-            d_out->data<paddle::platform::float16>(), out_dims_ptr, in_dims_ptr,
-            offsets_ptr, numel, d_in->data<paddle::platform::float16>());
-        break;
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
 
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
+
 REGISTER_OP_CUDA_KERNEL(
     slice, ops::SliceKernel<paddle::platform::CUDADeviceContext, float>,
     ops::SliceKernel<paddle::platform::CUDADeviceContext, double>,
diff --git a/paddle/fluid/operators/slice_op.h b/paddle/fluid/operators/slice_op.h
index 39cc605f6b318d5a356f5e9fd2d66fc5a8b6700d..ee46f4d821c783813a3cdcf051c58bfa8d3212e9 100644
--- a/paddle/fluid/operators/slice_op.h
+++ b/paddle/fluid/operators/slice_op.h
@@ -350,7 +350,7 @@ class SliceGradKernel : public framework::OpKernel<T> {
       platform::DeviceContextPool& pool =
           platform::DeviceContextPool::Instance();
       auto& dev_ctx = *pool.Get(context.GetPlace());
-      T value = 0.0;
+      T value = T(0);
       math::SetConstant<DeviceContext, T> functor;
       for (int i = 0; i < d_in_size; ++i) {
         auto dim = input_array->at(i).dims();
@@ -440,7 +440,7 @@ class SliceGradKernel : public framework::OpKernel<T> {
     auto d_out_t =
         framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
             *d_out, out_dims);
-    d_in_t.device(place) = d_out_t.pad(paddings, 0);
+    d_in_t.device(place) = d_out_t.pad(paddings, T(0));
   }
 };
 }  // namespace operators
diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
index dbda4b9b7e03a41a9630722dfe82fbde62ee5437..ba56e5e36f9851276b4986022452c7914e30dde4 100644
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
@@ -24,24 +24,24 @@ template <typename T>
 __global__ void CrossEntropyGrad(T* logit_grad, const int64_t* labels,
                                  const int n, const int d, const int remain,
                                  const int ignore_index) {
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n * remain;
-       i += blockDim.x * gridDim.x) {
-    int idx_n = i / remain;
-    int idx_remain = i % remain;
-    int idx = idx_n * d + labels[i] * remain + idx_remain;
-    logit_grad[idx] -=
-        ignore_index == labels[i] ? static_cast<T>(0.) : static_cast<T>(1.);
+  CUDA_KERNEL_LOOP(index, n * remain) {
+    int idx_n = index / remain;
+    int idx_remain = index % remain;
+    int tmp = labels[index];
+    if (ignore_index != tmp) {
+      int idx = idx_n * d + tmp * remain + idx_remain;
+      logit_grad[idx] -= static_cast<T>(1.);
+    }
   }
 }
 
 template <typename T>
 __global__ void Scale(T* logit_grad, const T* loss_grad, const int num,
                       const int d, const int remain) {
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
-       i += blockDim.x * gridDim.x) {
-    int idx_n = i / d;
-    int idx_remain = i % remain;
-    logit_grad[i] *= loss_grad[idx_n * remain + idx_remain];
+  CUDA_KERNEL_LOOP(index, num) {
+    int idx_n = index / d;
+    int idx_remain = index % remain;
+    logit_grad[index] *= loss_grad[idx_n * remain + idx_remain];
   }
 }
 
diff --git a/paddle/fluid/operators/trace_op.cc b/paddle/fluid/operators/trace_op.cc
index 6bb158c5816762a6d9c4660f49b3fb48168d57f6..66766b4e1cd830f8dda40befa228294b976a4ff7 100644
--- a/paddle/fluid/operators/trace_op.cc
+++ b/paddle/fluid/operators/trace_op.cc
@@ -30,8 +30,8 @@ class TraceOp : public framework::OperatorWithKernel {
         ctx->HasOutput("Out"), true,
         platform::errors::NotFound("Output of TraceOp is not found."));
 
-    int dim1 = ctx->Attrs().Get<int>("dim1");
-    int dim2 = ctx->Attrs().Get<int>("dim2");
+    int dim1 = ctx->Attrs().Get<int>("axis1");
+    int dim2 = ctx->Attrs().Get<int>("axis2");
 
     auto x_dims = ctx->GetInputDim("Input");
 
@@ -84,15 +84,15 @@ class TraceOpMaker : public framework::OpProtoAndCheckerMaker {
         )DOC")
         .SetDefault(0);
     AddAttr<int>(
-        "dim1",
-        R"DOC((int, default 0), the first dim of the 2-D planes from which the diagonals should be taken. 
-        Can be both positive and negative. Default: 0.
+        "axis1",
+        R"DOC((int, default 0), the first axis of the 2-D planes from which the diagonals should be taken. 
+        Can be either positive or negative. Default: 0.
         )DOC")
         .SetDefault(-2);
     AddAttr<int>(
-        "dim2",
-        R"DOC((int, default 1), the second dim of the 2-D planes from which the diagonals should be taken. 
-        Can be both positive and negative. Default: 1.
+        "axis2",
+        R"DOC((int, default 1), the second axis of the 2-D planes from which the diagonals should be taken. 
+        Can be either positive or negative. Default: 1.
         )DOC")
         .SetDefault(-1);
     AddComment(R"DOC(
diff --git a/paddle/fluid/operators/trace_op.cu b/paddle/fluid/operators/trace_op.cu
index ffba298cc232e82bb7f133f181944f63df72da67..452f2dd9d62bedb449979a11698e4eb0bb116ce9 100644
--- a/paddle/fluid/operators/trace_op.cu
+++ b/paddle/fluid/operators/trace_op.cu
@@ -33,8 +33,8 @@ class TraceCUDAKernel : public framework::OpKernel<T> {
     auto* out = context.Output<framework::Tensor>("Out");
 
     const int64_t offset = context.Attr<int>("offset");
-    const int64_t dim1 = context.Attr<int>("dim1");
-    const int64_t dim2 = context.Attr<int>("dim2");
+    const int64_t dim1 = context.Attr<int>("axis1");
+    const int64_t dim2 = context.Attr<int>("axis2");
 
     T* out_data = out->mutable_data<T>(context.GetPlace());
     const framework::Tensor diag =
diff --git a/paddle/fluid/operators/trace_op.h b/paddle/fluid/operators/trace_op.h
index 51d807bfb3dd02b2e15fe39ebb749f927667daec..54c4251a38cf10a8f489ca78346fae9471b464db 100644
--- a/paddle/fluid/operators/trace_op.h
+++ b/paddle/fluid/operators/trace_op.h
@@ -174,8 +174,8 @@ class TraceKernel : public framework::OpKernel<T> {
     auto* out = context.Output<framework::Tensor>("Out");
 
     const int64_t offset = context.Attr<int>("offset");
-    const int64_t dim1 = context.Attr<int>("dim1");
-    const int64_t dim2 = context.Attr<int>("dim2");
+    const int64_t dim1 = context.Attr<int>("axis1");
+    const int64_t dim2 = context.Attr<int>("axis2");
 
     auto output_dims = out->dims();
 
@@ -205,8 +205,8 @@ class TraceGradKernel : public framework::OpKernel<T> {
         context.Output<framework::Tensor>(framework::GradVarName("Input"));
 
     int64_t offset = context.Attr<int>("offset");
-    int64_t dim1 = context.Attr<int>("dim1");
-    int64_t dim2 = context.Attr<int>("dim2");
+    int64_t dim1 = context.Attr<int>("axis1");
+    int64_t dim2 = context.Attr<int>("axis2");
 
     auto input_dims = d_x->dims();
     auto input_stride = framework::stride(input_dims);
diff --git a/paddle/fluid/operators/transpose_op.cu b/paddle/fluid/operators/transpose_op.cu
index f2d39a35c3d8d8db39c5fbbfe12283ce1c874e54..79dd29ebc691c59c653c3114373226994f24b131 100644
--- a/paddle/fluid/operators/transpose_op.cu
+++ b/paddle/fluid/operators/transpose_op.cu
@@ -29,10 +29,6 @@ using Tensor = framework::Tensor;
 using Dim3 = framework::Dim3;
 using Index3 = framework::Index3;
 
-#define CUDA_1D_KERNEL_LOOP(i, n)                              \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
-       i += blockDim.x * gridDim.x)
-
 struct EqualTo {
   constexpr bool operator()(int a, int b) const { return a == b; }
 };
@@ -464,7 +460,7 @@ __global__ void TransposeSimpleKernel(int nthreads, const T* __restrict__ input,
   output_dims[pos1] = input_dims[1];
   output_dims[pos2] = input_dims[2];
 
-  CUDA_1D_KERNEL_LOOP(output_index, nthreads) {
+  CUDA_KERNEL_LOOP(output_index, nthreads) {
     Index3 output_tensor_index = ConvertTensorIndex(output_index, output_dims);
 
     Index3 input_tensor_index;
@@ -664,19 +660,26 @@ template <typename DeviceContext, typename T>
 class TransposeGPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto* x = context.Input<framework::Tensor>("X");
-    auto* out = context.Output<framework::Tensor>("Out");
-    out->mutable_data<T>(context.GetPlace());
-    if (out->numel() == 0) {
+    auto* x = context.InputVar("X");
+    auto* out = context.OutputVar("Out");
+
+    const framework::Tensor* x_tensor =
+        GetLoDTensorOrSelectedRowsValueFromVar(*x);
+    framework::Tensor* out_tensor =
+        GetMutableLoDTensorOrSelectedRowsValueFromVar(out);
+
+    out_tensor->mutable_data<T>(context.GetPlace());
+    if (out_tensor->numel() == 0) {
       return;
     }
 
     std::vector<int> axis = context.Attr<std::vector<int>>("axis");
     int ndims = axis.size();
     const auto& dev_ctx = context.template device_context<DeviceContext>();
-    auto ret = TransposeSimple<T>::run(dev_ctx, *x, axis, out);
+    auto ret = TransposeSimple<T>::run(dev_ctx, *x_tensor, axis, out_tensor);
     if (!ret) {
-      TransCompute<DeviceContext, T>(ndims, dev_ctx, *x, out, axis);
+      TransCompute<DeviceContext, T>(ndims, dev_ctx, *x_tensor, out_tensor,
+                                     axis);
     }
   }
 };
@@ -684,14 +687,19 @@ template <typename DeviceContext, typename T>
 class TransposeGradGPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto* out_grad =
-        context.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* x_grad =
-        context.Output<framework::Tensor>(framework::GradVarName("X"));
-    if (!x_grad) return;
-
-    x_grad->mutable_data<T>(context.GetPlace());
-    if (x_grad->numel() == 0) {
+    auto* out_grad = context.InputVar(framework::GradVarName("Out"));
+    auto* x_grad = context.OutputVar(framework::GradVarName("X"));
+    if (!x_grad) {
+      return;
+    }
+
+    const framework::Tensor* out_grad_tensor =
+        GetLoDTensorOrSelectedRowsValueFromVar(*out_grad);
+    framework::Tensor* x_grad_tensor =
+        GetMutableLoDTensorOrSelectedRowsValueFromVar(x_grad);
+
+    x_grad_tensor->mutable_data<T>(context.GetPlace());
+    if (x_grad_tensor->numel() == 0) {
       return;
     }
     std::vector<int> axis = context.Attr<std::vector<int>>("axis");
@@ -703,11 +711,11 @@ class TransposeGradGPUKernel : public framework::OpKernel<T> {
 
     int ndims = axis.size();
     const auto& dev_ctx = context.template device_context<DeviceContext>();
-    auto ret =
-        TransposeSimple<T>::run(dev_ctx, *out_grad, reversed_axis, x_grad);
+    auto ret = TransposeSimple<T>::run(dev_ctx, *out_grad_tensor, reversed_axis,
+                                       x_grad_tensor);
     if (!ret) {
-      TransCompute<DeviceContext, T>(ndims, dev_ctx, *out_grad, x_grad,
-                                     reversed_axis);
+      TransCompute<DeviceContext, T>(ndims, dev_ctx, *out_grad_tensor,
+                                     x_grad_tensor, reversed_axis);
     }
   }
 };
diff --git a/paddle/fluid/operators/transpose_op.h b/paddle/fluid/operators/transpose_op.h
index f2951e90ebe883c5006081ff7e4c8f97742cafff..d7f5c3dd457c90eefc4181cdbc662196a046853e 100644
--- a/paddle/fluid/operators/transpose_op.h
+++ b/paddle/fluid/operators/transpose_op.h
@@ -64,16 +64,23 @@ template <typename DeviceContext, typename T>
 class TransposeKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto* x = context.Input<framework::Tensor>("X");
-    auto* out = context.Output<framework::Tensor>("Out");
-    out->mutable_data<T>(context.GetPlace());
-    if (out->numel() == 0) {
+    auto* x = context.InputVar("X");
+    auto* out = context.OutputVar("Out");
+
+    const framework::Tensor* x_tensor =
+        GetLoDTensorOrSelectedRowsValueFromVar(*x);
+    framework::Tensor* out_tensor =
+        GetMutableLoDTensorOrSelectedRowsValueFromVar(out);
+
+    out_tensor->mutable_data<T>(context.GetPlace());
+    if (out_tensor->numel() == 0) {
       return;
     }
+
     std::vector<int> axis = context.Attr<std::vector<int>>("axis");
     int ndims = axis.size();
     auto& dev_ctx = context.template device_context<DeviceContext>();
-    TransCompute<DeviceContext, T>(ndims, dev_ctx, *x, out, axis);
+    TransCompute<DeviceContext, T>(ndims, dev_ctx, *x_tensor, out_tensor, axis);
   }
 };
 
@@ -81,14 +88,19 @@ template <typename DeviceContext, typename T>
 class TransposeGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto* out_grad =
-        context.Input<framework::Tensor>(framework::GradVarName("Out"));
-    auto* x_grad =
-        context.Output<framework::Tensor>(framework::GradVarName("X"));
-    if (!x_grad) return;
-
-    x_grad->mutable_data<T>(context.GetPlace());
-    if (x_grad->numel() == 0) {
+    auto* out_grad = context.InputVar(framework::GradVarName("Out"));
+    auto* x_grad = context.OutputVar(framework::GradVarName("X"));
+
+    if (!x_grad) {
+      return;
+    }
+    const framework::Tensor* out_grad_tensor =
+        GetLoDTensorOrSelectedRowsValueFromVar(*out_grad);
+    framework::Tensor* x_grad_tensor =
+        GetMutableLoDTensorOrSelectedRowsValueFromVar(x_grad);
+
+    x_grad_tensor->mutable_data<T>(context.GetPlace());
+    if (x_grad_tensor->numel() == 0) {
       return;
     }
 
@@ -101,8 +113,8 @@ class TransposeGradKernel : public framework::OpKernel<T> {
 
     int ndims = axis.size();
     auto& dev_ctx = context.template device_context<DeviceContext>();
-    TransCompute<DeviceContext, T>(ndims, dev_ctx, *out_grad, x_grad,
-                                   reversed_axis);
+    TransCompute<DeviceContext, T>(ndims, dev_ctx, *out_grad_tensor,
+                                   x_grad_tensor, reversed_axis);
   }
 };
 
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index c49989db3d525fbc33cfd64db1cc9db1f241f253..dcc3a51e72b3ef5ffc29f7db566840e32b5d43e9 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -93,13 +93,6 @@ if(WITH_GPU)
     target_link_libraries(device_context cuda_resource_pool)
 endif()
 
-if(WIN32)
-    if(WITH_GPU AND NOT WITH_DSO)
-        get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES)
-        target_link_libraries(device_context ${cuda_modules})
-    endif(WITH_GPU AND NOT WITH_DSO)
-endif(WIN32)
-
 nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
 
 cc_test(init_test SRCS init_test.cc DEPS device_context)
diff --git a/paddle/fluid/platform/collective_helper.cc b/paddle/fluid/platform/collective_helper.cc
index 71482c10062a599a1a53280192ed0086a7c5d9e4..4cb6ee3143a862f59a3c224a012d380e629e738f 100644
--- a/paddle/fluid/platform/collective_helper.cc
+++ b/paddle/fluid/platform/collective_helper.cc
@@ -57,11 +57,25 @@ class NCCLCommImpl : public NCCLComm {
 
 NCCLComm* NCCLCommContext::CreateNCCLComm(ncclUniqueId* nccl_id, int nranks,
                                           int rank, int dev_id, int ring_id) {
-  PADDLE_ENFORCE_NOT_NULL(nccl_id);
-  PADDLE_ENFORCE_GT(nranks, 1);
-  PADDLE_ENFORCE_GE(rank, 0);
-  PADDLE_ENFORCE_LT(rank, nranks);
-  PADDLE_ENFORCE_GE(dev_id, 0);
+  PADDLE_ENFORCE_NOT_NULL(nccl_id,
+                          platform::errors::InvalidArgument(
+                              "The nccl unique id should not be null."));
+  PADDLE_ENFORCE_GT(
+      nranks, 1,
+      platform::errors::InvalidArgument(
+          "Expected nranks > 1. But received nranks is %d.", nranks));
+  PADDLE_ENFORCE_GE(rank, 0,
+                    platform::errors::InvalidArgument(
+                        "Expected rank >= 0. But received rank is %d.", rank));
+  PADDLE_ENFORCE_LT(
+      rank, nranks,
+      platform::errors::InvalidArgument(
+          "Expected rank < nranks. But received rank is %d, nranks is %d.",
+          rank, nranks));
+  PADDLE_ENFORCE_GE(
+      dev_id, 0,
+      platform::errors::InvalidArgument(
+          "Expected dev_id >= 0. But received dev_id is %d.", dev_id));
 
   ncclComm_t comm = nullptr;
   PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(dev_id));
@@ -82,14 +96,22 @@ NCCLComm* NCCLCommContext::CreateNCCLComm(ncclUniqueId* nccl_id, int nranks,
 
 void NCCLCommContext::CreateAllNCCLComms(const std::vector<int>& dev_ids,
                                          int ring_id) {
-  PADDLE_ENFORCE_GT(dev_ids.size(), 0);
+  PADDLE_ENFORCE_GT(
+      dev_ids.size(), 0,
+      platform::errors::InvalidArgument("Expected the size of dev_ids > 0. But "
+                                        "received the size of dev_ids is %d.",
+                                        dev_ids.size()));
 
   const int kDevices = dev_ids.size();
   ncclComm_t comms[kDevices];
   PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclCommInitAll(
       comms, dev_ids.size(), dev_ids.data()));
 
-  PADDLE_ENFORCE_EQ(comm_map_.count(ring_id), 0);
+  PADDLE_ENFORCE_EQ(comm_map_.count(ring_id), 0,
+                    platform::errors::InvalidArgument(
+                        "Expected comm_map_.count(ring_id) = 0. But received "
+                        "comm_map_.count(ring_id) is %d.",
+                        comm_map_.count(ring_id)));
   for (size_t i = 0; i < dev_ids.size(); ++i) {
     AssignNCCLComm(comms[i], dev_ids.size(), i, dev_ids[i], ring_id);
     VLOG(1) << "nccl communicator of rank " << i << " in ring " << ring_id
diff --git a/paddle/fluid/platform/collective_helper.h b/paddle/fluid/platform/collective_helper.h
index 61cad961f51a0d800a12200b6a28aa50fa867496..cc19fd5ac4985969c759ef69c4b4036e714b93b4 100644
--- a/paddle/fluid/platform/collective_helper.h
+++ b/paddle/fluid/platform/collective_helper.h
@@ -78,24 +78,28 @@ class NCCLCommContext {
 
   // retrieve a communicator by the ring id in multiprocessing mode
   NCCLComm* Get(int ring_id) const {
-    PADDLE_ENFORCE_GT(comm_map_.count(ring_id), 0,
-                      "comunicator in ring id %d has not been initialized",
-                      ring_id);
+    PADDLE_ENFORCE_GT(
+        comm_map_.count(ring_id), 0,
+        platform::errors::InvalidArgument(
+            "Communicator in ring id %d has not been initialized.", ring_id));
     PADDLE_ENFORCE_EQ(comm_map_.at(ring_id).size(), 1,
-                      "you should specify a device id to retrieve from "
-                      "multiple communicators");
+                      platform::errors::InvalidArgument(
+                          "One device id should be specified to retrieve from "
+                          "multiple communicators."));
     return comm_map_.at(ring_id).begin()->second.get();
   }
 
   // retrieve a communicator by the ring id and the device id
   NCCLComm* Get(int ring_id, int dev_id) const {
-    PADDLE_ENFORCE_GT(comm_map_.count(ring_id), 0,
-                      "comunicator of ring id %d has not been initialized",
-                      ring_id);
+    PADDLE_ENFORCE_GT(
+        comm_map_.count(ring_id), 0,
+        platform::errors::InvalidArgument(
+            "Communicator of ring id %d has not been initialized.", ring_id));
     PADDLE_ENFORCE_GT(
         comm_map_.at(ring_id).count(dev_id), 0,
-        "comunicator at device id %d has not been initialized in ring %d",
-        dev_id, ring_id);
+        platform::errors::InvalidArgument(
+            "Communicator at device id %d has not been initialized in ring %d.",
+            dev_id, ring_id));
     return comm_map_.at(ring_id).at(dev_id).get();
   }
 
diff --git a/paddle/fluid/platform/cpu_helper.cc b/paddle/fluid/platform/cpu_helper.cc
index b737a6c38d0441cbfcd55ed2c55969ceca68db5d..a402f397348a4648cbef0a6026bde3e865bd5be1 100644
--- a/paddle/fluid/platform/cpu_helper.cc
+++ b/paddle/fluid/platform/cpu_helper.cc
@@ -43,7 +43,9 @@ void SetNumThreads(int num_threads) {
   platform::dynload::MKL_Set_Num_Threads(real_num_threads);
   omp_set_num_threads(real_num_threads);
 #else
-  PADDLE_ENFORCE(false, "To be implemented.");
+  PADDLE_THROW(platform::errors::Unimplemented(
+      "This library (except OPENBLAS, MKLML) is not supported yet, so the"
+      "number of threads cannot be set."));
 #endif
 }
 
diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc
index 6545e62927fe05b1d7a3426b3d91c5eb6aac0c2d..63760ada2b4d5226035b990cf5ecb7e1d21fbbe2 100644
--- a/paddle/fluid/platform/cpu_info.cc
+++ b/paddle/fluid/platform/cpu_info.cc
@@ -139,7 +139,7 @@ bool MayIUse(const cpu_isa_t cpu_isa) {
   if (cpu_isa == isa_any) {
     return true;
   } else {
-#ifndef WITH_NV_JETSON
+#if !defined(WITH_NV_JETSON) && !defined(PADDLE_WITH_ARM)
     int reg[4];
     cpuid(reg, 0);
     int nIds = reg[0];
diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h
index 66f05d51c0b6795e7509b3e80d9ffd3e4a28adfd..c071246c512500b47c1e131a44a71c6bb5377be2 100644
--- a/paddle/fluid/platform/cpu_info.h
+++ b/paddle/fluid/platform/cpu_info.h
@@ -40,7 +40,7 @@ limitations under the License. */
 #ifdef _WIN32
 #define cpuid(reg, x) __cpuidex(reg, x, 0)
 #else
-#ifndef WITH_NV_JETSON
+#if !defined(WITH_NV_JETSON) && !defined(PADDLE_WITH_ARM)
 #include <cpuid.h>
 inline void cpuid(int reg[4], int x) {
   __cpuid_count(x, 0, reg[0], reg[1], reg[2], reg[3]);
diff --git a/paddle/fluid/platform/cuda_helper.h b/paddle/fluid/platform/cuda_helper.h
index 74cf5545239f1dde1a6f81ebdf7f735a132133d9..6b3f91d52057ed804a61d1e72867bc30c19afbd9 100644
--- a/paddle/fluid/platform/cuda_helper.h
+++ b/paddle/fluid/platform/cuda_helper.h
@@ -17,6 +17,7 @@
 #include <mutex>  // NOLINT
 
 #include "paddle/fluid/platform/dynload/cublas.h"
+#include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/macros.h"
 
 #if CUDA_VERSION < 9000
@@ -26,6 +27,54 @@ enum cublasMath_t { CUBLAS_DEFAULT_MATH = 0 };
 namespace paddle {
 namespace platform {
 
+/*
+ * Summary: Grid stride looping macro in CUDA kernel
+ *
+ *  [ Why need this macro? ]
+ *
+ *    The original looping in CUDA kernel is:
+ *
+ *    `for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+ *        i += blockDim.x * gridDim.x)`
+ *
+ *    This for condition is risky. The value of `blockIdx.x * blockDim.x`
+ *    may be large, such as over 1GB, the first iteration is no problem here,
+ *    but when `i += blockDim.x * gridDim.x` is executed, the value of i
+ *    will greater than INT_MAX and overflow becomes negative value, at
+ *    this time, the cycle condition `i < (n)` is still satisfied, so it
+ *    will cause illegal access to cuda memory.
+ *
+ *    Here is a real example in ERINE, it will trigger above error.
+ *    The related data are:
+ *      - blockIdx.x = 2172938
+ *      - blockDim.x = 512
+ *      - blockIdx.x * blockDim.x = 1112543864
+ *      - INT_MAX = 2147483647
+ *
+ *    So we polish the for condition as follow, the int64_t __index__ will
+ *    prevent overflow in the loop increment.
+ *
+ * Parameters:
+ *    - i: loop index
+ *    - num: total element numbers
+ *
+ * Examples:
+ *    template <typename T>
+ *    __global__ void Scale(T* logit_grad, const T* loss_grad, const int num,
+ *                      const int d, const int remain) {
+ *    CUDA_KERNEL_LOOP(index, num) {
+ *      int idx_n = index / d;
+ *      int idx_remain = index % remain;
+ *      logit_grad[index] *= loss_grad[idx_n * remain + idx_remain];
+ *      }
+ *    }
+ *
+*/
+#define CUDA_KERNEL_LOOP(i, num)                             \
+  int64_t __index__ = blockIdx.x * blockDim.x + threadIdx.x; \
+  for (int i = __index__; __index__ < (num);                 \
+       __index__ += blockDim.x * gridDim.x, i = __index__)
+
 class CublasHandleHolder {
  public:
   CublasHandleHolder(cudaStream_t stream, cublasMath_t math_type) {
diff --git a/paddle/fluid/platform/cuda_helper_test.cu b/paddle/fluid/platform/cuda_helper_test.cu
index 9e3025bf30b8849472e33a71228eb16814157b21..044f4d6748e3ad72c097c317784fa2b6b9775bcd 100644
--- a/paddle/fluid/platform/cuda_helper_test.cu
+++ b/paddle/fluid/platform/cuda_helper_test.cu
@@ -25,13 +25,14 @@
 #include "paddle/fluid/platform/cuda_primitives.h"
 #include "paddle/fluid/platform/float16.h"
 
+#include "paddle/fluid/platform/cuda_helper.h"
+
 using paddle::platform::PADDLE_CUDA_NUM_THREADS;
 using paddle::platform::float16;
 
 template <typename T>
 __global__ void AddKernel(const T* data_a, T* data_b, size_t num) {
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
-       i += blockDim.x * gridDim.x) {
+  CUDA_KERNEL_LOOP(i, num) {
     paddle::platform::CudaAtomicAdd(&data_b[i], data_a[i]);
   }
 }
@@ -191,10 +192,7 @@ __forceinline__ __device__ T BlockReduce(T val) {
 template <typename T>
 __global__ void DeviceReduceSum(T* in, T* out, size_t N) {
   T sum(0);
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
-       i += blockDim.x * gridDim.x) {
-    sum += in[i];
-  }
+  CUDA_KERNEL_LOOP(i, N) { sum += in[i]; }
   sum = BlockReduce<T>(sum);
   __syncthreads();
   if (threadIdx.x == 0) out[blockIdx.x] = sum;
diff --git a/paddle/fluid/platform/cuda_profiler.h b/paddle/fluid/platform/cuda_profiler.h
index 41d7c121469edd24c67b4288793cb95159fd4b62..957bdf1e698d0aedb86c5b0cb732ab545c260bcc 100644
--- a/paddle/fluid/platform/cuda_profiler.h
+++ b/paddle/fluid/platform/cuda_profiler.h
@@ -26,13 +26,13 @@ void CudaProfilerInit(std::string output_file, std::string output_mode,
                       std::string config_file) {
   PADDLE_ENFORCE(output_mode == "kvp" || output_mode == "csv");
   cudaOutputMode_t mode = output_mode == "csv" ? cudaCSV : cudaKeyValuePair;
-  PADDLE_ENFORCE(
+  PADDLE_ENFORCE_CUDA_SUCCESS(
       cudaProfilerInitialize(config_file.c_str(), output_file.c_str(), mode));
 }
 
-void CudaProfilerStart() { PADDLE_ENFORCE(cudaProfilerStart()); }
+void CudaProfilerStart() { PADDLE_ENFORCE_CUDA_SUCCESS(cudaProfilerStart()); }
 
-void CudaProfilerStop() { PADDLE_ENFORCE(cudaProfilerStop()); }
+void CudaProfilerStop() { PADDLE_ENFORCE_CUDA_SUCCESS(cudaProfilerStop()); }
 
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/cuda_resource_pool.cc b/paddle/fluid/platform/cuda_resource_pool.cc
index 65c8b96028aceef09c4deff6cee92c3d970a659f..6ecb312d72072c7904430e52acb77944abd04417 100644
--- a/paddle/fluid/platform/cuda_resource_pool.cc
+++ b/paddle/fluid/platform/cuda_resource_pool.cc
@@ -50,11 +50,11 @@ std::shared_ptr<CudaStreamObject> CudaStreamResourcePool::New(int dev_idx) {
   PADDLE_ENFORCE_GE(
       dev_idx, 0,
       platform::errors::InvalidArgument(
-          "dev_idx should be not less than 0, but got %d", dev_idx));
+          "The dev_idx should be not less than 0, but got %d.", dev_idx));
   PADDLE_ENFORCE_LT(
       dev_idx, pool_.size(),
       platform::errors::OutOfRange(
-          "dev_idx should be less than device count %d, but got %d",
+          "The dev_idx should be less than device count %d, but got %d.",
           pool_.size(), dev_idx));
   return pool_[dev_idx]->New();
 }
@@ -89,11 +89,11 @@ std::shared_ptr<CudaEventObject> CudaEventResourcePool::New(int dev_idx) {
   PADDLE_ENFORCE_GE(
       dev_idx, 0,
       platform::errors::InvalidArgument(
-          "dev_idx should be not less than 0, but got %d", dev_idx));
+          "The dev_idx should be not less than 0, but got %d.", dev_idx));
   PADDLE_ENFORCE_LT(
       dev_idx, pool_.size(),
       platform::errors::OutOfRange(
-          "dev_idx should be less than device count %d, but got %d",
+          "The dev_idx should be less than device count %d, but got %d.",
           pool_.size(), dev_idx));
   return pool_[dev_idx]->New();
 }
diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h
index 2947cd016ef32eae1be62fab6797e962b261d983..efb57e12fdbe650e74101355da73be929f072be7 100644
--- a/paddle/fluid/platform/cudnn_helper.h
+++ b/paddle/fluid/platform/cudnn_helper.h
@@ -103,7 +103,8 @@ inline cudnnPoolingMode_t GetPoolingMode(const PoolingMode& mode) {
     case PoolingMode::kMaximum:
       return CUDNN_POOLING_MAX;
     default:
-      PADDLE_THROW("Unexpected pooling mode.");
+      PADDLE_THROW(
+          platform::errors::Unimplemented("Unexpected CUDNN pooling mode."));
   }
 }
 #else
@@ -119,7 +120,8 @@ inline cudnnPoolingMode_t GetPoolingMode(const PoolingMode& mode) {
     case PoolingMode::kMaximum:
       return CUDNN_POOLING_MAX;
     default:
-      PADDLE_THROW("Unexpected pooling mode.");
+      PADDLE_THROW(
+          platform::errors::Unimplemented("Unexpected CUDNN pooling mode."));
   }
 }
 #endif  // CUDNN_VERSION < 6000
@@ -140,7 +142,8 @@ inline ActivationMode StringToActivationMode(const std::string& str) {
   } else if (str == "bandpass") {
     return ActivationMode::kBandPass;
   } else {
-    PADDLE_THROW("Unknown activation string: %s", str);
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Unknown CUDNN activation string: %s.", str));
   }
 }
 
@@ -208,7 +211,8 @@ inline cudnnTensorFormat_t GetCudnnTensorFormat(
     case DataLayout::kNDHWC:
       return CUDNN_TENSOR_NHWC;  // add, liyamei
     default:
-      PADDLE_THROW("Unknown cudnn equivalent for order");
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "CUDNN has no equivalent dataLayout for input order."));
   }
   return CUDNN_TENSOR_NCHW;
 }
@@ -329,18 +333,28 @@ class ScopedConvolutionDescriptor {
   inline cudnnConvolutionDescriptor_t descriptor(
       cudnnDataType_t type, const std::vector<int>& pads,
       const std::vector<int>& strides, const std::vector<int>& dilations) {
-    PADDLE_ENFORCE_EQ(pads.size(), strides.size());
-    PADDLE_ENFORCE_EQ(pads.size(), dilations.size());
+    PADDLE_ENFORCE_EQ(pads.size(), strides.size(),
+                      platform::errors::InvalidArgument(
+                          "The size of pads and strides should be equal. But "
+                          "received size of pads is %d, size of strides is %d.",
+                          pads.size(), strides.size()));
+    PADDLE_ENFORCE_EQ(
+        pads.size(), dilations.size(),
+        platform::errors::InvalidArgument(
+            "The size of pads and dilations should be equal. But received size "
+            "of pads is %d, size of dilations is %d.",
+            pads.size(), dilations.size()));
 
 #if !CUDNN_VERSION_MIN(6, 0, 0)
     // cudnn v5 does not support dilation conv, the argument is called upscale
     // instead of dilations and it is must be one.
     for (size_t i = 0; i < dilations.size(); ++i) {
-      PADDLE_ENFORCE_EQ(
-          dilations[i], 1,
-          "Dilations conv is not supported in this cuDNN version(%d.%d.%d).",
-          CUDNN_VERSION / 1000, CUDNN_VERSION % 1000 / 100,
-          CUDNN_VERSION % 100);
+      PADDLE_ENFORCE_EQ(dilations[i], 1,
+                        platform::errors::InvalidArgument(
+                            "Dilations conv is not supported in this cuDNN "
+                            "version(%d.%d.%d).",
+                            CUDNN_VERSION / 1000, CUDNN_VERSION % 1000 / 100,
+                            CUDNN_VERSION % 100));
     }
 #endif
 
@@ -377,8 +391,17 @@ class ScopedPoolingDescriptor {
                                              const std::vector<int>& kernel,
                                              const std::vector<int>& pads,
                                              const std::vector<int>& strides) {
-    PADDLE_ENFORCE_EQ(kernel.size(), pads.size());
-    PADDLE_ENFORCE_EQ(kernel.size(), strides.size());
+    PADDLE_ENFORCE_EQ(kernel.size(), pads.size(),
+                      platform::errors::InvalidArgument(
+                          "The size of kernel and pads should be equal. But "
+                          "received size of kernel is %d, size of pads is %d.",
+                          kernel.size(), pads.size()));
+    PADDLE_ENFORCE_EQ(
+        kernel.size(), strides.size(),
+        platform::errors::InvalidArgument(
+            "The size of kernel and strides should be equal. But "
+            "received size of kernel is %d, size of strides is %d.",
+            kernel.size(), strides.size()));
     PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetPoolingNdDescriptor(
         desc_, (GetPoolingMode(mode)),
         CUDNN_PROPAGATE_NAN,  // Always propagate nans.
@@ -456,8 +479,9 @@ class ScopedActivationDescriptor {
         mode = CUDNN_ACTIVATION_TANH;
         break;
       default:
-        PADDLE_THROW("unrecognized activation mode: %d .",
-                     static_cast<int>(activation_mode));
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "Unrecognized CUDNN activation mode: %d.",
+            static_cast<int>(activation_mode)));
     }
     PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetActivationDescriptor(
         desc_, mode, CUDNN_NOT_PROPAGATE_NAN, relu_ceiling));
diff --git a/paddle/fluid/platform/device_code.cc b/paddle/fluid/platform/device_code.cc
index e8b2d5d4ed12d516776eace322d0aff6cb381d71..9d5a0954b00b1755a86cbd5d654b9a06edff4879 100644
--- a/paddle/fluid/platform/device_code.cc
+++ b/paddle/fluid/platform/device_code.cc
@@ -60,10 +60,10 @@ platform::DeviceCode* DeviceCodePool::Get(const platform::Place& place,
 }
 
 DeviceCodePool::DeviceCodePool(const std::vector<platform::Place>& places) {
-  PADDLE_ENFORCE_GT(
-      places.size(), 0,
-      errors::InvalidArgument(
-          "Expected the number of places >= 1. Expected %d.", places.size()));
+  PADDLE_ENFORCE_GT(places.size(), 0,
+                    errors::InvalidArgument(
+                        "Expected the number of places >= 1. But received %d.",
+                        places.size()));
   // Remove the duplicated places
   std::set<Place> set;
   for (auto& p : places) {
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 6784da0ebcc493447b95c757d61503a9128ae4a4..38b0894c3f71dc150a9ed737b0ac17b22baffb8a 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -59,12 +59,11 @@ DeviceContextPool* DeviceContextPool::pool = nullptr;
 platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) {
   auto it = device_contexts_.find(place);
   if (it == device_contexts_.end()) {
-    PADDLE_THROW(
-        "Place %s is not supported, Please check that your paddle compiles "
-        "with WITH_GPU "
-        "option or check that your train process hold the correct gpu_id if "
-        "you use Executor",
-        place);
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Place %s is not supported. Please check that your paddle compiles "
+        "with WITH_GPU option or check that your train process hold the "
+        "correct gpu_id if you use Executor.",
+        place));
   }
   return it->second.get().get();
 }
@@ -84,7 +83,11 @@ inline void EmplaceDeviceContext(
 
 DeviceContextPool::DeviceContextPool(
     const std::vector<platform::Place>& places) {
-  PADDLE_ENFORCE_GT(places.size(), 0);
+  PADDLE_ENFORCE_GT(
+      places.size(), 0,
+      platform::errors::InvalidArgument("The number of platform places should "
+                                        "be larger than 0. But received %d.",
+                                        places.size()));
   std::set<Place> set;
   for (auto& p : places) {
     set.insert(p);
@@ -101,17 +104,17 @@ DeviceContextPool::DeviceContextPool(
       EmplaceDeviceContext<CUDADeviceContext, CUDAPlace>(&device_contexts_, p);
 #else
       PADDLE_THROW(
-          "'CUDAPlace' is not supported, Please re-compile with WITH_GPU "
-          "option");
+          platform::errors::Unimplemented("CUDAPlace is not supported. Please "
+                                          "re-compile with WITH_GPU option."));
 #endif
     } else if (platform::is_cuda_pinned_place(p)) {
 #ifdef PADDLE_WITH_CUDA
       EmplaceDeviceContext<CUDAPinnedDeviceContext, CUDAPinnedPlace>(
           &device_contexts_, p);
 #else
-      PADDLE_THROW(
-          "'CUDAPlace' is not supported, Please re-compile with WITH_GPU "
-          "option");
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "CUDAPlace is not supported. Please re-compile with WITH_GPU "
+          "option."));
 #endif
     }
   }
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index 9393ea3e332cb9cc9723a83693725c4c7ed4707c..7511edb9ccf2c6ca1d5aea2964799b8be08064b6 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -515,7 +515,9 @@ class DeviceContextPool {
   explicit DeviceContextPool(const std::vector<platform::Place>& places);
 
   static DeviceContextPool& Instance() {
-    PADDLE_ENFORCE_NOT_NULL(pool, "Need to Create DeviceContextPool first!");
+    PADDLE_ENFORCE_NOT_NULL(pool,
+                            platform::errors::PreconditionNotMet(
+                                "Need to Create DeviceContextPool firstly!"));
     return *pool;
   }
 
diff --git a/paddle/fluid/platform/device_memory_aligment.cc b/paddle/fluid/platform/device_memory_aligment.cc
index 7b901856daa7d899074ba8659ea6cf7f36b89f01..8b57de934990809ad62ce99055c7466ab474c9a6 100644
--- a/paddle/fluid/platform/device_memory_aligment.cc
+++ b/paddle/fluid/platform/device_memory_aligment.cc
@@ -24,7 +24,8 @@ size_t Alignment(size_t size, const platform::Place &place) {
 #ifdef PADDLE_WITH_CUDA
     alignment = GpuMinChunkSize();
 #else
-    PADDLE_THROW("Fluid is not compiled with CUDA");
+    PADDLE_THROW(platform::errors::PreconditionNotMet(
+        "Fluid is not compiled with CUDA."));
 #endif
   }
   size_t remaining = size % alignment;
diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc
index d362b841065c41a0b26cedb566fa6d66572dbbbc..ec934c3b980c3d53ae5581a1317a2610218f2ebf 100644
--- a/paddle/fluid/platform/device_tracer.cc
+++ b/paddle/fluid/platform/device_tracer.cc
@@ -177,8 +177,10 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
   static std::thread::id cupti_thread_id(0);
   if (cupti_thread_id == std::thread::id(0))
     cupti_thread_id = std::this_thread::get_id();
-  PADDLE_ENFORCE_EQ(std::this_thread::get_id(), cupti_thread_id,
-                    "Only one thread is allowed to call bufferCompleted()");
+  PADDLE_ENFORCE_EQ(
+      std::this_thread::get_id(), cupti_thread_id,
+      platform::errors::PermissionDenied(
+          "Only one thread is allowed to call bufferCompleted()."));
   CUptiResult status;
   CUpti_Activity *record = NULL;
   if (validSize > 0) {
@@ -573,7 +575,8 @@ class DeviceTracerImpl : public DeviceTracer {
         } else if (platform::is_cuda_pinned_place(r.place)) {
           event->set_place(proto::MemEvent::CUDAPinnedPlace);
         } else {
-          PADDLE_THROW("The current place is not supported.");
+          PADDLE_THROW(platform::errors::Unimplemented(
+              "The current place is not supported."));
         }
         event->set_alloc_in(r.alloc_in);
         event->set_free_in(r.free_in);
diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h
index 937e200924b71dc8b8c726741e35964ebc65236e..7e32720c1d733411178c102d5c4500f722e7d005 100644
--- a/paddle/fluid/platform/dynload/cublas.h
+++ b/paddle/fluid/platform/dynload/cublas.h
@@ -36,7 +36,6 @@ extern void *cublas_dso_handle;
  *
  * note: default dynamic linked libs
  */
-#ifdef PADDLE_USE_DSO
 #define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name)                             \
   struct DynLoad__##__name {                                                 \
     using FUNC_TYPE = decltype(&::__name);                                   \
@@ -50,16 +49,6 @@ extern void *cublas_dso_handle;
     }                                                                        \
   };                                                                         \
   extern DynLoad__##__name __name
-#else
-#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name)     \
-  struct DynLoad__##__name {                         \
-    template <typename... Args>                      \
-    inline cublasStatus_t operator()(Args... args) { \
-      return ::__name(args...);                      \
-    }                                                \
-  };                                                 \
-  extern DynLoad__##__name __name
-#endif
 
 #define CUBLAS_BLAS_ROUTINE_EACH(__macro) \
   __macro(cublasSaxpy_v2);                \
diff --git a/paddle/fluid/platform/dynload/cuda_driver.cc b/paddle/fluid/platform/dynload/cuda_driver.cc
index 017e887bc7da53b6721af42a0a1fcc29b09f2565..89a29bae7f3373da69f8813add2770b6300c2970 100644
--- a/paddle/fluid/platform/dynload/cuda_driver.cc
+++ b/paddle/fluid/platform/dynload/cuda_driver.cc
@@ -25,14 +25,10 @@ void* cuda_dso_handle = nullptr;
 
 CUDA_ROUTINE_EACH(DEFINE_WRAP);
 
-#ifdef PADDLE_USE_DSO
 bool HasCUDADriver() {
   std::call_once(cuda_dso_flag, []() { cuda_dso_handle = GetCUDADsoHandle(); });
   return cuda_dso_handle != nullptr;
 }
-#else
-bool HasCUDADriver() { return false; }
-#endif
 
 }  // namespace dynload
 }  // namespace platform
diff --git a/paddle/fluid/platform/dynload/cuda_driver.h b/paddle/fluid/platform/dynload/cuda_driver.h
index d39aceaa11bca0cce4f160c0a66dbd92d1813acb..056fcc069dba927fec0f04784f9edfd4e44ef3d2 100644
--- a/paddle/fluid/platform/dynload/cuda_driver.h
+++ b/paddle/fluid/platform/dynload/cuda_driver.h
@@ -27,8 +27,6 @@ extern std::once_flag cuda_dso_flag;
 extern void* cuda_dso_handle;
 extern bool HasCUDADriver();
 
-#ifdef PADDLE_USE_DSO
-
 #define DECLARE_DYNAMIC_LOAD_CUDA_WRAP(__name)                           \
   struct DynLoad__##__name {                                             \
     template <typename... Args>                                          \
@@ -43,19 +41,6 @@ extern bool HasCUDADriver();
   };                                                                     \
   extern struct DynLoad__##__name __name
 
-#else
-
-#define DECLARE_DYNAMIC_LOAD_CUDA_WRAP(__name) \
-  struct DynLoad__##__name {                   \
-    template <typename... Args>                \
-    inline auto operator()(Args... args) {     \
-      return ::__name(args...);                \
-    }                                          \
-  };                                           \
-  extern DynLoad__##__name __name
-
-#endif
-
 /**
  * include all needed cuda driver functions
  **/
diff --git a/paddle/fluid/platform/dynload/cudnn.cc b/paddle/fluid/platform/dynload/cudnn.cc
index edff8761ee15f89c95c208edd41a976176fd0ae4..44a03d6f14a3ba07d73cfbc944d8db9601394103 100644
--- a/paddle/fluid/platform/dynload/cudnn.cc
+++ b/paddle/fluid/platform/dynload/cudnn.cc
@@ -50,7 +50,6 @@ CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP);
 CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DEFINE_WRAP);
 #endif
 
-#ifdef PADDLE_USE_DSO
 bool HasCUDNN() {
   std::call_once(cudnn_dso_flag,
                  []() { cudnn_dso_handle = GetCUDNNDsoHandle(); });
@@ -58,13 +57,12 @@ bool HasCUDNN() {
 }
 
 void EnforceCUDNNLoaded(const char* fn_name) {
-  PADDLE_ENFORCE(cudnn_dso_handle != nullptr,
-                 "Cannot load cudnn shared library. Cannot invoke method %s",
-                 fn_name);
+  PADDLE_ENFORCE_NOT_NULL(
+      cudnn_dso_handle,
+      platform::errors::PreconditionNotMet(
+          "Cannot load cudnn shared library. Cannot invoke method %s.",
+          fn_name));
 }
-#else
-bool HasCUDNN() { return true; }
-#endif
 
 }  // namespace dynload
 }  // namespace platform
diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h
index bec5ceb1f47db405a6f968548ab6a0b79506f6bd..96297ec8557bcaac08b105d362dbca2fb0dcd29b 100644
--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
@@ -28,8 +28,6 @@ extern std::once_flag cudnn_dso_flag;
 extern void* cudnn_dso_handle;
 extern bool HasCUDNN();
 
-#ifdef PADDLE_USE_DSO
-
 extern void EnforceCUDNNLoaded(const char* fn_name);
 #define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name)                            \
   struct DynLoad__##__name {                                               \
@@ -46,19 +44,6 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
   };                                                                       \
   extern struct DynLoad__##__name __name
 
-#else
-
-#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \
-  struct DynLoad__##__name {                    \
-    template <typename... Args>                 \
-    inline auto operator()(Args... args) {      \
-      return ::__name(args...);                 \
-    }                                           \
-  };                                            \
-  extern DynLoad__##__name __name
-
-#endif
-
 /**
  * include all needed cudnn functions in HPPL
  * different cudnn version has different interfaces
diff --git a/paddle/fluid/platform/dynload/cupti.h b/paddle/fluid/platform/dynload/cupti.h
index b946f46e82af4b09fafff54765b899254a4ec1df..49bfdce4d38bbdfdcdd3e3ea64684dd4179e6f63 100644
--- a/paddle/fluid/platform/dynload/cupti.h
+++ b/paddle/fluid/platform/dynload/cupti.h
@@ -36,7 +36,6 @@ extern void *cupti_dso_handle;
  *
  * note: default dynamic linked libs
  */
-#ifdef PADDLE_USE_DSO
 #define DECLARE_DYNAMIC_LOAD_CUPTI_WRAP(__name)                            \
   struct DynLoad__##__name {                                               \
     template <typename... Args>                                            \
@@ -50,16 +49,6 @@ extern void *cupti_dso_handle;
     }                                                                      \
   };                                                                       \
   extern DynLoad__##__name __name
-#else
-#define DECLARE_DYNAMIC_LOAD_CUPTI_WRAP(__name)            \
-  struct DynLoad__##__name {                               \
-    template <typename... Args>                            \
-    inline CUptiResult CUPTIAPI operator()(Args... args) { \
-      return __name(args...);                              \
-    }                                                      \
-  };                                                       \
-  extern DynLoad__##__name __name
-#endif
 
 #define CUPTI_ROUTINE_EACH(__macro)           \
   __macro(cuptiActivityEnable);               \
diff --git a/paddle/fluid/platform/dynload/curand.h b/paddle/fluid/platform/dynload/curand.h
index 0bb300ec33076d9ddfaf69190f14131279cc888e..48076e5478a30d86802e10f35bfa7cabfc0deab9 100644
--- a/paddle/fluid/platform/dynload/curand.h
+++ b/paddle/fluid/platform/dynload/curand.h
@@ -25,7 +25,7 @@ namespace platform {
 namespace dynload {
 extern std::once_flag curand_dso_flag;
 extern void *curand_dso_handle;
-#ifdef PADDLE_USE_DSO
+
 #define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name)                             \
   struct DynLoad__##__name {                                                 \
     template <typename... Args>                                              \
@@ -39,16 +39,6 @@ extern void *curand_dso_handle;
     }                                                                        \
   };                                                                         \
   extern DynLoad__##__name __name
-#else
-#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \
-  struct DynLoad__##__name {                     \
-    template <typename... Args>                  \
-    curandStatus_t operator()(Args... args) {    \
-      return ::__name(args...);                  \
-    }                                            \
-  };                                             \
-  extern DynLoad__##__name __name
-#endif
 
 #define CURAND_RAND_ROUTINE_EACH(__macro)      \
   __macro(curandCreateGenerator);              \
diff --git a/paddle/fluid/platform/dynload/cusolver.h b/paddle/fluid/platform/dynload/cusolver.h
index 379bf78d0a716f7b28031f4fffb1903ea6e3fb95..ba2d38729c12e2c5fc978a2768cf36c3af3b7c4c 100644
--- a/paddle/fluid/platform/dynload/cusolver.h
+++ b/paddle/fluid/platform/dynload/cusolver.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 
+#include <cuda.h>
 #include <cusolverDn.h>
 
 #include <mutex>  // NOLINT
@@ -25,7 +26,7 @@ namespace platform {
 namespace dynload {
 extern std::once_flag cusolver_dso_flag;
 extern void *cusolver_dso_handle;
-#ifdef PADDLE_USE_DSO
+
 #define DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP(__name)                   \
   struct DynLoad__##__name {                                         \
     template <typename... Args>                                      \
@@ -40,16 +41,6 @@ extern void *cusolver_dso_handle;
     }                                                                \
   };                                                                 \
   extern DynLoad__##__name __name
-#else
-#define DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP(__name) \
-  struct DynLoad__##__name {                       \
-    template <typename... Args>                    \
-    cusolverStatus_t operator()(Args... args) {    \
-      return ::__name(args...);                    \
-    }                                              \
-  };                                               \
-  extern DynLoad__##__name __name
-#endif
 
 #define CUSOLVER_ROUTINE_EACH(__macro)  \
   __macro(cusolverDnCreate);            \
diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc
index 9acfab19869e63c64aca4463ce3f525d75d3dbc5..b944fead0935b6404045d929fc88c42f7ce0beef 100644
--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <memory>
 #include <mutex>  // NOLINT
 #include <string>
+#include <vector>
 
 #include "gflags/gflags.h"
 #include "glog/logging.h"
@@ -58,6 +59,8 @@ struct PathNode {
 };
 
 static constexpr char cupti_lib_path[] = CUPTI_LIB_PATH;
+
+// NOTE: In order to adapt to the default installation path of cuda on linux
 static constexpr char linux_cudnn_lib_path[] = "/usr/local/cuda/lib64";
 
 static PathNode s_py_site_pkg_path;
@@ -92,14 +95,29 @@ void SetPaddleLibPath(const std::string& py_site_pkg_path) {
   VLOG(3) << "Set paddle lib path : " << py_site_pkg_path;
 }
 
+static inline void* GetDsoHandleFromSpecificPath(const std::string& spec_path,
+                                                 const std::string& dso_name,
+                                                 int dynload_flags) {
+  void* dso_handle = nullptr;
+  if (!spec_path.empty()) {
+    // search xxx.so from custom path
+    VLOG(3) << "Try to find library: " << dso_name
+            << " from specific path: " << spec_path;
+    std::string dso_path = join(spec_path, dso_name);
+    dso_handle = dlopen(dso_path.c_str(), dynload_flags);
+  }
+  return dso_handle;
+}
+
 static inline void* GetDsoHandleFromDefaultPath(const std::string& dso_path,
                                                 int dynload_flags) {
-  VLOG(3) << "Try to find library: " << dso_path
-          << " from default system path.";
   // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH
   // and /usr/local/lib path
   void* dso_handle = dlopen(dso_path.c_str(), dynload_flags);
+  VLOG(3) << "Try to find library: " << dso_path
+          << " from default system path.";
 
+// TODO(chenweihang): This path is used to search which libs?
 // DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 to
 // bring System Integrity Projection (SIP), if dso_handle
 // is null, search from default package path in Mac OS.
@@ -107,80 +125,72 @@ static inline void* GetDsoHandleFromDefaultPath(const std::string& dso_path,
   if (nullptr == dso_handle) {
     dso_handle =
         dlopen(join("/usr/local/cuda/lib/", dso_path).c_str(), dynload_flags);
-    if (nullptr == dso_handle) {
-      if (dso_path == "libcudnn.dylib") {
-        LOG(WARNING) << "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n "
-                        "For instance, sudo tar -xzf "
-                        "cudnn-7.5-osx-x64-v5.0-ga.tgz -C /usr/local \n sudo "
-                        "chmod a+r /usr/local/cuda/include/cudnn.h "
-                        "/usr/local/cuda/lib/libcudnn*";
-      }
-    }
   }
 #endif
 
-  if (nullptr == dso_handle) {
-    LOG(WARNING) << "Can not find library: " << dso_path
-                 << ". The process maybe hang. Please try to add the lib path "
-                    "to LD_LIBRARY_PATH.";
-  }
   return dso_handle;
 }
 
-static inline void* GetDsoHandleFromSearchPath(const std::string& search_root,
-                                               const std::string& dso_name,
-                                               bool throw_on_error = true) {
+/*
+ * We define three priorities for dynamic library search:
+ *
+ * First: Search for the path specified by the user
+ * Second: Search the system default path
+ * Third: Search for a special path corresponding to
+ *        a specific library to adapt to changes and easy to expand.
+ */
+
+static inline void* GetDsoHandleFromSearchPath(
+    const std::string& config_path, const std::string& dso_name,
+    bool throw_on_error = true,
+    const std::vector<std::string>& extra_paths = std::vector<std::string>(),
+    const std::string& warning_msg = std::string()) {
 #if !defined(_WIN32)
   int dynload_flags = RTLD_LAZY | RTLD_LOCAL;
 #else
   int dynload_flags = 0;
 #endif  // !_WIN32
-  void* dso_handle = nullptr;
+  // 1. search in user config path by FLAGS
+  void* dso_handle =
+      GetDsoHandleFromSpecificPath(config_path, dso_name, dynload_flags);
+  // 2. search in system default path
+  if (nullptr == dso_handle) {
+    dso_handle = GetDsoHandleFromDefaultPath(dso_name, dynload_flags);
+  }
+  // 3. search in extra paths
+  if (nullptr == dso_handle) {
+    for (auto path : extra_paths) {
+      dso_handle = GetDsoHandleFromSpecificPath(path, dso_name, dynload_flags);
+    }
+  }
 
-  std::string dlPath = dso_name;
-  if (search_root.empty()) {
-    dso_handle = GetDsoHandleFromDefaultPath(dlPath, dynload_flags);
-  } else {
-    // search xxx.so from custom path
-    dlPath = join(search_root, dso_name);
-    dso_handle = dlopen(dlPath.c_str(), dynload_flags);
+  // 4. [If Failed] logging warning if exists
+  if (nullptr == dso_handle && !warning_msg.empty()) {
+    LOG(WARNING) << warning_msg;
+  }
+
+  // 5. [If Failed] logging or throw error info
+  if (nullptr == dso_handle) {
+    auto error_msg =
+        "Failed to find dynamic library: %s ( %s ) \n"
+        "Please specify its path correctly using following ways: \n"
+        "  set environment variable LD_LIBRARY_PATH on Linux or "
+        "DYLD_LIBRARY_PATH on Mac OS. \n"
+        "  For instance, issue command: export LD_LIBRARY_PATH=... \n"
+        "  Note: After Mac OS 10.11, using the DYLD_LIBRARY_PATH is "
+        "impossible unless System Integrity Protection (SIP) is disabled.";
 #if !defined(_WIN32)
     auto errorno = dlerror();
 #else
     auto errorno = GetLastError();
 #endif  // !_WIN32
-    // if not found, search from default path
-    if (nullptr == dso_handle) {
-      LOG(WARNING) << "Failed to find dynamic library: " << dlPath << " ("
-                   << errorno << ")";
-      if (dlPath.find("nccl") != std::string::npos) {
-        std::cout
-            << "You may need to install 'nccl2' from NVIDIA official website: "
-            << "https://developer.nvidia.com/nccl/nccl-download"
-            << "before install PaddlePaddle" << std::endl;
-      }
-      dlPath = dso_name;
-      dso_handle = GetDsoHandleFromDefaultPath(dlPath, dynload_flags);
+    if (throw_on_error) {
+      // NOTE: Special error report case, no need to change its format
+      PADDLE_THROW(platform::errors::NotFound(error_msg, dso_name, errorno));
+    } else {
+      LOG(WARNING) << string::Sprintf(error_msg, dso_name, errorno);
     }
   }
-  auto error_msg =
-      "Failed to find dynamic library: %s ( %s ) \n Please specify "
-      "its path correctly using following ways: \n   set "
-      "environment variable LD_LIBRARY_PATH on Linux or "
-      "DYLD_LIBRARY_PATH on Mac OS. \n   For instance, issue command: "
-      "export LD_LIBRARY_PATH=... \n   Note: After Mac OS 10.11, "
-      "using the DYLD_LIBRARY_PATH is impossible unless System "
-      "Integrity Protection (SIP) is disabled.";
-#if !defined(_WIN32)
-  auto errorno = dlerror();
-#else
-  auto errorno = GetLastError();
-#endif  // !_WIN32
-  if (throw_on_error) {
-    PADDLE_ENFORCE(nullptr != dso_handle, error_msg, dlPath, errorno);
-  } else if (nullptr == dso_handle) {
-    LOG(WARNING) << string::Sprintf(error_msg, dlPath, errorno);
-  }
 
   return dso_handle;
 }
@@ -197,27 +207,29 @@ void* GetCublasDsoHandle() {
 
 void* GetCUDNNDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
-  return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", false);
+  std::string mac_warn_meg(
+      "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n "
+      "For instance, sudo tar -xzf "
+      "cudnn-7.5-osx-x64-v5.0-ga.tgz -C /usr/local \n sudo "
+      "chmod a+r /usr/local/cuda/include/cudnn.h "
+      "/usr/local/cuda/lib/libcudnn*");
+  return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", false,
+                                    {}, mac_warn_meg);
 #elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
   return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, win_cudnn_lib);
 #else
-  std::string linux_cudnn_path = linux_cudnn_lib_path;
-  if (!FLAGS_cudnn_dir.empty()) {
-    linux_cudnn_path = FLAGS_cudnn_dir;
-  }
-  return GetDsoHandleFromSearchPath(linux_cudnn_path, "libcudnn.so", false);
+  return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", false,
+                                    {linux_cudnn_lib_path});
 #endif
 }
 
 void* GetCUPTIDsoHandle() {
-  std::string cupti_path = cupti_lib_path;
-  if (!FLAGS_cupti_dir.empty()) {
-    cupti_path = FLAGS_cupti_dir;
-  }
 #if defined(__APPLE__) || defined(__OSX__)
-  return GetDsoHandleFromSearchPath(cupti_path, "libcupti.dylib", false);
+  return GetDsoHandleFromSearchPath(FLAGS_cupti_dir, "libcupti.dylib", false,
+                                    {cupti_lib_path});
 #else
-  return GetDsoHandleFromSearchPath(cupti_path, "libcupti.so", false);
+  return GetDsoHandleFromSearchPath(FLAGS_cupti_dir, "libcupti.so", false,
+                                    {cupti_lib_path});
 #endif
 }
 
@@ -272,10 +284,16 @@ void* GetWarpCTCDsoHandle() {
 }
 
 void* GetNCCLDsoHandle() {
+  std::string warning_msg(
+      "You may need to install 'nccl2' from NVIDIA official website: "
+      "https://developer.nvidia.com/nccl/nccl-download"
+      "before install PaddlePaddle.");
 #if defined(__APPLE__) || defined(__OSX__)
-  return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.dylib");
+  return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.dylib", true, {},
+                                    warning_msg);
 #else
-  return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.so");
+  return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.so", true, {},
+                                    warning_msg);
 #endif
 }
 
@@ -301,9 +319,11 @@ void* GetMKLMLDsoHandle() {
 
 void* GetOpDsoHandle(const std::string& dso_name) {
 #if defined(__APPLE__) || defined(__OSX__)
-  PADDLE_THROW("Do not support Apple.");
+  PADDLE_THROW(platform::errors::Unimplemented(
+      "Create custom cpp op outside framework do not support Apple."));
 #elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
-  PADDLE_THROW("Do not support Windows.");
+  PADDLE_THROW(platform::errors::Unimplemented(
+      "Create custom cpp op outside framework do not support Windows."));
 #else
   return GetDsoHandleFromSearchPath(FLAGS_op_dir, dso_name);
 #endif
diff --git a/paddle/fluid/platform/dynload/nccl.h b/paddle/fluid/platform/dynload/nccl.h
index b3cd582394cf3f8df03dc4a6e262ee14eff042ad..1d5fa45ecf684597be5a3d5234456a871221d329 100644
--- a/paddle/fluid/platform/dynload/nccl.h
+++ b/paddle/fluid/platform/dynload/nccl.h
@@ -26,8 +26,6 @@ namespace dynload {
 extern std::once_flag nccl_dso_flag;
 extern void* nccl_dso_handle;
 
-#ifdef PADDLE_USE_DSO
-
 #define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name)                           \
   struct DynLoad__##__name {                                             \
     template <typename... Args>                                          \
@@ -41,16 +39,6 @@ extern void* nccl_dso_handle;
     }                                                                    \
   };                                                                     \
   extern DynLoad__##__name __name
-#else
-#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \
-  struct DynLoad__##__name {                   \
-    template <typename... Args>                \
-    ncclResult_t operator()(Args... args) {    \
-      return __name(args...);                  \
-    }                                          \
-  };                                           \
-  extern DynLoad__##__name __name
-#endif
 
 #define NCCL_RAND_ROUTINE_EACH(__macro) \
   __macro(ncclCommInitAll);             \
diff --git a/paddle/fluid/platform/dynload/nvrtc.cc b/paddle/fluid/platform/dynload/nvrtc.cc
index f95d4b6ab521d22aeadca1214e265761c0652d33..74dfa5b3c22f8e846ff46b8baa2a66b6e4b8df8a 100644
--- a/paddle/fluid/platform/dynload/nvrtc.cc
+++ b/paddle/fluid/platform/dynload/nvrtc.cc
@@ -25,15 +25,11 @@ void* nvrtc_dso_handle = nullptr;
 
 NVRTC_ROUTINE_EACH(DEFINE_WRAP);
 
-#ifdef PADDLE_USE_DSO
 bool HasNVRTC() {
   std::call_once(nvrtc_dso_flag,
                  []() { nvrtc_dso_handle = GetNVRTCDsoHandle(); });
   return nvrtc_dso_handle != nullptr;
 }
-#else
-bool HasNVRTC() { return false; }
-#endif
 
 }  // namespace dynload
 }  // namespace platform
diff --git a/paddle/fluid/platform/dynload/nvrtc.h b/paddle/fluid/platform/dynload/nvrtc.h
index 08f81d4ea85610238aa0053c0008c0a03e2361d6..9464a23ba1ef9f0b24c4ef727c2de8176149f166 100644
--- a/paddle/fluid/platform/dynload/nvrtc.h
+++ b/paddle/fluid/platform/dynload/nvrtc.h
@@ -27,8 +27,6 @@ extern std::once_flag nvrtc_dso_flag;
 extern void* nvrtc_dso_handle;
 extern bool HasNVRTC();
 
-#ifdef PADDLE_USE_DSO
-
 #define DECLARE_DYNAMIC_LOAD_NVRTC_WRAP(__name)                            \
   struct DynLoad__##__name {                                               \
     template <typename... Args>                                            \
@@ -43,19 +41,6 @@ extern bool HasNVRTC();
   };                                                                       \
   extern struct DynLoad__##__name __name
 
-#else
-
-#define DECLARE_DYNAMIC_LOAD_NVRTC_WRAP(__name) \
-  struct DynLoad__##__name {                    \
-    template <typename... Args>                 \
-    inline auto operator()(Args... args) {      \
-      return ::__name(args...);                 \
-    }                                           \
-  };                                            \
-  extern DynLoad__##__name __name
-
-#endif
-
 /**
  * include all needed nvrtc functions
  **/
diff --git a/paddle/fluid/platform/dynload/tensorrt.h b/paddle/fluid/platform/dynload/tensorrt.h
index 4c7ba0f054cfc80702eb4fb4127d7008f6e49c02..566f887014b94d54059d6bd9842db791989d43a6 100644
--- a/paddle/fluid/platform/dynload/tensorrt.h
+++ b/paddle/fluid/platform/dynload/tensorrt.h
@@ -30,35 +30,26 @@ namespace dynload {
 extern std::once_flag tensorrt_dso_flag;
 extern void* tensorrt_dso_handle;
 
-#ifdef PADDLE_USE_DSO
-
-#define DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP(__name)                      \
-  struct DynLoad__##__name {                                            \
-    template <typename... Args>                                         \
-    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {    \
-      using tensorrt_func = decltype(&::__name);                        \
-      std::call_once(tensorrt_dso_flag, []() {                          \
-        tensorrt_dso_handle =                                           \
-            paddle::platform::dynload::GetTensorRtDsoHandle();          \
-        PADDLE_ENFORCE(tensorrt_dso_handle, "load tensorrt so failed"); \
-      });                                                               \
-      static void* p_##__name = dlsym(tensorrt_dso_handle, #__name);    \
-      PADDLE_ENFORCE(p_##__name, "load %s failed", #__name);            \
-      return reinterpret_cast<tensorrt_func>(p_##__name)(args...);      \
-    }                                                                   \
-  };                                                                    \
-  extern DynLoad__##__name __name
-
-#else
-#define DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP(__name) \
-  struct DynLoad__##__name {                       \
-    template <typename... Args>                    \
-    tensorrtResult_t operator()(Args... args) {    \
-      return __name(args...);                      \
-    }                                              \
-  };                                               \
+#define DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP(__name)                            \
+  struct DynLoad__##__name {                                                  \
+    template <typename... Args>                                               \
+    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {          \
+      using tensorrt_func = decltype(&::__name);                              \
+      std::call_once(tensorrt_dso_flag, []() {                                \
+        tensorrt_dso_handle =                                                 \
+            paddle::platform::dynload::GetTensorRtDsoHandle();                \
+        PADDLE_ENFORCE_NOT_NULL(tensorrt_dso_handle,                          \
+                                platform::errors::Unavailable(                \
+                                    "Load tensorrt %s failed", #__name));     \
+      });                                                                     \
+      static void* p_##__name = dlsym(tensorrt_dso_handle, #__name);          \
+      PADDLE_ENFORCE_NOT_NULL(                                                \
+          p_##__name,                                                         \
+          platform::errors::Unavailable("Load tensorrt %s failed", #__name)); \
+      return reinterpret_cast<tensorrt_func>(p_##__name)(args...);            \
+    }                                                                         \
+  };                                                                          \
   extern DynLoad__##__name __name
-#endif
 
 #define TENSORRT_RAND_ROUTINE_EACH(__macro) \
   __macro(createInferBuilder_INTERNAL);     \
diff --git a/paddle/fluid/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc
index 7a55c51c84448eaf0be1fdf270fa27c0e0dc1361..f086c3f8232e9643c4eb16a97d2cfa3f3e9fe666 100644
--- a/paddle/fluid/platform/enforce_test.cc
+++ b/paddle/fluid/platform/enforce_test.cc
@@ -20,40 +20,46 @@ limitations under the License. */
 #include "paddle/fluid/platform/enforce.h"
 
 TEST(ENFORCE, OK) {
-  PADDLE_ENFORCE(true, "Enforce is ok %d now %f", 123, 0.345);
+  PADDLE_ENFORCE(true, paddle::platform::errors::Unavailable(
+                           "PADDLE_ENFORCE is ok %d now %f.", 123, 0.345));
   size_t val = 1;
   const size_t limit = 10;
-  PADDLE_ENFORCE(val < limit, "Enforce is OK too");
+  PADDLE_ENFORCE(val < limit, paddle::platform::errors::Unavailable(
+                                  "PADDLE_ENFORCE tests failed."));
 }
 
 TEST(ENFORCE, FAILED) {
   bool caught_exception = false;
   try {
-    PADDLE_ENFORCE(false, "Enforce is not ok %d at all", 123);
+    PADDLE_ENFORCE(false, paddle::platform::errors::Unavailable(
+                              "PADDLE_ENFORCE won't work %d at all.", 123));
   } catch (paddle::platform::EnforceNotMet& error) {
     caught_exception = true;
     std::string ex_msg = error.what();
-    EXPECT_TRUE(ex_msg.find("Enforce is not ok 123 at all") !=
+    EXPECT_TRUE(ex_msg.find("PADDLE_ENFORCE won't work 123 at all.") !=
                 std::string::npos);
   }
   EXPECT_TRUE(caught_exception);
 
   caught_exception = false;
   try {
-    PADDLE_ENFORCE(false, "Enforce is not ok at all");
+    PADDLE_ENFORCE(false, paddle::platform::errors::Unavailable(
+                              "PADDLE_ENFORCE won't work at all."));
   } catch (paddle::platform::EnforceNotMet& error) {
     caught_exception = true;
     std::string ex_msg = error.what();
-    EXPECT_TRUE(ex_msg.find("Enforce is not ok at all") != std::string::npos);
+    EXPECT_TRUE(ex_msg.find("PADDLE_ENFORCE won't work at all.") !=
+                std::string::npos);
   }
   EXPECT_TRUE(caught_exception);
 
   caught_exception = false;
   try {
-    PADDLE_ENFORCE(false);
+    PADDLE_ENFORCE(false, paddle::platform::errors::Unavailable(
+                              "PADDLE_ENFORCE won't work at all."));
   } catch (paddle::platform::EnforceNotMet& error) {
     caught_exception = true;
-    EXPECT_NE(std::string(error.what()).find("  at "), 0UL);
+    EXPECT_NE(std::string(error.what()).find(" at "), 0UL);
   }
   EXPECT_TRUE(caught_exception);
 }
@@ -61,9 +67,11 @@ TEST(ENFORCE, FAILED) {
 TEST(ENFORCE, NO_ARG_OK) {
   int a = 2;
   int b = 2;
-  PADDLE_ENFORCE_EQ(a, b);
+  PADDLE_ENFORCE_EQ(a, b, paddle::platform::errors::Unavailable(
+                              "PADDLE_ENFORCE_EQ tests failed."));
   // test enforce with extra message.
-  PADDLE_ENFORCE_EQ(a, b, "some thing wrong %s", "info");
+  PADDLE_ENFORCE_EQ(a, b, paddle::platform::errors::Unavailable(
+                              "Some %s wrong in PADDLE_ENFORCE_EQ.", "info"));
 }
 
 TEST(ENFORCE_EQ, NO_EXTRA_MSG_FAIL) {
@@ -71,7 +79,7 @@ TEST(ENFORCE_EQ, NO_EXTRA_MSG_FAIL) {
   bool caught_exception = false;
   try {
     PADDLE_ENFORCE_EQ(a, 1 + 3, paddle::platform::errors::InvalidArgument(
-                                    "the result is not equal correct result."));
+                                    "The result is not equal correct result."));
   } catch (paddle::platform::EnforceNotMet& error) {
     caught_exception = true;
     std::string ex_msg = error.what();
@@ -86,7 +94,7 @@ TEST(ENFORCE_EQ, EXTRA_MSG_FAIL) {
   bool caught_exception = false;
   try {
     PADDLE_ENFORCE_EQ(a, 1 + 3, paddle::platform::errors::InvalidArgument(
-                                    "the result is not equal correct result."));
+                                    "The result is not equal correct result."));
   } catch (paddle::platform::EnforceNotMet& error) {
     caught_exception = true;
     std::string ex_msg = error.what();
@@ -98,15 +106,19 @@ TEST(ENFORCE_EQ, EXTRA_MSG_FAIL) {
 }
 
 TEST(ENFORCE_NE, OK) {
-  PADDLE_ENFORCE_NE(1, 2);
-  PADDLE_ENFORCE_NE(1.0, 2UL);
+  PADDLE_ENFORCE_NE(1, 2, paddle::platform::errors::Unavailable(
+                              "PADDLE_ENFORCE_NE tests failed."));
+  PADDLE_ENFORCE_NE(1.0, 2UL, paddle::platform::errors::Unavailable(
+                                  "PADDLE_ENFORCE_NE tests failed."));
 }
 TEST(ENFORCE_NE, FAIL) {
   bool caught_exception = false;
 
   try {
     // 2UL here to check data type compatible
-    PADDLE_ENFORCE_NE(1.0, 1UL);
+    PADDLE_ENFORCE_NE(1.0, 1UL,
+                      paddle::platform::errors::Unavailable(
+                          "Expected 1.0 != 1UL, but received 1.0:1 == 1UL:1."));
   } catch (paddle::platform::EnforceNotMet& error) {
     caught_exception = true;
     std::string ex_msg = error.what();
@@ -116,11 +128,15 @@ TEST(ENFORCE_NE, FAIL) {
   EXPECT_TRUE(caught_exception);
 }
 
-TEST(ENFORCE_GT, OK) { PADDLE_ENFORCE_GT(2, 1); }
+TEST(ENFORCE_GT, OK) {
+  PADDLE_ENFORCE_GT(2, 1, paddle::platform::errors::Unavailable(
+                              "PADDLE_ENFORCE_GT tests failed."));
+}
 TEST(ENFORCE_GT, FAIL) {
   bool caught_exception = false;
   try {
-    PADDLE_ENFORCE_GT(1, 2);
+    PADDLE_ENFORCE_GT(1, 2, paddle::platform::errors::InvalidArgument(
+                                "Expected 1 > 2, but received 1:1 <= 2:2."));
   } catch (paddle::platform::EnforceNotMet& error) {
     caught_exception = true;
     std::string ex_msg = error.what();
@@ -131,14 +147,18 @@ TEST(ENFORCE_GT, FAIL) {
 }
 
 TEST(ENFORCE_GE, OK) {
-  PADDLE_ENFORCE_GE(2, 2);
-  PADDLE_ENFORCE_GE(3, 2);
-  PADDLE_ENFORCE_GE(3.21, 2.0);
+  PADDLE_ENFORCE_GE(2, 2, paddle::platform::errors::Unavailable(
+                              "PADDLE_ENFORCE_GE tests failed."));
+  PADDLE_ENFORCE_GE(3, 2, paddle::platform::errors::Unavailable(
+                              "PADDLE_ENFORCE_GE tests failed."));
+  PADDLE_ENFORCE_GE(3.21, 2.0, paddle::platform::errors::Unavailable(
+                                   "PADDLE_ENFORCE_GE tests failed."));
 }
 TEST(ENFORCE_GE, FAIL) {
   bool caught_exception = false;
   try {
-    PADDLE_ENFORCE_GE(1, 2);
+    PADDLE_ENFORCE_GE(1, 2, paddle::platform::errors::InvalidArgument(
+                                "Expected 1 >= 2, but received 1:1 < 2:2."));
   } catch (paddle::platform::EnforceNotMet& error) {
     caught_exception = true;
     std::string ex_msg = error.what();
@@ -149,16 +169,22 @@ TEST(ENFORCE_GE, FAIL) {
 }
 
 TEST(ENFORCE_LE, OK) {
-  PADDLE_ENFORCE_LE(1, 1);
-  PADDLE_ENFORCE_LE(1UL, 1UL);
-  PADDLE_ENFORCE_LE(2, 3);
-  PADDLE_ENFORCE_LE(2UL, 3UL);
-  PADDLE_ENFORCE_LE(2.0, 3.2);
+  PADDLE_ENFORCE_LE(1, 1, paddle::platform::errors::Unavailable(
+                              "PADDLE_ENFORCE_LE tests failed."));
+  PADDLE_ENFORCE_LE(1UL, 1UL, paddle::platform::errors::Unavailable(
+                                  "PADDLE_ENFORCE_LE tests failed."));
+  PADDLE_ENFORCE_LE(2, 3, paddle::platform::errors::Unavailable(
+                              "PADDLE_ENFORCE_LE tests failed."));
+  PADDLE_ENFORCE_LE(2UL, 3UL, paddle::platform::errors::Unavailable(
+                                  "PADDLE_ENFORCE_LE tests failed."));
+  PADDLE_ENFORCE_LE(2.0, 3.2, paddle::platform::errors::Unavailable(
+                                  "PADDLE_ENFORCE_LE tests failed."));
 }
 TEST(ENFORCE_LE, FAIL) {
   bool caught_exception = false;
   try {
-    PADDLE_ENFORCE_GT(1, 2);
+    PADDLE_ENFORCE_GT(1, 2, paddle::platform::errors::InvalidArgument(
+                                "Expected 1 > 2, but received 1:1 <= 2:2."));
   } catch (paddle::platform::EnforceNotMet& error) {
     caught_exception = true;
     std::string ex_msg = error.what();
@@ -169,14 +195,20 @@ TEST(ENFORCE_LE, FAIL) {
 }
 
 TEST(ENFORCE_LT, OK) {
-  PADDLE_ENFORCE_LT(3, 10);
-  PADDLE_ENFORCE_LT(2UL, 3UL);
-  PADDLE_ENFORCE_LT(2, 3);
+  PADDLE_ENFORCE_LT(3, 10, paddle::platform::errors::Unavailable(
+                               "PADDLE_ENFORCE_LT tests failed."));
+  PADDLE_ENFORCE_LT(2UL, 3UL, paddle::platform::errors::Unavailable(
+                                  "PADDLE_ENFORCE_LT tests failed."));
+  PADDLE_ENFORCE_LT(2, 3, paddle::platform::errors::Unavailable(
+                              "PADDLE_ENFORCE_LT tests failed."));
 }
 TEST(ENFORCE_LT, FAIL) {
   bool caught_exception = false;
   try {
-    PADDLE_ENFORCE_LT(1UL, 0.12);
+    PADDLE_ENFORCE_LT(
+        1UL, 0.12,
+        paddle::platform::errors::InvalidArgument(
+            "Expected 1UL < 0.12, but received 1UL:1 >= 0.12:0.12."));
   } catch (paddle::platform::EnforceNotMet& error) {
     caught_exception = true;
     std::string ex_msg = error.what();
@@ -189,18 +221,20 @@ TEST(ENFORCE_LT, FAIL) {
 
 TEST(ENFORCE_NOT_NULL, OK) {
   int* a = new int;
-  PADDLE_ENFORCE_NOT_NULL(a);
+  PADDLE_ENFORCE_NOT_NULL(a, paddle::platform::errors::Unavailable(
+                                 "PADDLE_ENFORCE_NOT_NULL tests failed."));
   delete a;
 }
 TEST(ENFORCE_NOT_NULL, FAIL) {
   bool caught_exception = false;
   try {
     int* a = nullptr;
-    PADDLE_ENFORCE_NOT_NULL(a);
+    PADDLE_ENFORCE_NOT_NULL(
+        a, paddle::platform::errors::Unavailable("The a should not be null."));
   } catch (paddle::platform::EnforceNotMet& error) {
     caught_exception = true;
     std::string ex_msg = error.what();
-    EXPECT_TRUE(ex_msg.find("a should not be null") != std::string::npos);
+    EXPECT_TRUE(ex_msg.find("The a should not be null.") != std::string::npos);
   }
   EXPECT_TRUE(caught_exception);
 }
@@ -233,14 +267,16 @@ std::ostream& operator<<(std::ostream& os, const Dims& d) {
 
 TEST(ENFORCE_USER_DEFINED_CLASS, EQ) {
   Dims a{{1, 2, 3, 4}}, b{{1, 2, 3, 4}};
-  PADDLE_ENFORCE_EQ(a, b);
+  PADDLE_ENFORCE_EQ(a, b, paddle::platform::errors::Unavailable(
+                              "PADDLE_ENFORCE_EQ tests failed."));
 }
 
 TEST(ENFORCE_USER_DEFINED_CLASS, NE) {
   Dims a{{1, 2, 3, 4}}, b{{5, 6, 7, 8}};
   bool caught_exception = false;
   try {
-    PADDLE_ENFORCE_EQ(a, b);
+    PADDLE_ENFORCE_EQ(a, b, paddle::platform::errors::Unavailable(
+                                "PADDLE_ENFORCE_EQ tests failed."));
   } catch (paddle::platform::EnforceNotMet&) {
     caught_exception = true;
   }
@@ -329,12 +365,15 @@ TEST(enforce, cannot_to_string_type) {
                 "int can be converted to string");
   CannotToStringType obj1(3), obj2(4), obj3(3);
 
-  PADDLE_ENFORCE_NE(obj1, obj2, "Object 1 is not equal to Object 2");
-  PADDLE_ENFORCE_EQ(obj1, obj3, "Object 1 is equal to Object 3");
+  PADDLE_ENFORCE_NE(obj1, obj2, paddle::platform::errors::InvalidArgument(
+                                    "Object 1 is not equal to Object 2"));
+  PADDLE_ENFORCE_EQ(obj1, obj3, paddle::platform::errors::InvalidArgument(
+                                    "Object 1 is equal to Object 3"));
 
   std::string msg = "Compare obj1 with obj2";
   try {
-    PADDLE_ENFORCE_EQ(obj1, obj2, msg);
+    PADDLE_ENFORCE_EQ(obj1, obj2,
+                      paddle::platform::errors::InvalidArgument(msg));
   } catch (paddle::platform::EnforceNotMet& error) {
     std::string ex_msg = error.what();
     LOG(INFO) << ex_msg;
@@ -347,7 +386,7 @@ TEST(enforce, cannot_to_string_type) {
   msg = "Compare x with y";
   try {
     int x = 3, y = 2;
-    PADDLE_ENFORCE_EQ(x, y, msg);
+    PADDLE_ENFORCE_EQ(x, y, paddle::platform::errors::InvalidArgument(msg));
   } catch (paddle::platform::EnforceNotMet& error) {
     std::string ex_msg = error.what();
     LOG(INFO) << ex_msg;
@@ -357,14 +396,22 @@ TEST(enforce, cannot_to_string_type) {
   }
 
   std::set<int> set;
-  PADDLE_ENFORCE_EQ(set.begin(), set.end());
+  PADDLE_ENFORCE_EQ(set.begin(), set.end(),
+                    paddle::platform::errors::InvalidArgument(
+                        "The begin and end of set is not equal."));
   set.insert(3);
-  PADDLE_ENFORCE_NE(set.begin(), set.end());
+  PADDLE_ENFORCE_NE(set.begin(), set.end(),
+                    paddle::platform::errors::InvalidArgument(
+                        "The begin and end of set is equal."));
 
   std::list<float> list;
-  PADDLE_ENFORCE_EQ(list.begin(), list.end());
+  PADDLE_ENFORCE_EQ(list.begin(), list.end(),
+                    paddle::platform::errors::InvalidArgument(
+                        "The begin and end of list is not equal."));
   list.push_back(4);
-  PADDLE_ENFORCE_NE(list.begin(), list.end());
+  PADDLE_ENFORCE_NE(list.begin(), list.end(),
+                    paddle::platform::errors::InvalidArgument(
+                        "The begin and end of list is equal."));
 }
 
 TEST(GET_DATA_SAFELY_MACRO, SUCCESS) {
diff --git a/paddle/fluid/platform/float16_test.cc b/paddle/fluid/platform/float16_test.cc
index f411c3863ffb39a943a863bc2b2ae0f327d51fb9..261ec68483faf6ca7a34a641cd53cd2113381e9c 100644
--- a/paddle/fluid/platform/float16_test.cc
+++ b/paddle/fluid/platform/float16_test.cc
@@ -145,7 +145,9 @@ TEST(float16, lod_tensor_cpu) {
 
 TEST(float16, floating) {
   // compile time assert.
-  PADDLE_ENFORCE_EQ(std::is_floating_point<float16>::value, true);
+  PADDLE_ENFORCE_EQ(
+      std::is_floating_point<float16>::value, true,
+      platform::errors::Unavailable("The float16 support in CPU failed."));
 }
 
 TEST(float16, print) {
diff --git a/paddle/fluid/platform/float16_test.cu b/paddle/fluid/platform/float16_test.cu
index bf2038419cb064c508af01a7e0cd085df9ed6d6d..8bd13a7141d1d616225855956d2d257c5fa5ef66 100644
--- a/paddle/fluid/platform/float16_test.cu
+++ b/paddle/fluid/platform/float16_test.cu
@@ -261,8 +261,12 @@ TEST(float16, typeid) {
   int b(0);
 
   // compile time assert
-  PADDLE_ENFORCE_EQ(functor(a), true);
-  PADDLE_ENFORCE_EQ(functor2(b), false);
+  PADDLE_ENFORCE_EQ(
+      functor(a), true,
+      platform::errors::Unavailable("The float16 support in GPU failed."));
+  PADDLE_ENFORCE_EQ(
+      functor2(b), false,
+      platform::errors::Unavailable("The float16 support in GPU failed."));
 }
 
 // GPU test
diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc
index 33fcc490691acb1dd8914d5cadfc0c6b9d752d32..5f63233d8bee4beefd6e1695d8bc3d6e5e4ae7fb 100644
--- a/paddle/fluid/platform/gpu_info.cc
+++ b/paddle/fluid/platform/gpu_info.cc
@@ -243,7 +243,9 @@ size_t GpuMaxAllocSize() {
 
 static size_t GpuAllocSize(bool realloc) {
   size_t available_to_alloc = GpuAvailableMemToAlloc();
-  PADDLE_ENFORCE_GT(available_to_alloc, 0, "No enough available GPU memory");
+  PADDLE_ENFORCE_GT(
+      available_to_alloc, 0,
+      platform::errors::ResourceExhausted("Not enough available GPU memory."));
   // If FLAGS_initial_gpu_memory_in_mb is 0, then initial memory will be
   // allocated by fraction
   size_t flag_mb = realloc ? FLAGS_reallocate_gpu_memory_in_mb
@@ -251,8 +253,9 @@ static size_t GpuAllocSize(bool realloc) {
   size_t alloc_bytes =
       (flag_mb > 0ul ? flag_mb << 20 : available_to_alloc *
                                            FLAGS_fraction_of_gpu_memory_to_use);
-  PADDLE_ENFORCE_GE(available_to_alloc, alloc_bytes,
-                    "No enough available GPU memory");
+  PADDLE_ENFORCE_GE(
+      available_to_alloc, alloc_bytes,
+      platform::errors::ResourceExhausted("Not enough available GPU memory."));
   VLOG(10) << "Alloc size is " << (alloc_bytes >> 20)
            << " MiB, is it Re-alloc: " << realloc;
   return alloc_bytes;
@@ -341,10 +344,10 @@ class RecordedCudaMallocHelper {
     PADDLE_ENFORCE_GE(
         dev_id, 0,
         platform::errors::OutOfRange(
-            "Device id must be not less than 0, but got %d", dev_id));
+            "Device id must be not less than 0, but got %d.", dev_id));
     PADDLE_ENFORCE_LT(
         dev_id, instances_.size(),
-        platform::errors::OutOfRange("Device id %d exceeds gpu card number %d",
+        platform::errors::OutOfRange("Device id %d exceeds gpu card number %d.",
                                      dev_id, instances_.size()));
     return instances_[dev_id].get();
   }
diff --git a/paddle/fluid/platform/gpu_launch_config.h b/paddle/fluid/platform/gpu_launch_config.h
index d57478b89781ed073cef0fa73e201784f73dfc6b..fd6e80527caf6d79bf61aa6c2f03fa14724f4d42 100644
--- a/paddle/fluid/platform/gpu_launch_config.h
+++ b/paddle/fluid/platform/gpu_launch_config.h
@@ -31,9 +31,10 @@ struct GpuLaunchConfig {
 };
 
 inline GpuLaunchConfig getGpuLaunchConfig(
-    const int N, const framework::ExecutionContext& ctx) {
+    const int N, const framework::ExecutionContext& ctx,
+    int max_threads = 1024) {
   int threads =
-      std::min(1024, ctx.cuda_device_context().GetMaxThreadsPerBlock());
+      std::min(max_threads, ctx.cuda_device_context().GetMaxThreadsPerBlock());
   int physical_thread_count =
       std::min(ctx.cuda_device_context().GetMaxPhysicalThreadCount(), N);
   int blocks = std::min((physical_thread_count + threads - 1) / threads,
diff --git a/paddle/fluid/platform/gpu_launch_param_config.h b/paddle/fluid/platform/gpu_launch_param_config.h
index c1ea06336002fe9ed76737938e083065e852b109..40f4ef975e76c90b67af62697b25da5f6d936c4f 100755
--- a/paddle/fluid/platform/gpu_launch_param_config.h
+++ b/paddle/fluid/platform/gpu_launch_param_config.h
@@ -39,7 +39,7 @@ inline GpuLaunchParamConfig GetGpuLaunchConfig1D(
     const platform::CUDADeviceContext& context, int element_count) {
   PADDLE_ENFORCE_GT(element_count, 0, platform::errors::InvalidArgument(
                                           "element count should greater than 0,"
-                                          " but received value is:%d",
+                                          " but received value is %d.",
                                           element_count));
 
   const int theory_thread_count = element_count;
diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc
index ea67ff13b82b063c049a9a168cd9fad4dcae217b..d9c8026bd285e4c758b9b7a2a4de549d6b34b264 100644
--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
@@ -98,9 +98,8 @@ void InitP2P(std::vector<int> devices) {
       for (int j = 0; j < count; ++j) {
         if (devices[i] == devices[j]) continue;
         int can_acess = -1;
-        PADDLE_ENFORCE(
-            cudaDeviceCanAccessPeer(&can_acess, devices[i], devices[j]),
-            "Failed to test P2P access.");
+        PADDLE_ENFORCE_CUDA_SUCCESS(
+            cudaDeviceCanAccessPeer(&can_acess, devices[i], devices[j]));
         if (can_acess != 1) {
           LOG(WARNING) << "Cannot enable P2P access from " << devices[i]
                        << " to " << devices[j];
@@ -118,14 +117,18 @@ void InitCupti() {
 #ifdef PADDLE_WITH_CUPTI
   if (FLAGS_multiple_of_cupti_buffer_size == 1) return;
   size_t attrValue = 0, attrValueSize = sizeof(size_t);
-#define MULTIPLY_ATTR_VALUE(attr)                                 \
-  {                                                               \
-    PADDLE_ENFORCE(!platform::dynload::cuptiActivityGetAttribute( \
-        attr, &attrValueSize, &attrValue));                       \
-    attrValue *= FLAGS_multiple_of_cupti_buffer_size;             \
-    LOG(WARNING) << "Set " #attr " " << attrValue << " byte";     \
-    PADDLE_ENFORCE(!platform::dynload::cuptiActivitySetAttribute( \
-        attr, &attrValueSize, &attrValue));                       \
+#define MULTIPLY_ATTR_VALUE(attr)                                            \
+  {                                                                          \
+    PADDLE_ENFORCE_EQ(                                                       \
+        !platform::dynload::cuptiActivityGetAttribute(attr, &attrValueSize,  \
+                                                      &attrValue),           \
+        true, platform::errors::Unavailable("Get cupti attribute failed.")); \
+    attrValue *= FLAGS_multiple_of_cupti_buffer_size;                        \
+    LOG(WARNING) << "Set " #attr " " << attrValue << " byte";                \
+    PADDLE_ENFORCE_EQ(                                                       \
+        !platform::dynload::cuptiActivitySetAttribute(attr, &attrValueSize,  \
+                                                      &attrValue),           \
+        true, platform::errors::Unavailable("Set cupti attribute failed.")); \
   }
   MULTIPLY_ATTR_VALUE(CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE);
   MULTIPLY_ATTR_VALUE(CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE_CDP);
diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h
index 08b86b072781f512fbc2e15e154e1e6fc25cf9e1..0fcb23679164079865947b0b0b539ae344732b58 100644
--- a/paddle/fluid/platform/mkldnn_helper.h
+++ b/paddle/fluid/platform/mkldnn_helper.h
@@ -178,6 +178,9 @@ inline mkldnn::memory::format_tag GetMKLDNNFormat(
       if (strides[0] >= strides[1] && strides[1] >= strides[2] &&
           strides[2] >= strides[3]) {
         return mkldnn::memory::format_tag::nchw;
+      } else if (strides[2] >= strides[3] && strides[3] >= strides[1] &&
+                 strides[1] >= strides[0]) {
+        return mkldnn::memory::format_tag::cdba;
       } else {
         return mkldnn::memory::format_tag::nhwc;
       }
diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h
index 2d36ef2ce6aecf847d2116b335f9379e1af3b5ce..5d7143f56b3f394bb1a99c1b3802b7c20138dfb7 100644
--- a/paddle/fluid/platform/mkldnn_reuse.h
+++ b/paddle/fluid/platform/mkldnn_reuse.h
@@ -54,7 +54,7 @@ class MKLDNNHandlerT {
   }
 
   std::shared_ptr<TForward> AcquireForwardPrimitive() {
-    const std::string key_p = key_ + "@forward_p";
+    const std::string key_p = key_ + "@fwd_p";
     auto forward_p =
         std::static_pointer_cast<TForward>(dev_ctx_.GetBlob(key_p));
     if (forward_p == nullptr) {
@@ -65,7 +65,7 @@ class MKLDNNHandlerT {
   }
 
   std::shared_ptr<TBackward> AcquireBackwardPrimitive() {
-    const std::string key_p = key_ + "@backward_p";
+    const std::string key_p = key_ + "@bwd_p";
     auto backward_p =
         std::static_pointer_cast<TBackward>(dev_ctx_.GetBlob(key_p));
     if (backward_p == nullptr) {
@@ -112,11 +112,11 @@ class MKLDNNHandlerT {
 
  protected:
   bool isCached() {
-    const std::string key_pd = key_common_ + "@forward_pd";
+    const std::string key_pd = key_common_ + "@fwd_pd";
     fwd_pd_ = std::static_pointer_cast<typename TForward::primitive_desc>(
         dev_ctx_.GetBlob(key_pd));
 
-    const std::string key_p = key_ + "@forward_p";
+    const std::string key_p = key_ + "@fwd_p";
     return (dev_ctx_.GetBlob(key_p) != nullptr);
   }
 
@@ -129,7 +129,7 @@ class MKLDNNHandlerT {
     // Forward PD has to be passed to Grad op that
     // may be executed by diffrent thread, hence
     // for that one we use key that does not contain TID
-    const std::string key_pd = key_common_ + "@forward_pd";
+    const std::string key_pd = key_common_ + "@fwd_pd";
     fwd_pd_ = std::static_pointer_cast<typename TForward::primitive_desc>(
         dev_ctx_.GetBlob(key_pd));
     if (fwd_pd_ == nullptr) {
@@ -169,11 +169,13 @@ class MKLDNNHandlerT {
 
   template <typename... Args>
   void AcquireBackwardPrimitiveDescriptor(Args&&... args) {
-    const std::string key_fwd_pd = key_common_ + "@forward_pd";
+    const std::string key_fwd_pd = key_common_ + "@fwd_pd";
     fwd_pd_ = std::static_pointer_cast<typename TForward::primitive_desc>(
         dev_ctx_.GetBlob(key_fwd_pd));
-    PADDLE_ENFORCE_NOT_NULL(fwd_pd_);
-    const std::string key_pd = key_ + "@backward_pd";
+    PADDLE_ENFORCE_NOT_NULL(
+        fwd_pd_, platform::errors::Unavailable(
+                     "Get MKLDNN Forward primitive %s failed.", key_fwd_pd));
+    const std::string key_pd = key_ + "@bwd_pd";
     bwd_pd_ = std::static_pointer_cast<typename TBackward::primitive_desc>(
         dev_ctx_.GetBlob(key_pd));
     if (bwd_pd_ == nullptr) {
@@ -498,17 +500,17 @@ class BinaryMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::binary> {
     if (!this->isCached()) {
       PADDLE_ENFORCE_EQ(
           x->layout(), DataLayout::kMKLDNN,
-          platform::errors::InvalidArgument("Wrong layout set for X tensor"));
+          platform::errors::InvalidArgument("Wrong layout set for X tensor."));
       PADDLE_ENFORCE_NE(
           x->format(), MKLDNNMemoryFormat::undef,
-          platform::errors::InvalidArgument("Wrong format set for X tensor"));
+          platform::errors::InvalidArgument("Wrong format set for X tensor."));
 
       PADDLE_ENFORCE_EQ(
           y->layout(), DataLayout::kMKLDNN,
-          platform::errors::InvalidArgument("Wrong layout set for Y tensor"));
+          platform::errors::InvalidArgument("Wrong layout set for Y tensor."));
       PADDLE_ENFORCE_NE(
           y->format(), MKLDNNMemoryFormat::undef,
-          platform::errors::InvalidArgument("Wrong format set for Y tensor"));
+          platform::errors::InvalidArgument("Wrong format set for Y tensor."));
 
       const auto src_x_tz = framework::vectorize(x->dims());
       const auto src_y_tz = framework::vectorize(y->dims());
@@ -772,10 +774,10 @@ class PoolingMKLDNNHandler : public MKLDNNHandlerT<T, mkldnn::pooling_forward,
     if (!this->isCached()) {
       PADDLE_ENFORCE_EQ(input->layout(), DataLayout::kMKLDNN,
                         platform::errors::InvalidArgument(
-                            "Wrong layout set for Input tensor"));
+                            "Wrong layout set for Input tensor."));
       PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::undef,
                         platform::errors::InvalidArgument(
-                            "Wrong format set for Input tensor"));
+                            "Wrong format set for Input tensor."));
 
       const std::string pooling_type = ctx.Attr<std::string>("pooling_type");
 
@@ -793,15 +795,21 @@ class PoolingMKLDNNHandler : public MKLDNNHandlerT<T, mkldnn::pooling_forward,
           ctx.Attr<std::string>("padding_algorithm");
 
       // Only 2D pooling is supported now
-      PADDLE_ENFORCE_EQ(ksize.size(), 2,
-                        platform::errors::InvalidArgument(
-                            "ksize must be 2D, i.e. 2D pooling"));
-      PADDLE_ENFORCE_EQ(pooling_type == "max" || pooling_type == "avg", true,
-                        platform::errors::InvalidArgument(
-                            "pooling_type must be 'max' or 'avg'"));
-      PADDLE_ENFORCE_EQ(input->dims().size(), 4,
-                        platform::errors::InvalidArgument(
-                            "Input dim must be with 4, i.e. NCHW"));
+      PADDLE_ENFORCE_EQ(
+          ksize.size(), 2,
+          platform::errors::InvalidArgument(
+              "The ksize must be 2D, i.e. 2D pooling, but received %dD.",
+              ksize.size()));
+      PADDLE_ENFORCE_EQ(
+          pooling_type == "max" || pooling_type == "avg", true,
+          platform::errors::InvalidArgument(
+              "The pooling_type must be 'max' or 'avg', but received %s.",
+              pooling_type));
+      PADDLE_ENFORCE_EQ(
+          input->dims().size(), 4,
+          platform::errors::InvalidArgument(
+              "Input dim must be with 4, i.e. NCHW, but received %d.",
+              input->dims().size()));
 
       const auto input_dims = input->dims();
       framework::DDim data_dims =
@@ -1419,7 +1427,7 @@ static std::shared_ptr<mkldnn::memory> SetDstMemory(
       residual_param_data,
       platform::errors::PreconditionNotMet("Residual parameter is required for "
                                            "the DNNL conv+elementwise_add "
-                                           "fusion, but now it is missing"));
+                                           "fusion, but now it is missing."));
   std::shared_ptr<mkldnn::memory> user_residual_memory_p =
       handler->AcquireResidualDataMemory(user_residual_md,
                                          to_void_cast<T>(residual_param_data));
@@ -1450,8 +1458,10 @@ static void SetDstMemoryQuantized(
   T* output_data = output->mutable_data<T>(ctx.GetPlace());
   const size_t dst_dims = dst_tz.size();
   MKLDNNMemoryFormat dst_fmt;
-  PADDLE_ENFORCE_LE(dst_dims, 5,
-                    "Dst memory for quantization can not have dims > 5");
+  PADDLE_ENFORCE_LE(dst_dims, 5, platform::errors::InvalidArgument(
+                                     "Dst memory for quantization can not have "
+                                     "dims > 5. But received dst_dims is %d.",
+                                     dst_dims));
   dst_fmt = platform::MKLDNNFormatForSize(dst_dims, output_format);
 
   auto tmp_dst_md = platform::MKLDNNMemDesc(
diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h
index 8b3e73200a367b06fcbbaf1a6160feeaa8da7aa9..22550de5b3fadd4688f430f7641e35a7864ca6b4 100644
--- a/paddle/fluid/platform/nccl_helper.h
+++ b/paddle/fluid/platform/nccl_helper.h
@@ -46,7 +46,8 @@ inline ncclDataType_t ToNCCLDataType(framework::proto::VarType::Type type) {
   } else if (type == framework::proto::VarType::FP16) {
     return ncclFloat16;
   } else {
-    PADDLE_THROW("Not supported");
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "This datatype in nccl is not supported."));
   }
 }
 
@@ -95,7 +96,9 @@ struct NCCLContextMap {
   explicit NCCLContextMap(const std::vector<platform::Place> &places,
                           ncclUniqueId *nccl_id = nullptr,
                           size_t num_trainers = 1, size_t trainer_id = 0) {
-    PADDLE_ENFORCE_EQ(!places.empty(), true);
+    PADDLE_ENFORCE_EQ(!places.empty(), true,
+                      platform::errors::InvalidArgument(
+                          "The NCCL place should not be empty."));
     order_.reserve(places.size());
     for (auto &p : places) {
       int dev_id = BOOST_GET_CONST(CUDAPlace, p).device;
@@ -104,7 +107,8 @@ struct NCCLContextMap {
     }
     PADDLE_ENFORCE_EQ(
         order_.size(), contexts_.size(),
-        "NCCL Context Map does not support contain two or more same device");
+        platform::errors::Unavailable("NCCL Context Map does not support "
+                                      "contain two or more same device."));
 
     std::unique_ptr<ncclComm_t[]> comms(new ncclComm_t[order_.size()]);
     // if num_trainers == 1, should create a new nccl id for local comms.
@@ -113,7 +117,8 @@ struct NCCLContextMap {
       PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclCommInitAll(
           comms.get(), static_cast<int>(order_.size()), order_.data()));
     } else {
-      PADDLE_ENFORCE_NOT_NULL(nccl_id);
+      PADDLE_ENFORCE_NOT_NULL(nccl_id, platform::errors::InvalidArgument(
+                                           "The NCCL id should not be null."));
       {
         int nranks = num_trainers * order_.size();
         NCCLGroupGuard gurad;
@@ -263,13 +268,18 @@ class NCCLCommunicator {
                             size_t trainers_num, size_t trainer_id,
                             size_t inter_trainers_num,
                             size_t exter_trainers_num) {
-    PADDLE_ENFORCE_EQ(trainers_num, inter_trainers_num * exter_trainers_num,
-                      "trainers_num:%llu != inter_trainers_num:%llu * "
-                      "exter_trainers_num:%llu",
-                      trainers_num, inter_trainers_num, exter_trainers_num);
-
-    PADDLE_ENFORCE_GT(inter_trainers_num, 1, "inter_trainers_num:%llu must > 1",
-                      inter_trainers_num);
+    PADDLE_ENFORCE_EQ(
+        trainers_num, inter_trainers_num * exter_trainers_num,
+        platform::errors::InvalidArgument(
+            "trainers_num:%llu != inter_trainers_num:%llu * "
+            "exter_trainers_num:%llu",
+            trainers_num, inter_trainers_num, exter_trainers_num));
+
+    PADDLE_ENFORCE_GT(
+        inter_trainers_num, 1,
+        platform::errors::InvalidArgument(
+            "The inter_trainers_num:%llu should be larger than 1.",
+            inter_trainers_num));
 
     int inter_trainer_id = trainer_id % inter_trainers_num;
     for (size_t i = 0; i < inter_nccl_ids.size(); i++) {
@@ -300,14 +310,16 @@ class NCCLCommunicator {
   bool NeedExterAllReduce() const { return h_exter_ctxs_.size() > 0; }
 
   NCCLContextMap *GetHierarchicalInterCtx(size_t run_order) const {
-    PADDLE_ENFORCE(h_inter_ctxs_.size() > 0,
-                   "must init hierarchical ctxs first!");
+    PADDLE_ENFORCE_GT(h_inter_ctxs_.size(), 0,
+                      platform::errors::InvalidArgument(
+                          "Hierarchical ctxs should be initialized firstly!"));
     return h_inter_ctxs_[run_order % h_inter_ctxs_.size()].get();
   }
 
   NCCLContextMap *GetHierarchicalExterCtx(size_t run_order) const {
-    PADDLE_ENFORCE(h_exter_ctxs_.size() > 0,
-                   "must init hierarchical ctxs first!");
+    PADDLE_ENFORCE_GT(h_exter_ctxs_.size(), 0,
+                      platform::errors::InvalidArgument(
+                          "Hierarchical ctxs should be initialized firstly!"));
     return h_exter_ctxs_[run_order % h_exter_ctxs_.size()].get();
   }
 
diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc
index fc1d9a8799962be4110037125e755b18ee0b93ee..85759bc6e2ea3700df6a17f885385b85dfbcb6a3 100644
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -94,10 +94,9 @@ void MemEvenRecorder::PushMemRecord(const void *ptr, const Place &place,
   if (g_state == ProfilerState::kDisabled) return;
   std::lock_guard<std::mutex> guard(mtx_);
   auto &events = address_memevent_[place];
-  PADDLE_ENFORCE_EQ(
-      events.count(ptr), 0,
-      platform::errors::InvalidArgument(
-          "The Place can't  exist in the stage of PushMemRecord"));
+  PADDLE_ENFORCE_EQ(events.count(ptr), 0,
+                    platform::errors::InvalidArgument(
+                        "The Place can't exist in the stage of PushMemRecord"));
   events.emplace(ptr, std::unique_ptr<RecordMemEvent>(
                           new MemEvenRecorder::RecordMemEvent(place, size)));
 }
diff --git a/paddle/fluid/platform/profiler_helper.h b/paddle/fluid/platform/profiler_helper.h
index 36c577fa0503b7bc9d5bbdb23e9e1674331235a4..c79195aa0db0d744748b27029d79375bfa032f2c 100644
--- a/paddle/fluid/platform/profiler_helper.h
+++ b/paddle/fluid/platform/profiler_helper.h
@@ -570,7 +570,7 @@ void PrintProfiler(
     } else {
       PADDLE_THROW(platform::errors::InvalidArgument(
           "Except profiler state must to be one of ['CPU', 'GPU' 'ALL'], but "
-          "received Invalid profiler state"));
+          "received Invalid profiler state."));
     }
 
     if (merge_thread) {
diff --git a/paddle/fluid/platform/resource_pool.h b/paddle/fluid/platform/resource_pool.h
index d988d12a759bd7f01785929bb7f17aeb3fb967c1..3603c0f24f279083a2ba4bdb5680a51cc41e3037 100644
--- a/paddle/fluid/platform/resource_pool.h
+++ b/paddle/fluid/platform/resource_pool.h
@@ -60,7 +60,7 @@ class ResourcePool : public std::enable_shared_from_this<ResourcePool<T>> {
       obj = creator_();
       PADDLE_ENFORCE_NOT_NULL(obj,
                               platform::errors::PermissionDenied(
-                                  "The creator should not return nullptr"));
+                                  "The creator should not return nullptr."));
       VLOG(10) << "Create new instance " << TypePtrName();
     } else {
       obj = instances_.back();
diff --git a/paddle/fluid/platform/stream_callback_manager.cc b/paddle/fluid/platform/stream_callback_manager.cc
index 5a9e24374f6f777c2286b8928eae9dcbe8be6378..365216566b265857f88834d4ee0d127ba960d59b 100644
--- a/paddle/fluid/platform/stream_callback_manager.cc
+++ b/paddle/fluid/platform/stream_callback_manager.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/platform/stream_callback_manager.h"
+#include <utility>
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
@@ -43,14 +44,16 @@ void StreamCallbackManager::AddCallback(std::function<void()> callback) const {
     });
   });
 #if CUDA_VERSION >= 10000
-  PADDLE_ENFORCE(cudaLaunchHostFunc(stream_, StreamCallbackFunc, func));
+  PADDLE_ENFORCE_CUDA_SUCCESS(
+      cudaLaunchHostFunc(stream_, StreamCallbackFunc, func));
 #else
-  PADDLE_ENFORCE(cudaStreamAddCallback(stream_, StreamCallbackFunc, func, 0));
+  PADDLE_ENFORCE_CUDA_SUCCESS(
+      cudaStreamAddCallback(stream_, StreamCallbackFunc, func, 0));
 #endif
 }
 
 void StreamCallbackManager::Wait() const {
-  PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
+  PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream_));
   {
     std::lock_guard<std::mutex> lock(mtx_);
     if (last_future_.valid()) {
diff --git a/paddle/fluid/platform/transform.h b/paddle/fluid/platform/transform.h
index 7877d3e41c1c993662f5d91b263cbcb71db74c36..a0e428f0d1a0a283ea471a006adc2107891e4b17 100644
--- a/paddle/fluid/platform/transform.h
+++ b/paddle/fluid/platform/transform.h
@@ -83,7 +83,9 @@ struct Transform<platform::CUDADeviceContext> {
   void operator()(const platform::CUDADeviceContext& context, InputIter first,
                   InputIter last, OutputIter result, UnaryOperation op) {
     auto place = context.GetPlace();
-    PADDLE_ENFORCE(is_gpu_place(place), "It must use GPU place.");
+    PADDLE_ENFORCE_EQ(is_gpu_place(place), true,
+                      platform::errors::PreconditionNotMet(
+                          "The CUDA Transform must be used in GPU place."));
     thrust::transform(thrust::cuda::par.on(context.stream()),
                       details::CastToCUDATransformIterator(first),
                       details::CastToCUDATransformIterator(last),
@@ -96,7 +98,9 @@ struct Transform<platform::CUDADeviceContext> {
                   InputIter1 last1, InputIter2 first2, OutputIter result,
                   BinaryOperation op) {
     auto place = context.GetPlace();
-    PADDLE_ENFORCE(is_gpu_place(place), "It must use GPU place.");
+    PADDLE_ENFORCE_EQ(is_gpu_place(place), true,
+                      platform::errors::PreconditionNotMet(
+                          "The CUDA Transform must be used in GPU place."));
     thrust::transform(thrust::cuda::par.on(context.stream()),
                       details::CastToCUDATransformIterator(first1),
                       details::CastToCUDATransformIterator(last1),
diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc
index e2ff4161db115167c9cd033c1d22c8e5c6198b55..626f6b1ecc217039b2e587413f26bc1ba688d27d 100644
--- a/paddle/fluid/pybind/imperative.cc
+++ b/paddle/fluid/pybind/imperative.cc
@@ -873,7 +873,7 @@ void BindImperative(py::module *m_ptr) {
            &imperative::Tracer::GetProgramDescTracer,
            py::return_value_policy::reference)
       .def("_generate_unique_name", &imperative::Tracer::GenerateUniqueName,
-           py::arg("key") = "tmp")
+           py::arg("key") = "eager_tmp")
       .def("trace",
            [](imperative::Tracer &self, const std::string &type,
               const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs,
diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc
index 1b6c407e6bf1a2a38752acb3c096bbdc64c36da6..5a0b18a34f768f3fb4392abf1d796feb951990c3 100644
--- a/paddle/fluid/pybind/inference_api.cc
+++ b/paddle/fluid/pybind/inference_api.cc
@@ -501,6 +501,8 @@ void BindAnalysisPredictor(py::module *m) {
       .def("get_output_names", &AnalysisPredictor::GetOutputNames)
       .def("get_input_tensor_shape", &AnalysisPredictor::GetInputTensorShape)
       .def("zero_copy_run", &AnalysisPredictor::ZeroCopyRun)
+      .def("clear_intermediate_tensor",
+           &AnalysisPredictor::ClearIntermediateTensor)
       .def("create_feed_fetch_var", &AnalysisPredictor::CreateFeedFetchVar)
       .def("prepare_feed_fetch", &AnalysisPredictor::PrepareFeedFetch)
       .def("prepare_argument", &AnalysisPredictor::PrepareArgument)
diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc
index 89ebb925363f57d68bb47b9e3b5fd133f8496811..7412eede118d122b14c69ab663836c156eb740e2 100644
--- a/paddle/fluid/pybind/op_function_generator.cc
+++ b/paddle/fluid/pybind/op_function_generator.cc
@@ -34,6 +34,7 @@
 // need to manually specify them in this map.
 std::map<std::string, std::set<std::string>> op_ins_map = {
     {"layer_norm", {"X", "Scale", "Bias"}},
+    {"instance_norm", {"X", "Scale", "Bias"}},
     {"gru_unit", {"Input", "HiddenPrev", "Weight", "Bias"}},
     {"label_smooth", {"X", "PriorDist"}},
     {"assign", {"X"}},
@@ -79,6 +80,7 @@ std::map<std::string, std::set<std::string>> op_passing_outs_map = {
     {"matmul", {"Out"}},
     {"fake_quantize_dequantize_moving_average_abs_max",
      {"Out", "OutScale", "OutAccum", "OutState"}},
+    {"fake_quantize_dequantize_abs_max", {"Out", "OutScale"}},
     {"amp_check_finite_and_scale", {"Out", "FoundInfinite"}},
 };
 
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index ce9b143db1dd286bcff87b0afff1572e285d6c3f..79ee871ee882d864fd41363c733b2bc09d4cebf9 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -1563,6 +1563,15 @@ All parameter, weight, gradient are variables in Paddle.
                                                              sleep_inter);
         },
         py::arg("cmd"), py::arg("time_out") = -1, py::arg("sleep_inter") = -1);
+  m.def("shell_execute_cmd",
+        [](const std::string &cmd, int time_out = 0, int sleep_inter = 0,
+           bool redirect_stderr = false) -> std::vector<std::string> {
+          return paddle::framework::shell_execute_cmd(
+              cmd, time_out, sleep_inter, redirect_stderr);
+        },
+        py::arg("cmd"), py::arg("time_out") = 0, py::arg("sleep_inter") = 0,
+        py::arg("redirect_stderr") = false);
+
 #ifdef PADDLE_WITH_CUDA
   m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool {
     // Only GPUs with Compute Capability >= 53 support float16
diff --git a/paddle/fluid/pybind/reader_py.cc b/paddle/fluid/pybind/reader_py.cc
index 7703f8f48d7f6b92bd73af567b87f79441d6c3ec..0dd30e562b66847551e5f27b45042fb077fc7bc7 100644
--- a/paddle/fluid/pybind/reader_py.cc
+++ b/paddle/fluid/pybind/reader_py.cc
@@ -188,6 +188,14 @@ class MultiDeviceFeedReader {
 
       result.emplace_back();
       auto &ret = result.back();
+      PADDLE_ENFORCE_EQ(names_.size(), ret_[i].size(),
+                        platform::errors::InvalidArgument(
+                            "The sample number of reader's input data and the "
+                            "input number of feed list are not equal.\n"
+                            "Possible reasons are:\n"
+                            "  The generator is decorated by `paddle.batch` "
+                            "and configured by `set_batch_generator`, but here "
+                            "need to used `set_sample_list_generator`."));
       for (size_t j = 0; j < names_.size(); ++j) {
         ret.emplace(names_[j], std::move(ret_[i][j]));
       }
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 6c716cb11100191bbd0b7e898bc999eebd93c3fb..1f7baf135d6983e96dc981c95ee65735458472e1 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -64,6 +64,9 @@ function cmake_base() {
     # Delete previous built whl packages
     rm -rf python/dist 2>/dev/null || true
 
+    # `gym` is only used in unittest, it's not suitable to add in requirements.txt.
+    # Add it dynamically.
+    echo "gym" >> ${PADDLE_ROOT}/python/requirements.txt
     # Support build for all python versions, currently
     # including cp27-cp27m and cp27-cp27mu.
     PYTHON_FLAGS=""
@@ -119,6 +122,8 @@ function cmake_base() {
                 exit 1
             fi
         fi
+        # delete `gym` to avoid modifying requirements.txt in *.whl
+        sed -i .bak "/^gym$/d" ${PADDLE_ROOT}/python/requirements.txt
     else
         if [ "$1" != "" ]; then
             echo "using python abi: $1"
@@ -175,6 +180,8 @@ function cmake_base() {
         else
             pip install -r ${PADDLE_ROOT}/python/requirements.txt
         fi
+        # delete `gym` to avoid modifying requirements.txt in *.whl
+        sed -i "/^gym$/d" ${PADDLE_ROOT}/python/requirements.txt
     fi
 
     if [ "$SYSTEM" == "Darwin" ]; then
@@ -193,7 +200,6 @@ function cmake_base() {
     Configuring cmake in /paddle/build ...
         -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release}
         ${PYTHON_FLAGS}
-        -DWITH_DSO=ON
         -DWITH_GPU=${WITH_GPU:-OFF}
         -DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF}
         -DWITH_DISTRIBUTE=${distibuted_flag}
@@ -214,15 +220,16 @@ function cmake_base() {
         -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build}
         -DWITH_GRPC=${grpc_flag}
         -DWITH_LITE=${WITH_LITE:-OFF}
+        -DLITE_GIT_TAG=develop
     ========================================
 EOF
     # Disable UNITTEST_USE_VIRTUALENV in docker because
     # docker environment is fully controlled by this script.
     # See /Paddle/CMakeLists.txt, UNITTEST_USE_VIRTUALENV option.
+    set +e
     cmake .. \
         -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} \
         ${PYTHON_FLAGS} \
-        -DWITH_DSO=ON \
         -DWITH_GPU=${WITH_GPU:-OFF} \
         -DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF} \
         -DWITH_DISTRIBUTE=${distibuted_flag} \
@@ -242,8 +249,11 @@ EOF
         -DPY_VERSION=${PY_VERSION:-2.7} \
         -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build} \
         -DWITH_GRPC=${grpc_flag} \
-        -DWITH_LITE=${WITH_LITE:-OFF}
-
+        -DLITE_GIT_TAG=develop \
+        -DWITH_LITE=${WITH_LITE:-OFF};build_error=$?
+    if [ "$build_error" != 0 ];then
+        exit 7;
+    fi
 }
 
 function cmake_gen() {
@@ -295,6 +305,7 @@ function check_style() {
 #=================================================
 
 function build_base() {
+    set +e
     if [ "$SYSTEM" == "Linux" ];then
       if [ `nproc` -gt 16 ];then
           parallel_number=$(expr `nproc` - 8)
@@ -312,7 +323,10 @@ function build_base() {
         make clean
     fi
 
-    make install -j ${parallel_number}
+    make install -j ${parallel_number};build_error=$?
+    if [ "$build_error" != 0 ];then
+        exit 7;
+    fi
 }
 
 function build_size() {
@@ -365,6 +379,7 @@ function cmake_gen_and_build() {
 }
 
 function build_mac() {
+    set +e
     mkdir -p ${PADDLE_ROOT}/build
     cd ${PADDLE_ROOT}/build
     cat <<EOF
@@ -375,7 +390,11 @@ EOF
     if [[ "$ENABLE_MAKE_CLEAN" != "OFF" ]]; then
         make clean
     fi
-    make install -j 8
+    make install -j 8;build_error=$?
+    if [ "$build_error" != 0 ];then
+        exit 7;
+    fi
+    set -e
     build_size
 }
 
@@ -657,9 +676,9 @@ EOF
 
 
 function assert_api_spec_approvals() {
-    /bin/bash ${PADDLE_ROOT}/tools/check_api_approvals.sh
-    if [ "$?" != 0 ];then
-       exit 1
+    /bin/bash ${PADDLE_ROOT}/tools/check_api_approvals.sh;approval_error=$?
+    if [ "$approval_error" != 0 ];then
+       exit 6
     fi
 }
 
@@ -744,6 +763,23 @@ EOF
     fi
 }
 
+failed_test_lists=''
+tmp_dir=`mktemp -d`
+
+function collect_failed_tests() {
+    for file in `ls $tmp_dir`; do
+        exit_code=0
+        grep -q 'The following tests FAILED:' $tmp_dir/$file||exit_code=$?
+        if [ $exit_code -ne 0 ]; then
+            failuretest=''
+        else
+            failuretest=`grep -A 10000 'The following tests FAILED:' $tmp_dir/$file | sed 's/The following tests FAILED://g'|sed '/^$/d'`
+            failed_test_lists="${failed_test_lists}
+            ${failuretest}"
+        fi
+    done
+}
+
 function card_test() {
     set -m
     case_count $1 $2
@@ -766,7 +802,7 @@ function card_test() {
     fi
 
     trap 'caught_error' CHLD
-
+    tmpfile_rand=`date +%s%N`
     NUM_PROC=$[CUDA_DEVICE_COUNT/$cardnumber]
     for (( i = 0; i < $NUM_PROC; i++ )); do
         # CUDA_VISIBLE_DEVICES http://acceleware.com/blog/cudavisibledevices-masking-gpus
@@ -779,21 +815,21 @@ function card_test() {
                     cuda_list="$cuda_list,$[i*cardnumber+j]"
             fi
         done
+        tmpfile=$tmp_dir/$tmpfile_rand"_"$i
         if [ ${TESTING_DEBUG_MODE:-OFF} == "ON" ] ; then
             if [[ $cardnumber == $CUDA_DEVICE_COUNT ]]; then
-                ctest -I $i,,$NUM_PROC -R "($testcases)" -V &
-            else
-                env CUDA_VISIBLE_DEVICES=$cuda_list ctest -I $i,,$NUM_PROC -R "($testcases)" -V &
+                (ctest -I $i,,$NUM_PROC -R "($testcases)" -V | tee $tmpfile; test ${PIPESTATUS[0]} -eq 0) &
+            else  
+                (env CUDA_VISIBLE_DEVICES=$cuda_list ctest -I $i,,$NUM_PROC -R "($testcases)" -V | tee $tmpfile; test ${PIPESTATUS[0]} -eq 0) &
             fi
         else
             if [[ $cardnumber == $CUDA_DEVICE_COUNT ]]; then
-                ctest -I $i,,$NUM_PROC -R "($testcases)" --output-on-failure &
+                (ctest -I $i,,$NUM_PROC -R "($testcases)" --output-on-failure | tee $tmpfile; test ${PIPESTATUS[0]} -eq 0) &
             else
-                env CUDA_VISIBLE_DEVICES=$cuda_list ctest -I $i,,$NUM_PROC -R "($testcases)" --output-on-failure &
+                (env CUDA_VISIBLE_DEVICES=$cuda_list ctest -I $i,,$NUM_PROC -R "($testcases)" --output-on-failure | tee $tmpfile; test ${PIPESTATUS[0]} -eq 0) &
             fi
         fi
     done
-
     wait; # wait for all subshells to finish
     ut_endTime_s=`date +%s`
     if [ "$2" == "" ]; then
@@ -804,7 +840,7 @@ function card_test() {
     set +m
 }
 
-function parallel_test_base() {
+function parallel_test_base_gpu() {
     if [ ${WITH_TESTING:-ON} == "ON" ] ; then
     cat <<EOF
     ========================================
@@ -878,6 +914,16 @@ set +x
         card_test "$single_card_tests_1" 1    # run cases with single GPU
         card_test "$multiple_card_tests" 2  # run cases with two GPUs
         card_test "$exclusive_tests"        # run cases exclusively, in this cases would be run with 4/8 GPUs
+        collect_failed_tests
+        if [ -n "${failed_test_lists}" ];then
+            failed_test_lists_ult=`echo "${failed_test_lists}" |grep -Po '[^ ].*$'`
+            echo "========================================"
+            echo "Summary Failed Tests... "
+            echo "========================================"
+            echo "The following tests FAILED: "
+            echo "${failed_test_lists_ult}"
+        fi
+        rm -f $tmp_dir/*
         if [[ "$EXIT_CODE" != "0" ]]; then
             exit 8;
         fi
@@ -885,11 +931,34 @@ set -ex
     fi
 }
 
+function parallel_test_base_cpu() {
+    mkdir -p ${PADDLE_ROOT}/build
+    cd ${PADDLE_ROOT}/build
+    if [ ${WITH_TESTING:-ON} == "ON" ] ; then
+    cat <<EOF
+    ========================================
+    Running unit cpu tests ...
+    ========================================
+EOF
+        ut_startTime_s=`date +%s`
+        ctest --output-on-failure -j $1
+        ut_endTime_s=`date +%s`
+        echo "CPU testCase Time: $[ $ut_endTime_s - $ut_startTime_s ]s"
+        if [[ "$EXIT_CODE" != "0" ]]; then
+            exit 8;
+        fi
+    fi
+}
+
 function parallel_test() {
     ut_total_startTime_s=`date +%s`
     mkdir -p ${PADDLE_ROOT}/build
     cd ${PADDLE_ROOT}/build
-    parallel_test_base
+    if [ "$WITH_GPU" == "ON" ];then
+        parallel_test_base_gpu
+    else
+        parallel_test_base_cpu ${PROC_RUN:-1}
+    fi
     ut_total_endTime_s=`date +%s`
     echo "TestCases Total Time: $[ $ut_total_endTime_s - $ut_total_startTime_s ]s"
 }
@@ -1149,11 +1218,14 @@ EOF
     if [[ "$1" != "" ]]; then
       parallel_number=$1
     fi
-    startTime_s=`date +%s` 
-    cmake .. -DWITH_DISTRIBUTE=OFF -DON_INFER=ON -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-Auto}
-
-    make -j ${parallel_number} fluid_lib_dist
-    make -j ${parallel_number} inference_lib_dist
+    startTime_s=`date +%s`
+    set +e
+    cmake .. -DWITH_DISTRIBUTE=OFF -DON_INFER=ON -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-Auto};build_error=$?
+    make -j ${parallel_number} fluid_lib_dist;build_error=$?
+    make -j ${parallel_number} inference_lib_dist;build_error=$?
+    if [ "$build_error" != 0 ];then
+        exit 7;
+    fi
     endTime_s=`date +%s`
     echo "Build Time: $[ $endTime_s - $startTime_s ]s"
     build_size "fluid_inference"
diff --git a/patches/eigen/MathFunctions.h b/patches/eigen/MathFunctions.h
new file mode 100644
index 0000000000000000000000000000000000000000..9f6a4d0e3328ffbe864273b69dc0033d50b813ae
--- /dev/null
+++ b/patches/eigen/MathFunctions.h
@@ -0,0 +1,1938 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATHFUNCTIONS_H
+#define EIGEN_MATHFUNCTIONS_H
+
+// source: http://www.geom.uiuc.edu/~huberty/math5337/groupe/digits.html
+// TODO this should better be moved to NumTraits
+#define EIGEN_PI \
+  3.141592653589793238462643383279502884197169399375105820974944592307816406L
+
+namespace Eigen {
+
+// On WINCE, std::abs is defined for int only, so let's defined our own
+// overloads:
+// This issue has been confirmed with MSVC 2008 only, but the issue might exist
+// for more recent versions too.
+#if EIGEN_OS_WINCE && EIGEN_COMP_MSVC && EIGEN_COMP_MSVC <= 1500
+long abs(long x) { return (labs(x)); }
+double abs(double x) { return (fabs(x)); }
+float abs(float x) { return (fabsf(x)); }
+long double abs(long double x) { return (fabsl(x)); }
+#endif
+
+namespace internal {
+
+/** \internal \class global_math_functions_filtering_base
+  *
+  * What it does:
+  * Defines a typedef 'type' as follows:
+  * - if type T has a member typedef
+ * Eigen_BaseClassForSpecializationOfGlobalMathFuncImpl, then
+  *   global_math_functions_filtering_base<T>::type is a typedef for it.
+  * - otherwise, global_math_functions_filtering_base<T>::type is a typedef for
+ * T.
+  *
+  * How it's used:
+  * To allow to defined the global math functions (like sin...) in certain
+ * cases, like the Array expressions.
+  * When you do sin(array1+array2), the object array1+array2 has a complicated
+ * expression type, all what you want to know
+  * is that it inherits ArrayBase. So we implement a partial specialization of
+ * sin_impl for ArrayBase<Derived>.
+  * So we must make sure to use sin_impl<ArrayBase<Derived> > and not
+ * sin_impl<Derived>, otherwise our partial specialization
+  * won't be used. How does sin know that? That's exactly what
+ * global_math_functions_filtering_base tells it.
+  *
+  * How it's implemented:
+  * SFINAE in the style of enable_if. Highly susceptible of breaking compilers.
+ * With GCC, it sure does work, but if you replace
+  * the typename dummy by an integer template parameter, it doesn't work
+ * anymore!
+  */
+
+template <typename T, typename dummy = void>
+struct global_math_functions_filtering_base {
+  typedef T type;
+};
+
+template <typename T>
+struct always_void {
+  typedef void type;
+};
+
+template <typename T>
+struct global_math_functions_filtering_base<
+    T,
+    typename always_void<
+        typename T::Eigen_BaseClassForSpecializationOfGlobalMathFuncImpl>::
+        type> {
+  typedef typename T::Eigen_BaseClassForSpecializationOfGlobalMathFuncImpl type;
+};
+
+#define EIGEN_MATHFUNC_IMPL(func, scalar)                             \
+  Eigen::internal::func##_impl<                                       \
+      typename Eigen::internal::global_math_functions_filtering_base< \
+          scalar>::type>
+#define EIGEN_MATHFUNC_RETVAL(func, scalar)                           \
+  typename Eigen::internal::func##_retval<                            \
+      typename Eigen::internal::global_math_functions_filtering_base< \
+          scalar>::type>::type
+
+/****************************************************************************
+* Implementation of real                                                 *
+****************************************************************************/
+
+template <typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
+struct real_default_impl {
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline RealScalar run(const Scalar& x) { return x; }
+};
+
+template <typename Scalar>
+struct real_default_impl<Scalar, true> {
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline RealScalar run(const Scalar& x) {
+    using std::real;
+    return real(x);
+  }
+};
+
+template <typename Scalar>
+struct real_impl : real_default_impl<Scalar> {};
+
+#if defined(EIGEN_GPU_COMPILE_PHASE)
+template <typename T>
+struct real_impl<std::complex<T>> {
+  typedef T RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline T run(const std::complex<T>& x) { return x.real(); }
+};
+#endif
+
+template <typename Scalar>
+struct real_retval {
+  typedef typename NumTraits<Scalar>::Real type;
+};
+
+/****************************************************************************
+* Implementation of imag                                                 *
+****************************************************************************/
+
+template <typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
+struct imag_default_impl {
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline RealScalar run(const Scalar&) { return RealScalar(0); }
+};
+
+template <typename Scalar>
+struct imag_default_impl<Scalar, true> {
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline RealScalar run(const Scalar& x) {
+    using std::imag;
+    return imag(x);
+  }
+};
+
+template <typename Scalar>
+struct imag_impl : imag_default_impl<Scalar> {};
+
+#if defined(EIGEN_GPU_COMPILE_PHASE)
+template <typename T>
+struct imag_impl<std::complex<T>> {
+  typedef T RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline T run(const std::complex<T>& x) { return x.imag(); }
+};
+#endif
+
+template <typename Scalar>
+struct imag_retval {
+  typedef typename NumTraits<Scalar>::Real type;
+};
+
+/****************************************************************************
+* Implementation of real_ref                                             *
+****************************************************************************/
+
+template <typename Scalar>
+struct real_ref_impl {
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline RealScalar& run(Scalar& x) {
+    return reinterpret_cast<RealScalar*>(&x)[0];
+  }
+  EIGEN_DEVICE_FUNC
+  static inline const RealScalar& run(const Scalar& x) {
+    return reinterpret_cast<const RealScalar*>(&x)[0];
+  }
+};
+
+template <typename Scalar>
+struct real_ref_retval {
+  typedef typename NumTraits<Scalar>::Real& type;
+};
+
+/****************************************************************************
+* Implementation of imag_ref                                             *
+****************************************************************************/
+
+template <typename Scalar, bool IsComplex>
+struct imag_ref_default_impl {
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline RealScalar& run(Scalar& x) {
+    return reinterpret_cast<RealScalar*>(&x)[1];
+  }
+  EIGEN_DEVICE_FUNC
+  static inline const RealScalar& run(const Scalar& x) {
+    return reinterpret_cast<RealScalar*>(&x)[1];
+  }
+};
+
+template <typename Scalar>
+struct imag_ref_default_impl<Scalar, false> {
+  EIGEN_DEVICE_FUNC
+  static inline Scalar run(Scalar&) { return Scalar(0); }
+  EIGEN_DEVICE_FUNC
+  static inline const Scalar run(const Scalar&) { return Scalar(0); }
+};
+
+template <typename Scalar>
+struct imag_ref_impl
+    : imag_ref_default_impl<Scalar, NumTraits<Scalar>::IsComplex> {};
+
+template <typename Scalar>
+struct imag_ref_retval {
+  typedef typename NumTraits<Scalar>::Real& type;
+};
+
+/****************************************************************************
+* Implementation of conj                                                 *
+****************************************************************************/
+
+template <typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
+struct conj_default_impl {
+  EIGEN_DEVICE_FUNC
+  static inline Scalar run(const Scalar& x) { return x; }
+};
+
+template <typename Scalar>
+struct conj_default_impl<Scalar, true> {
+  EIGEN_DEVICE_FUNC
+  static inline Scalar run(const Scalar& x) {
+    using std::conj;
+    return conj(x);
+  }
+};
+
+template <typename Scalar>
+struct conj_impl : conj_default_impl<Scalar> {};
+
+#if defined(EIGEN_GPU_COMPILE_PHASE)
+template <typename T>
+struct conj_impl<std::complex<T>> {
+  EIGEN_DEVICE_FUNC
+  static inline std::complex<T> run(const std::complex<T>& x) {
+    return std::complex<T>(x.real(), -x.imag());
+  }
+};
+#endif
+
+template <typename Scalar>
+struct conj_retval {
+  typedef Scalar type;
+};
+
+/****************************************************************************
+* Implementation of abs2                                                 *
+****************************************************************************/
+
+template <typename Scalar, bool IsComplex>
+struct abs2_impl_default {
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline RealScalar run(const Scalar& x) { return x * x; }
+};
+
+template <typename Scalar>
+struct abs2_impl_default<Scalar, true>  // IsComplex
+{
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline RealScalar run(const Scalar& x) {
+    return x.real() * x.real() + x.imag() * x.imag();
+  }
+};
+
+template <typename Scalar>
+struct abs2_impl {
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline RealScalar run(const Scalar& x) {
+    return abs2_impl_default<Scalar, NumTraits<Scalar>::IsComplex>::run(x);
+  }
+};
+
+template <typename Scalar>
+struct abs2_retval {
+  typedef typename NumTraits<Scalar>::Real type;
+};
+
+/****************************************************************************
+* Implementation of norm1                                                *
+****************************************************************************/
+
+template <typename Scalar, bool IsComplex>
+struct norm1_default_impl;
+
+template <typename Scalar>
+struct norm1_default_impl<Scalar, true> {
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline RealScalar run(const Scalar& x) {
+    EIGEN_USING_STD_MATH(abs);
+    return abs(x.real()) + abs(x.imag());
+  }
+};
+
+template <typename Scalar>
+struct norm1_default_impl<Scalar, false> {
+  EIGEN_DEVICE_FUNC
+  static inline Scalar run(const Scalar& x) {
+    EIGEN_USING_STD_MATH(abs);
+    return abs(x);
+  }
+};
+
+template <typename Scalar>
+struct norm1_impl : norm1_default_impl<Scalar, NumTraits<Scalar>::IsComplex> {};
+
+template <typename Scalar>
+struct norm1_retval {
+  typedef typename NumTraits<Scalar>::Real type;
+};
+
+/****************************************************************************
+* Implementation of hypot                                                *
+****************************************************************************/
+
+template <typename Scalar>
+struct hypot_impl;
+
+template <typename Scalar>
+struct hypot_retval {
+  typedef typename NumTraits<Scalar>::Real type;
+};
+
+/****************************************************************************
+* Implementation of cast                                                 *
+****************************************************************************/
+
+template <typename OldType, typename NewType>
+struct cast_impl {
+  EIGEN_DEVICE_FUNC
+  static inline NewType run(const OldType& x) {
+    return static_cast<NewType>(x);
+  }
+};
+
+// here, for once, we're plainly returning NewType: we don't want cast to do
+// weird things.
+
+template <typename OldType, typename NewType>
+EIGEN_DEVICE_FUNC inline NewType cast(const OldType& x) {
+  return cast_impl<OldType, NewType>::run(x);
+}
+
+/****************************************************************************
+* Implementation of round                                                   *
+****************************************************************************/
+
+#if EIGEN_HAS_CXX11_MATH
+template <typename Scalar>
+struct round_impl {
+  EIGEN_DEVICE_FUNC
+  static inline Scalar run(const Scalar& x) {
+    EIGEN_STATIC_ASSERT((!NumTraits<Scalar>::IsComplex),
+                        NUMERIC_TYPE_MUST_BE_REAL)
+    EIGEN_USING_STD_MATH(round);
+    return round(x);
+  }
+};
+#else
+template <typename Scalar>
+struct round_impl {
+  EIGEN_DEVICE_FUNC
+  static inline Scalar run(const Scalar& x) {
+    EIGEN_STATIC_ASSERT((!NumTraits<Scalar>::IsComplex),
+                        NUMERIC_TYPE_MUST_BE_REAL)
+    EIGEN_USING_STD_MATH(floor);
+    EIGEN_USING_STD_MATH(ceil);
+    return (x > Scalar(0)) ? floor(x + Scalar(0.5)) : ceil(x - Scalar(0.5));
+  }
+};
+#endif
+
+template <typename Scalar>
+struct round_retval {
+  typedef Scalar type;
+};
+
+/****************************************************************************
+* Implementation of rint                                                    *
+****************************************************************************/
+
+template <typename Scalar>
+struct rint_impl {
+  EIGEN_DEVICE_FUNC
+  static inline Scalar run(const Scalar& x) {
+    EIGEN_STATIC_ASSERT((!NumTraits<Scalar>::IsComplex),
+                        NUMERIC_TYPE_MUST_BE_REAL)
+#if EIGEN_HAS_CXX11_MATH
+    EIGEN_USING_STD_MATH(rint);
+#endif
+    return rint(x);
+  }
+};
+
+#if !EIGEN_HAS_CXX11_MATH
+template <>
+struct rint_impl<double> {
+  EIGEN_DEVICE_FUNC
+  static inline double run(const double& x) { return ::rint(x); }
+};
+template <>
+struct rint_impl<float> {
+  EIGEN_DEVICE_FUNC
+  static inline float run(const float& x) { return ::rintf(x); }
+};
+#endif
+
+template <typename Scalar>
+struct rint_retval {
+  typedef Scalar type;
+};
+
+/****************************************************************************
+* Implementation of arg                                                     *
+****************************************************************************/
+
+#if EIGEN_HAS_CXX11_MATH
+template <typename Scalar>
+struct arg_impl {
+  EIGEN_DEVICE_FUNC
+  static inline Scalar run(const Scalar& x) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+    // HIP does not seem to have a native device side implementation for the
+    // math routine "arg"
+    using std::arg;
+#else
+    EIGEN_USING_STD_MATH(arg);
+#endif
+    return arg(x);
+  }
+};
+#else
+template <typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
+struct arg_default_impl {
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline RealScalar run(const Scalar& x) {
+    return (x < Scalar(0)) ? Scalar(EIGEN_PI) : Scalar(0);
+  }
+};
+
+template <typename Scalar>
+struct arg_default_impl<Scalar, true> {
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC
+  static inline RealScalar run(const Scalar& x) {
+    EIGEN_USING_STD_MATH(arg);
+    return arg(x);
+  }
+};
+
+template <typename Scalar>
+struct arg_impl : arg_default_impl<Scalar> {};
+#endif
+
+template <typename Scalar>
+struct arg_retval {
+  typedef typename NumTraits<Scalar>::Real type;
+};
+
+/****************************************************************************
+* Implementation of expm1                                                   *
+****************************************************************************/
+
+// This implementation is based on GSL Math's expm1.
+namespace std_fallback {
+// fallback expm1 implementation in case there is no expm1(Scalar) function in
+// namespace of Scalar,
+// or that there is no suitable std::expm1 function available. Implementation
+// attributed to Kahan. See: http://www.plunk.org/~hatch/rightway.php.
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline Scalar expm1(const Scalar& x) {
+  EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+
+  EIGEN_USING_STD_MATH(exp);
+  Scalar u = exp(x);
+  if (numext::equal_strict(u, Scalar(1))) {
+    return x;
+  }
+  Scalar um1 = u - RealScalar(1);
+  if (numext::equal_strict(um1, Scalar(-1))) {
+    return RealScalar(-1);
+  }
+
+  EIGEN_USING_STD_MATH(log);
+  Scalar logu = log(u);
+  return numext::equal_strict(u, logu) ? u : (u - RealScalar(1)) * x / logu;
+}
+}
+
+template <typename Scalar>
+struct expm1_impl {
+  EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) {
+    EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
+#if EIGEN_HAS_CXX11_MATH
+    using std::expm1;
+#else
+    using std_fallback::expm1;
+#endif
+    return expm1(x);
+  }
+};
+
+// Specialization for complex types that are not supported by std::expm1.
+template <typename RealScalar>
+struct expm1_impl<std::complex<RealScalar>> {
+  EIGEN_DEVICE_FUNC static inline std::complex<RealScalar> run(
+      const std::complex<RealScalar>& x) {
+    EIGEN_STATIC_ASSERT_NON_INTEGER(RealScalar)
+    RealScalar xr = x.real();
+    RealScalar xi = x.imag();
+    // expm1(z) = exp(z) - 1
+    //          = exp(x +  i * y) - 1
+    //          = exp(x) * (cos(y) + i * sin(y)) - 1
+    //          = exp(x) * cos(y) - 1 + i * exp(x) * sin(y)
+    // Imag(expm1(z)) = exp(x) * sin(y)
+    // Real(expm1(z)) = exp(x) * cos(y) - 1
+    //          = exp(x) * cos(y) - 1.
+    //          = expm1(x) + exp(x) * (cos(y) - 1)
+    //          = expm1(x) + exp(x) * (2 * sin(y / 2) ** 2)
+
+    // TODO better use numext::expm1 and numext::sin (but that would require
+    // forward declarations or moving this specialization down).
+    RealScalar erm1 = expm1_impl<RealScalar>::run(xr);
+    RealScalar er = erm1 + RealScalar(1.);
+    EIGEN_USING_STD_MATH(sin);
+    RealScalar sin2 = sin(xi / RealScalar(2.));
+    sin2 = sin2 * sin2;
+    RealScalar s = sin(xi);
+    RealScalar real_part = erm1 - RealScalar(2.) * er * sin2;
+    return std::complex<RealScalar>(real_part, er * s);
+  }
+};
+
+template <typename Scalar>
+struct expm1_retval {
+  typedef Scalar type;
+};
+
+/****************************************************************************
+* Implementation of log1p                                                   *
+****************************************************************************/
+
+namespace std_fallback {
+// fallback log1p implementation in case there is no log1p(Scalar) function in
+// namespace of Scalar,
+// or that there is no suitable std::log1p function available
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline Scalar log1p(const Scalar& x) {
+  EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_USING_STD_MATH(log);
+  Scalar x1p = RealScalar(1) + x;
+  Scalar log_1p = log(x1p);
+  const bool is_small = numext::equal_strict(x1p, Scalar(1));
+  const bool is_inf = numext::equal_strict(x1p, log_1p);
+  return (is_small || is_inf) ? x : x * (log_1p / (x1p - RealScalar(1)));
+}
+}
+
+template <typename Scalar>
+struct log1p_impl {
+  EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) {
+    EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
+#if EIGEN_HAS_CXX11_MATH
+    using std::log1p;
+#else
+    using std_fallback::log1p;
+#endif
+    return log1p(x);
+  }
+};
+
+// Specialization for complex types that are not supported by std::log1p.
+template <typename RealScalar>
+struct log1p_impl<std::complex<RealScalar>> {
+  EIGEN_DEVICE_FUNC static inline std::complex<RealScalar> run(
+      const std::complex<RealScalar>& x) {
+    EIGEN_STATIC_ASSERT_NON_INTEGER(RealScalar)
+    return std_fallback::log1p(x);
+  }
+};
+
+template <typename Scalar>
+struct log1p_retval {
+  typedef Scalar type;
+};
+
+/****************************************************************************
+* Implementation of pow                                                  *
+****************************************************************************/
+
+template <typename ScalarX,
+          typename ScalarY,
+          bool IsInteger =
+              NumTraits<ScalarX>::IsInteger&& NumTraits<ScalarY>::IsInteger>
+struct pow_impl {
+  // typedef Scalar retval;
+  typedef typename ScalarBinaryOpTraits<
+      ScalarX,
+      ScalarY,
+      internal::scalar_pow_op<ScalarX, ScalarY>>::ReturnType result_type;
+  static EIGEN_DEVICE_FUNC inline result_type run(const ScalarX& x,
+                                                  const ScalarY& y) {
+    EIGEN_USING_STD_MATH(pow);
+    return pow(x, y);
+  }
+};
+
+template <typename ScalarX, typename ScalarY>
+struct pow_impl<ScalarX, ScalarY, true> {
+  typedef ScalarX result_type;
+  static EIGEN_DEVICE_FUNC inline ScalarX run(ScalarX x, ScalarY y) {
+    ScalarX res(1);
+    eigen_assert(!NumTraits<ScalarY>::IsSigned || y >= 0);
+    if (y & 1) res *= x;
+    y >>= 1;
+    while (y) {
+      x *= x;
+      if (y & 1) res *= x;
+      y >>= 1;
+    }
+    return res;
+  }
+};
+
+/****************************************************************************
+* Implementation of random                                               *
+****************************************************************************/
+
+template <typename Scalar, bool IsComplex, bool IsInteger>
+struct random_default_impl {};
+
+template <typename Scalar>
+struct random_impl : random_default_impl<Scalar,
+                                         NumTraits<Scalar>::IsComplex,
+                                         NumTraits<Scalar>::IsInteger> {};
+
+template <typename Scalar>
+struct random_retval {
+  typedef Scalar type;
+};
+
+template <typename Scalar>
+inline EIGEN_MATHFUNC_RETVAL(random, Scalar)
+    random(const Scalar& x, const Scalar& y);
+template <typename Scalar>
+inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random();
+
+template <typename Scalar>
+struct random_default_impl<Scalar, false, false> {
+  static inline Scalar run(const Scalar& x, const Scalar& y) {
+    return x + (y - x) * Scalar(std::rand()) / Scalar(RAND_MAX);
+  }
+  static inline Scalar run() {
+    return run(Scalar(NumTraits<Scalar>::IsSigned ? -1 : 0), Scalar(1));
+  }
+};
+
+enum {
+  meta_floor_log2_terminate,
+  meta_floor_log2_move_up,
+  meta_floor_log2_move_down,
+  meta_floor_log2_bogus
+};
+
+template <unsigned int n, int lower, int upper>
+struct meta_floor_log2_selector {
+  enum {
+    middle = (lower + upper) / 2,
+    value = (upper <= lower + 1)
+                ? int(meta_floor_log2_terminate)
+                : (n < (1 << middle)) ? int(meta_floor_log2_move_down)
+                                      : (n == 0) ? int(meta_floor_log2_bogus)
+                                                 : int(meta_floor_log2_move_up)
+  };
+};
+
+template <unsigned int n,
+          int lower = 0,
+          int upper = sizeof(unsigned int) * CHAR_BIT - 1,
+          int selector = meta_floor_log2_selector<n, lower, upper>::value>
+struct meta_floor_log2 {};
+
+template <unsigned int n, int lower, int upper>
+struct meta_floor_log2<n, lower, upper, meta_floor_log2_move_down> {
+  enum {
+    value = meta_floor_log2<
+        n,
+        lower,
+        meta_floor_log2_selector<n, lower, upper>::middle>::value
+  };
+};
+
+template <unsigned int n, int lower, int upper>
+struct meta_floor_log2<n, lower, upper, meta_floor_log2_move_up> {
+  enum {
+    value = meta_floor_log2<n,
+                            meta_floor_log2_selector<n, lower, upper>::middle,
+                            upper>::value
+  };
+};
+
+template <unsigned int n, int lower, int upper>
+struct meta_floor_log2<n, lower, upper, meta_floor_log2_terminate> {
+  enum {
+    value = (n >= ((unsigned int)(1) << (lower + 1))) ? lower + 1 : lower
+  };
+};
+
+template <unsigned int n, int lower, int upper>
+struct meta_floor_log2<n, lower, upper, meta_floor_log2_bogus> {
+  // no value, error at compile time
+};
+
+template <typename Scalar>
+struct random_default_impl<Scalar, false, true> {
+  static inline Scalar run(const Scalar& x, const Scalar& y) {
+    if (y <= x) return x;
+    // ScalarU is the unsigned counterpart of Scalar, possibly Scalar itself.
+    typedef typename make_unsigned<Scalar>::type ScalarU;
+    // ScalarX is the widest of ScalarU and unsigned int.
+    // We'll deal only with ScalarX and unsigned int below thus avoiding signed
+    // types and arithmetic and signed overflows (which are undefined behavior).
+    typedef typename conditional<(ScalarU(-1) > unsigned(-1)),
+                                 ScalarU,
+                                 unsigned>::type ScalarX;
+    // The following difference doesn't overflow, provided our integer types are
+    // two's
+    // complement and have the same number of padding bits in signed and
+    // unsigned variants.
+    // This is the case in most modern implementations of C++.
+    ScalarX range = ScalarX(y) - ScalarX(x);
+    ScalarX offset = 0;
+    ScalarX divisor = 1;
+    ScalarX multiplier = 1;
+    const unsigned rand_max = RAND_MAX;
+    if (range <= rand_max)
+      divisor = (rand_max + 1) / (range + 1);
+    else
+      multiplier = 1 + range / (rand_max + 1);
+    // Rejection sampling.
+    do {
+      offset = (unsigned(std::rand()) * multiplier) / divisor;
+    } while (offset > range);
+    return Scalar(ScalarX(x) + offset);
+  }
+
+  static inline Scalar run() {
+#ifdef EIGEN_MAKING_DOCS
+    return run(Scalar(NumTraits<Scalar>::IsSigned ? -10 : 0), Scalar(10));
+#else
+    enum {
+        rand_bits = meta_floor_log2<(unsigned int)(RAND_MAX) + 1>::value,
+        scalar_bits = sizeof(Scalar) * CHAR_BIT,
+        shift = EIGEN_PLAIN_ENUM_MAX(0, int(rand_bits) - int(scalar_bits)),
+        offset = NumTraits<Scalar>::IsSigned
+                     ? (1 << (EIGEN_PLAIN_ENUM_MIN(rand_bits, scalar_bits) - 1))
+                     : 0};
+    return Scalar((std::rand() >> shift) - offset);
+#endif
+  }
+};
+
+template <typename Scalar>
+struct random_default_impl<Scalar, true, false> {
+  static inline Scalar run(const Scalar& x, const Scalar& y) {
+    return Scalar(random(x.real(), y.real()), random(x.imag(), y.imag()));
+  }
+  static inline Scalar run() {
+    typedef typename NumTraits<Scalar>::Real RealScalar;
+    return Scalar(random<RealScalar>(), random<RealScalar>());
+  }
+};
+
+template <typename Scalar>
+inline EIGEN_MATHFUNC_RETVAL(random, Scalar)
+    random(const Scalar& x, const Scalar& y) {
+  return EIGEN_MATHFUNC_IMPL(random, Scalar)::run(x, y);
+}
+
+template <typename Scalar>
+inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random() {
+  return EIGEN_MATHFUNC_IMPL(random, Scalar)::run();
+}
+
+// Implementation of is* functions
+
+// std::is* do not work with fast-math and gcc, std::is* are available on MSVC
+// 2013 and newer, as well as in clang.
+#if (EIGEN_HAS_CXX11_MATH &&                               \
+     !(EIGEN_COMP_GNUC_STRICT && __FINITE_MATH_ONLY__)) || \
+    (EIGEN_COMP_MSVC >= 1800) || (EIGEN_COMP_CLANG)
+#define EIGEN_USE_STD_FPCLASSIFY 1
+#else
+#define EIGEN_USE_STD_FPCLASSIFY 0
+#endif
+
+template <typename T>
+EIGEN_DEVICE_FUNC
+    typename internal::enable_if<internal::is_integral<T>::value, bool>::type
+    isnan_impl(const T&) {
+  return false;
+}
+
+template <typename T>
+EIGEN_DEVICE_FUNC
+    typename internal::enable_if<internal::is_integral<T>::value, bool>::type
+    isinf_impl(const T&) {
+  return false;
+}
+
+template <typename T>
+EIGEN_DEVICE_FUNC
+    typename internal::enable_if<internal::is_integral<T>::value, bool>::type
+    isfinite_impl(const T&) {
+  return true;
+}
+
+template <typename T>
+EIGEN_DEVICE_FUNC
+    typename internal::enable_if<(!internal::is_integral<T>::value) &&
+                                     (!NumTraits<T>::IsComplex),
+                                 bool>::type
+    isfinite_impl(const T& x) {
+#if defined(EIGEN_GPU_COMPILE_PHASE)
+  return (::isfinite)(x);
+#elif EIGEN_USE_STD_FPCLASSIFY
+  using std::isfinite;
+  return isfinite EIGEN_NOT_A_MACRO(x);
+#else
+  return x <= NumTraits<T>::highest() && x >= NumTraits<T>::lowest();
+#endif
+}
+
+template <typename T>
+EIGEN_DEVICE_FUNC
+    typename internal::enable_if<(!internal::is_integral<T>::value) &&
+                                     (!NumTraits<T>::IsComplex),
+                                 bool>::type
+    isinf_impl(const T& x) {
+#if defined(EIGEN_GPU_COMPILE_PHASE)
+  return (::isinf)(x);
+#elif EIGEN_USE_STD_FPCLASSIFY
+  using std::isinf;
+  return isinf EIGEN_NOT_A_MACRO(x);
+#else
+  return x > NumTraits<T>::highest() || x < NumTraits<T>::lowest();
+#endif
+}
+
+template <typename T>
+EIGEN_DEVICE_FUNC
+    typename internal::enable_if<(!internal::is_integral<T>::value) &&
+                                     (!NumTraits<T>::IsComplex),
+                                 bool>::type
+    isnan_impl(const T& x) {
+#if defined(EIGEN_GPU_COMPILE_PHASE)
+  return (::isnan)(x);
+#elif EIGEN_USE_STD_FPCLASSIFY
+  using std::isnan;
+  return isnan EIGEN_NOT_A_MACRO(x);
+#else
+  return x != x;
+#endif
+}
+
+#if (!EIGEN_USE_STD_FPCLASSIFY)
+
+#if EIGEN_COMP_MSVC
+
+template <typename T>
+EIGEN_DEVICE_FUNC bool isinf_msvc_helper(T x) {
+  return _fpclass(x) == _FPCLASS_NINF || _fpclass(x) == _FPCLASS_PINF;
+}
+
+// MSVC defines a _isnan builtin function, but for double only
+EIGEN_DEVICE_FUNC inline bool isnan_impl(const long double& x) {
+  return _isnan(x) != 0;
+}
+EIGEN_DEVICE_FUNC inline bool isnan_impl(const double& x) {
+  return _isnan(x) != 0;
+}
+EIGEN_DEVICE_FUNC inline bool isnan_impl(const float& x) {
+  return _isnan(x) != 0;
+}
+
+EIGEN_DEVICE_FUNC inline bool isinf_impl(const long double& x) {
+  return isinf_msvc_helper(x);
+}
+EIGEN_DEVICE_FUNC inline bool isinf_impl(const double& x) {
+  return isinf_msvc_helper(x);
+}
+EIGEN_DEVICE_FUNC inline bool isinf_impl(const float& x) {
+  return isinf_msvc_helper(x);
+}
+
+#elif (defined __FINITE_MATH_ONLY__ && __FINITE_MATH_ONLY__ && EIGEN_COMP_GNUC)
+
+#if EIGEN_GNUC_AT_LEAST(5, 0)
+#define EIGEN_TMP_NOOPT_ATTRIB \
+  EIGEN_DEVICE_FUNC inline __attribute__((optimize("no-finite-math-only")))
+#else
+// NOTE the inline qualifier and noinline attribute are both needed: the former
+// is to avoid linking issue (duplicate symbol),
+//      while the second prevent too aggressive optimizations in fast-math mode:
+#define EIGEN_TMP_NOOPT_ATTRIB \
+  EIGEN_DEVICE_FUNC inline     \
+      __attribute__((noinline, optimize("no-finite-math-only")))
+#endif
+
+template <>
+EIGEN_TMP_NOOPT_ATTRIB bool isnan_impl(const long double& x) {
+  return __builtin_isnan(x);
+}
+template <>
+EIGEN_TMP_NOOPT_ATTRIB bool isnan_impl(const double& x) {
+  return __builtin_isnan(x);
+}
+template <>
+EIGEN_TMP_NOOPT_ATTRIB bool isnan_impl(const float& x) {
+  return __builtin_isnan(x);
+}
+template <>
+EIGEN_TMP_NOOPT_ATTRIB bool isinf_impl(const double& x) {
+  return __builtin_isinf(x);
+}
+template <>
+EIGEN_TMP_NOOPT_ATTRIB bool isinf_impl(const float& x) {
+  return __builtin_isinf(x);
+}
+template <>
+EIGEN_TMP_NOOPT_ATTRIB bool isinf_impl(const long double& x) {
+  return __builtin_isinf(x);
+}
+
+#undef EIGEN_TMP_NOOPT_ATTRIB
+
+#endif
+
+#endif
+
+// The following overload are defined at the end of this file
+template <typename T>
+EIGEN_DEVICE_FUNC bool isfinite_impl(const std::complex<T>& x);
+template <typename T>
+EIGEN_DEVICE_FUNC bool isnan_impl(const std::complex<T>& x);
+template <typename T>
+EIGEN_DEVICE_FUNC bool isinf_impl(const std::complex<T>& x);
+
+template <typename T>
+T generic_fast_tanh_float(const T& a_x);
+}  // end namespace internal
+
+/****************************************************************************
+* Generic math functions                                                    *
+****************************************************************************/
+
+namespace numext {
+
+#if (!defined(EIGEN_GPUCC))
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y) {
+  EIGEN_USING_STD_MATH(min);
+  return min EIGEN_NOT_A_MACRO(x, y);
+}
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y) {
+  EIGEN_USING_STD_MATH(max);
+  return max EIGEN_NOT_A_MACRO(x, y);
+}
+#else
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y) {
+  return y < x ? y : x;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float mini(const float& x,
+                                                 const float& y) {
+  return fminf(x, y);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double mini(const double& x,
+                                                  const double& y) {
+  return fmin(x, y);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE long double mini(const long double& x,
+                                                       const long double& y) {
+#if defined(EIGEN_HIPCC)
+  // no "fminl" on HIP yet
+  return (x < y) ? x : y;
+#else
+  return fminl(x, y);
+#endif
+}
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y) {
+  return x < y ? y : x;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float maxi(const float& x,
+                                                 const float& y) {
+  return fmaxf(x, y);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double maxi(const double& x,
+                                                  const double& y) {
+  return fmax(x, y);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE long double maxi(const long double& x,
+                                                       const long double& y) {
+#if defined(EIGEN_HIPCC)
+  // no "fmaxl" on HIP yet
+  return (x > y) ? x : y;
+#else
+  return fmaxl(x, y);
+#endif
+}
+#endif
+
+#if defined(SYCL_DEVICE_ONLY)
+
+#define SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \
+  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_char)    \
+  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_short)   \
+  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_int)     \
+  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_long)
+#define SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \
+  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_char)    \
+  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_short)   \
+  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_int)     \
+  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_long)
+#define SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \
+  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_uchar)     \
+  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_ushort)    \
+  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_uint)      \
+  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_ulong)
+#define SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \
+  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_uchar)     \
+  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_ushort)    \
+  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_uint)      \
+  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_ulong)
+#define SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(NAME, FUNC)  \
+  SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \
+  SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY(NAME, FUNC)
+#define SYCL_SPECIALIZE_INTEGER_TYPES_UNARY(NAME, FUNC)  \
+  SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \
+  SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY(NAME, FUNC)
+#define SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(NAME, FUNC)     \
+  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_float) \
+  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_double)
+#define SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(NAME, FUNC)     \
+  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_float) \
+  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_double)
+#define SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(                \
+    NAME, FUNC, RET_TYPE)                                                  \
+  SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, cl::sycl::cl_float) \
+  SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, cl::sycl::cl_double)
+
+#define SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE)     \
+  template <>                                                              \
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE RET_TYPE NAME(const ARG_TYPE& x) { \
+    return cl::sycl::FUNC(x);                                              \
+  }
+
+#define SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, TYPE) \
+  SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, TYPE, TYPE)
+
+#define SYCL_SPECIALIZE_GEN1_BINARY_FUNC(                                   \
+    NAME, FUNC, RET_TYPE, ARG_TYPE1, ARG_TYPE2)                             \
+  template <>                                                               \
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE RET_TYPE NAME(const ARG_TYPE1& x,   \
+                                                      const ARG_TYPE2& y) { \
+    return cl::sycl::FUNC(x, y);                                            \
+  }
+
+#define SYCL_SPECIALIZE_GEN2_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE) \
+  SYCL_SPECIALIZE_GEN1_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE, ARG_TYPE)
+
+#define SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, TYPE) \
+  SYCL_SPECIALIZE_GEN2_BINARY_FUNC(NAME, FUNC, TYPE, TYPE)
+
+SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(mini, min)
+SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(mini, fmin)
+SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(maxi, max)
+SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(maxi, fmax)
+
+#endif
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(real, Scalar)
+    real(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(real, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline typename internal::add_const_on_value_type<
+    EIGEN_MATHFUNC_RETVAL(real_ref, Scalar)>::type
+real_ref(const Scalar& x) {
+  return internal::real_ref_impl<Scalar>::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(real_ref, Scalar)
+    real_ref(Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(real_ref, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(imag, Scalar)
+    imag(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(imag, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(arg, Scalar)
+    arg(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(arg, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline typename internal::add_const_on_value_type<
+    EIGEN_MATHFUNC_RETVAL(imag_ref, Scalar)>::type
+imag_ref(const Scalar& x) {
+  return internal::imag_ref_impl<Scalar>::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(imag_ref, Scalar)
+    imag_ref(Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(imag_ref, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(conj, Scalar)
+    conj(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(conj, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(abs2, Scalar)
+    abs2(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(abs2, Scalar)::run(x);
+}
+
+EIGEN_DEVICE_FUNC
+inline bool abs2(bool x) { return x; }
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T absdiff(const T& x, const T& y) {
+  return x > y ? x - y : y - x;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float absdiff(const float& x,
+                                                    const float& y) {
+  return fabsf(x - y);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double absdiff(const double& x,
+                                                     const double& y) {
+  return fabs(x - y);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE long double absdiff(
+    const long double& x, const long double& y) {
+#if defined(EIGEN_HIPCC)
+  // no "fabsl" on HIP yet
+  return (x > y) ? x : y;
+#else
+  return fabsl(x - y);
+#endif
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(norm1, Scalar)
+    norm1(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(norm1, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(hypot, Scalar)
+    hypot(const Scalar& x, const Scalar& y) {
+  return EIGEN_MATHFUNC_IMPL(hypot, Scalar)::run(x, y);
+}
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(hypot, hypot)
+#endif
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(log1p, Scalar)
+    log1p(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(log1p, Scalar)::run(x);
+}
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(log1p, log1p)
+#endif
+
+#if defined(EIGEN_GPUCC)
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float log1p(const float& x) {
+  return ::log1pf(x);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double log1p(const double& x) {
+  return ::log1p(x);
+}
+#endif
+
+template <typename ScalarX, typename ScalarY>
+EIGEN_DEVICE_FUNC inline
+    typename internal::pow_impl<ScalarX, ScalarY>::result_type
+    pow(const ScalarX& x, const ScalarY& y) {
+  return internal::pow_impl<ScalarX, ScalarY>::run(x, y);
+}
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(pow, pow)
+#endif
+
+template <typename T>
+EIGEN_DEVICE_FUNC bool(isnan)(const T& x) {
+  return internal::isnan_impl(x);
+}
+template <typename T>
+EIGEN_DEVICE_FUNC bool(isinf)(const T& x) {
+  return internal::isinf_impl(x);
+}
+template <typename T>
+EIGEN_DEVICE_FUNC bool(isfinite)(const T& x) {
+  return internal::isfinite_impl(x);
+}
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isnan, isnan, bool)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isinf, isinf, bool)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isfinite, isfinite, bool)
+#endif
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(rint, Scalar)
+    rint(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(rint, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(round, Scalar)
+    round(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(round, Scalar)::run(x);
+}
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(round, round)
+#endif
+
+template <typename T>
+EIGEN_DEVICE_FUNC T(floor)(const T& x) {
+  EIGEN_USING_STD_MATH(floor);
+  return floor(x);
+}
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(floor, floor)
+#endif
+
+#if defined(EIGEN_GPUCC)
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float floor(const float& x) {
+  return ::floorf(x);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double floor(const double& x) {
+  return ::floor(x);
+}
+#endif
+
+template <typename T>
+EIGEN_DEVICE_FUNC T(ceil)(const T& x) {
+  EIGEN_USING_STD_MATH(ceil);
+  return ceil(x);
+}
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(ceil, ceil)
+#endif
+
+#if defined(EIGEN_GPUCC)
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float ceil(const float& x) {
+  return ::ceilf(x);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double ceil(const double& x) {
+  return ::ceil(x);
+}
+#endif
+
+/** Log base 2 for 32 bits positive integers.
+  * Conveniently returns 0 for x==0. */
+inline int log2(int x) {
+  eigen_assert(x >= 0);
+  unsigned int v(x);
+  static const int table[32] = {0,  9,  1,  10, 13, 21, 2,  29, 11, 14, 16,
+                                18, 22, 25, 3,  30, 8,  12, 20, 28, 15, 17,
+                                24, 7,  19, 27, 23, 6,  26, 5,  4,  31};
+  v |= v >> 1;
+  v |= v >> 2;
+  v |= v >> 4;
+  v |= v >> 8;
+  v |= v >> 16;
+  return table[(v * 0x07C4ACDDU) >> 27];
+}
+
+/** \returns the square root of \a x.
+  *
+  * It is essentially equivalent to
+  * \code using std::sqrt; return sqrt(x); \endcode
+  * but slightly faster for float/double and some compilers (e.g., gcc), thanks
+ * to
+  * specializations when SSE is enabled.
+  *
+  * It's usage is justified in performance critical functions, like
+ * norm/normalize.
+  */
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T sqrt(const T& x) {
+  EIGEN_USING_STD_MATH(sqrt);
+  return sqrt(x);
+}
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sqrt, sqrt)
+#endif
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T log(const T& x) {
+  EIGEN_USING_STD_MATH(log);
+  return log(x);
+}
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(log, log)
+#endif
+
+#if defined(EIGEN_GPUCC)
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float log(const float& x) {
+  return ::logf(x);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double log(const double& x) {
+  return ::log(x);
+}
+#endif
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+    typename internal::enable_if<NumTraits<T>::IsSigned ||
+                                     NumTraits<T>::IsComplex,
+                                 typename NumTraits<T>::Real>::type
+    abs(const T& x) {
+  EIGEN_USING_STD_MATH(abs);
+  return abs(x);
+}
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+    typename internal::enable_if<!(NumTraits<T>::IsSigned ||
+                                   NumTraits<T>::IsComplex),
+                                 typename NumTraits<T>::Real>::type
+    abs(const T& x) {
+  return x;
+}
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_INTEGER_TYPES_UNARY(abs, abs)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(abs, fabs)
+#endif
+
+#if defined(EIGEN_GPUCC)
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float abs(const float& x) {
+  return ::fabsf(x);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double abs(const double& x) {
+  return ::fabs(x);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float abs(const std::complex<float>& x) {
+  return ::hypotf(x.real(), x.imag());
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double abs(
+    const std::complex<double>& x) {
+  return ::hypot(x.real(), x.imag());
+}
+#endif
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T exp(const T& x) {
+  EIGEN_USING_STD_MATH(exp);
+  return exp(x);
+}
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(exp, exp)
+#endif
+
+#if defined(EIGEN_GPUCC)
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float exp(const float& x) {
+  return ::expf(x);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double exp(const double& x) {
+  return ::exp(x);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::complex<float> exp(
+    const std::complex<float>& x) {
+  float com = ::expf(x.real());
+  float res_real = com * ::cosf(x.imag());
+  float res_imag = com * ::sinf(x.imag());
+  return std::complex<float>(res_real, res_imag);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::complex<double> exp(
+    const std::complex<double>& x) {
+  double com = ::exp(x.real());
+  double res_real = com * ::cos(x.imag());
+  double res_imag = com * ::sin(x.imag());
+  return std::complex<double>(res_real, res_imag);
+}
+#endif
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(expm1, Scalar)
+    expm1(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(expm1, Scalar)::run(x);
+}
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(expm1, expm1)
+#endif
+
+#if defined(EIGEN_GPUCC)
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float expm1(const float& x) {
+  return ::expm1f(x);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double expm1(const double& x) {
+  return ::expm1(x);
+}
+#endif
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T cos(const T& x) {
+  EIGEN_USING_STD_MATH(cos);
+  return cos(x);
+}
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(cos, cos)
+#endif
+
+#if defined(EIGEN_GPUCC)
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float cos(const float& x) {
+  return ::cosf(x);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double cos(const double& x) {
+  return ::cos(x);
+}
+#endif
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T sin(const T& x) {
+  EIGEN_USING_STD_MATH(sin);
+  return sin(x);
+}
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sin, sin)
+#endif
+
+#if defined(EIGEN_GPUCC)
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float sin(const float& x) {
+  return ::sinf(x);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double sin(const double& x) {
+  return ::sin(x);
+}
+#endif
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T tan(const T& x) {
+  EIGEN_USING_STD_MATH(tan);
+  return tan(x);
+}
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(tan, tan)
+#endif
+
+#if defined(EIGEN_GPUCC)
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tan(const float& x) {
+  return ::tanf(x);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double tan(const double& x) {
+  return ::tan(x);
+}
+#endif
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T acos(const T& x) {
+  EIGEN_USING_STD_MATH(acos);
+  return acos(x);
+}
+
+#if EIGEN_HAS_CXX11_MATH
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T acosh(const T& x) {
+  EIGEN_USING_STD_MATH(acosh);
+  return acosh(x);
+}
+#endif
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(acos, acos)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(acosh, acosh)
+#endif
+
+#if defined(EIGEN_GPUCC)
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float acos(const float& x) {
+  return ::acosf(x);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double acos(const double& x) {
+  return ::acos(x);
+}
+#endif
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T asin(const T& x) {
+  EIGEN_USING_STD_MATH(asin);
+  return asin(x);
+}
+
+#if EIGEN_HAS_CXX11_MATH
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T asinh(const T& x) {
+  EIGEN_USING_STD_MATH(asinh);
+  return asinh(x);
+}
+#endif
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(asin, asin)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(asinh, asinh)
+#endif
+
+#if defined(EIGEN_GPUCC)
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float asin(const float& x) {
+  return ::asinf(x);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double asin(const double& x) {
+  return ::asin(x);
+}
+#endif
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T atan(const T& x) {
+  EIGEN_USING_STD_MATH(atan);
+  return atan(x);
+}
+
+#if EIGEN_HAS_CXX11_MATH
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T atanh(const T& x) {
+  EIGEN_USING_STD_MATH(atanh);
+  return atanh(x);
+}
+#endif
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(atan, atan)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(atanh, atanh)
+#endif
+
+#if defined(EIGEN_GPUCC)
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float atan(const float& x) {
+  return ::atanf(x);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double atan(const double& x) {
+  return ::atan(x);
+}
+#endif
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T cosh(const T& x) {
+  EIGEN_USING_STD_MATH(cosh);
+  return cosh(x);
+}
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(cosh, cosh)
+#endif
+
+#if defined(EIGEN_GPUCC)
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float cosh(const float& x) {
+  return ::coshf(x);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double cosh(const double& x) {
+  return ::cosh(x);
+}
+#endif
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T sinh(const T& x) {
+  EIGEN_USING_STD_MATH(sinh);
+  return sinh(x);
+}
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sinh, sinh)
+#endif
+
+#if defined(EIGEN_GPUCC)
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float sinh(const float& x) {
+  return ::sinhf(x);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double sinh(const double& x) {
+  return ::sinh(x);
+}
+#endif
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T tanh(const T& x) {
+  EIGEN_USING_STD_MATH(tanh);
+  return tanh(x);
+}
+
+#if (!defined(EIGEN_GPUCC)) && EIGEN_FAST_MATH && !defined(SYCL_DEVICE_ONLY)
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tanh(float x) {
+  return internal::generic_fast_tanh_float(x);
+}
+#endif
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(tanh, tanh)
+#endif
+
+#if defined(EIGEN_GPUCC)
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tanh(const float& x) {
+  return ::tanhf(x);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double tanh(const double& x) {
+  return ::tanh(x);
+}
+#endif
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T fmod(const T& a, const T& b) {
+  EIGEN_USING_STD_MATH(fmod);
+  return fmod(a, b);
+}
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(fmod, fmod)
+#endif
+
+#if defined(EIGEN_GPUCC)
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float fmod(const float& a,
+                                                 const float& b) {
+  return ::fmodf(a, b);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double fmod(const double& a,
+                                                  const double& b) {
+  return ::fmod(a, b);
+}
+#endif
+
+#if defined(SYCL_DEVICE_ONLY)
+#undef SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY
+#undef SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY
+#undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY
+#undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY
+#undef SYCL_SPECIALIZE_INTEGER_TYPES_BINARY
+#undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY
+#undef SYCL_SPECIALIZE_FLOATING_TYPES_BINARY
+#undef SYCL_SPECIALIZE_FLOATING_TYPES_UNARY
+#undef SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE
+#undef SYCL_SPECIALIZE_GEN_UNARY_FUNC
+#undef SYCL_SPECIALIZE_UNARY_FUNC
+#undef SYCL_SPECIALIZE_GEN1_BINARY_FUNC
+#undef SYCL_SPECIALIZE_GEN2_BINARY_FUNC
+#undef SYCL_SPECIALIZE_BINARY_FUNC
+#endif
+
+}  // end namespace numext
+
+namespace internal {
+
+template <typename T>
+EIGEN_DEVICE_FUNC bool isfinite_impl(const std::complex<T>& x) {
+  return (numext::isfinite)(numext::real(x)) &&
+         (numext::isfinite)(numext::imag(x));
+}
+
+template <typename T>
+EIGEN_DEVICE_FUNC bool isnan_impl(const std::complex<T>& x) {
+  return (numext::isnan)(numext::real(x)) || (numext::isnan)(numext::imag(x));
+}
+
+template <typename T>
+EIGEN_DEVICE_FUNC bool isinf_impl(const std::complex<T>& x) {
+  return ((numext::isinf)(numext::real(x)) ||
+          (numext::isinf)(numext::imag(x))) &&
+         (!(numext::isnan)(x));
+}
+
+/****************************************************************************
+* Implementation of fuzzy comparisons                                       *
+****************************************************************************/
+
+template <typename Scalar, bool IsComplex, bool IsInteger>
+struct scalar_fuzzy_default_impl {};
+
+template <typename Scalar>
+struct scalar_fuzzy_default_impl<Scalar, false, false> {
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  template <typename OtherScalar>
+  EIGEN_DEVICE_FUNC static inline bool isMuchSmallerThan(
+      const Scalar& x, const OtherScalar& y, const RealScalar& prec) {
+    return numext::abs(x) <= numext::abs(y) * prec;
+  }
+  EIGEN_DEVICE_FUNC
+  static inline bool isApprox(const Scalar& x,
+                              const Scalar& y,
+                              const RealScalar& prec) {
+    return numext::abs(x - y) <=
+           numext::mini(numext::abs(x), numext::abs(y)) * prec;
+  }
+  EIGEN_DEVICE_FUNC
+  static inline bool isApproxOrLessThan(const Scalar& x,
+                                        const Scalar& y,
+                                        const RealScalar& prec) {
+    return x <= y || isApprox(x, y, prec);
+  }
+};
+
+template <typename Scalar>
+struct scalar_fuzzy_default_impl<Scalar, false, true> {
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  template <typename OtherScalar>
+  EIGEN_DEVICE_FUNC static inline bool isMuchSmallerThan(const Scalar& x,
+                                                         const Scalar&,
+                                                         const RealScalar&) {
+    return x == Scalar(0);
+  }
+  EIGEN_DEVICE_FUNC
+  static inline bool isApprox(const Scalar& x,
+                              const Scalar& y,
+                              const RealScalar&) {
+    return x == y;
+  }
+  EIGEN_DEVICE_FUNC
+  static inline bool isApproxOrLessThan(const Scalar& x,
+                                        const Scalar& y,
+                                        const RealScalar&) {
+    return x <= y;
+  }
+};
+
+template <typename Scalar>
+struct scalar_fuzzy_default_impl<Scalar, true, false> {
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  template <typename OtherScalar>
+  EIGEN_DEVICE_FUNC static inline bool isMuchSmallerThan(
+      const Scalar& x, const OtherScalar& y, const RealScalar& prec) {
+    return numext::abs2(x) <= numext::abs2(y) * prec * prec;
+  }
+  EIGEN_DEVICE_FUNC
+  static inline bool isApprox(const Scalar& x,
+                              const Scalar& y,
+                              const RealScalar& prec) {
+    return numext::abs2(x - y) <=
+           numext::mini(numext::abs2(x), numext::abs2(y)) * prec * prec;
+  }
+};
+
+template <typename Scalar>
+struct scalar_fuzzy_impl
+    : scalar_fuzzy_default_impl<Scalar,
+                                NumTraits<Scalar>::IsComplex,
+                                NumTraits<Scalar>::IsInteger> {};
+
+template <typename Scalar, typename OtherScalar>
+EIGEN_DEVICE_FUNC inline bool isMuchSmallerThan(
+    const Scalar& x,
+    const OtherScalar& y,
+    const typename NumTraits<Scalar>::Real& precision =
+        NumTraits<Scalar>::dummy_precision()) {
+  return scalar_fuzzy_impl<Scalar>::template isMuchSmallerThan<OtherScalar>(
+      x, y, precision);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline bool isApprox(
+    const Scalar& x,
+    const Scalar& y,
+    const typename NumTraits<Scalar>::Real& precision =
+        NumTraits<Scalar>::dummy_precision()) {
+  return scalar_fuzzy_impl<Scalar>::isApprox(x, y, precision);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline bool isApproxOrLessThan(
+    const Scalar& x,
+    const Scalar& y,
+    const typename NumTraits<Scalar>::Real& precision =
+        NumTraits<Scalar>::dummy_precision()) {
+  return scalar_fuzzy_impl<Scalar>::isApproxOrLessThan(x, y, precision);
+}
+
+/******************************************
+***  The special case of the  bool type ***
+******************************************/
+
+template <>
+struct random_impl<bool> {
+  static inline bool run() { return random<int>(0, 1) == 0 ? false : true; }
+};
+
+template <>
+struct scalar_fuzzy_impl<bool> {
+  typedef bool RealScalar;
+
+  template <typename OtherScalar>
+  EIGEN_DEVICE_FUNC static inline bool isMuchSmallerThan(const bool& x,
+                                                         const bool&,
+                                                         const bool&) {
+    return !x;
+  }
+
+  EIGEN_DEVICE_FUNC
+  static inline bool isApprox(bool x, bool y, bool) { return x == y; }
+
+  EIGEN_DEVICE_FUNC
+  static inline bool isApproxOrLessThan(const bool& x,
+                                        const bool& y,
+                                        const bool&) {
+    return (!x) || y;
+  }
+};
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_MATHFUNCTIONS_H
diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py
index 9862ef8ac069233f7a436635fdd875e39ea0416f..8f370019a5655e65eaa3a963beeab62ac559b6ae 100644
--- a/python/paddle/__init__.py
+++ b/python/paddle/__init__.py
@@ -36,6 +36,7 @@ import paddle.distributed
 import paddle.sysconfig
 import paddle.tensor
 import paddle.nn
+import paddle.fleet
 import paddle.framework
 import paddle.imperative
 import paddle.optimizer
@@ -107,7 +108,7 @@ from .tensor.manipulation import flatten  #DEFINE_ALIAS
 from .tensor.manipulation import gather  #DEFINE_ALIAS
 from .tensor.manipulation import gather_nd  #DEFINE_ALIAS
 from .tensor.manipulation import reshape  #DEFINE_ALIAS
-from .tensor.manipulation import reverse  #DEFINE_ALIAS
+from .tensor.manipulation import flip as reverse  #DEFINE_ALIAS
 from .tensor.manipulation import scatter  #DEFINE_ALIAS
 from .tensor.manipulation import scatter_nd_add  #DEFINE_ALIAS
 from .tensor.manipulation import scatter_nd  #DEFINE_ALIAS
@@ -145,7 +146,6 @@ from .tensor.math import exp  #DEFINE_ALIAS
 from .tensor.math import floor  #DEFINE_ALIAS
 from .tensor.math import increment  #DEFINE_ALIAS
 from .tensor.math import log  #DEFINE_ALIAS
-from .tensor.math import mul  #DEFINE_ALIAS
 from .tensor.math import multiplex  #DEFINE_ALIAS
 from .tensor.math import pow  #DEFINE_ALIAS
 from .tensor.math import reciprocal  #DEFINE_ALIAS
diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py
index 1e71b316b06779ef861c9fb9612e30a62f810f7d..d0c32e26092f6ea25771279418582a24ea449ab2 100644
--- a/python/paddle/distributed/__init__.py
+++ b/python/paddle/distributed/__init__.py
@@ -11,6 +11,3 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from . import fs_wrapper
-
-__all__ = fs_wrapper.__all__
diff --git a/python/paddle/distributed/fs_wrapper.py b/python/paddle/distributed/fs_wrapper.py
deleted file mode 100644
index d73d144e1c47544a214d73ac677f12e71230d058..0000000000000000000000000000000000000000
--- a/python/paddle/distributed/fs_wrapper.py
+++ /dev/null
@@ -1,225 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.fluid as fluid
-import sys
-import abc
-import os
-from pathlib import PurePosixPath
-import shutil
-
-__all__ = ['FS', 'LocalFS', 'BDFS']
-
-
-class FS(object):
-    @abc.abstractmethod
-    def list_dirs(self, fs_path):
-        pass
-
-    @abc.abstractmethod
-    def ls_dir(self, fs_path):
-        pass
-
-    @abc.abstractmethod
-    def stat(self, fs_path):
-        pass
-
-    @abc.abstractmethod
-    def upload(self, local_path, fs_path):
-        pass
-
-    @abc.abstractmethod
-    def download(self, fs_path, local_path):
-        pass
-
-    @abc.abstractmethod
-    def mkdir(self, fs_path):
-        pass
-
-    @abc.abstractmethod
-    def mv(self, fs_src_path, fs_dst_path):
-        pass
-
-    @abc.abstractmethod
-    def rmr(self, fs_path):
-        pass
-
-    @abc.abstractmethod
-    def rm(self, fs_path):
-        pass
-
-    @abc.abstractmethod
-    def delete(self, fs_path):
-        pass
-
-    @abc.abstractmethod
-    def need_upload_download(self):
-        pass
-
-
-class LocalFS(FS):
-    def list_dirs(self, fs_path):
-        if not self.stat(fs_path):
-            return []
-
-        return [
-            f for f in os.listdir(fs_path) if os.path.isdir(fs_path + "/" + f)
-        ]
-
-    def ls_dir(self, fs_path):
-        return [f for f in os.listdir(fs_path)]
-
-    def stat(self, fs_path):
-        return os.path.exists(fs_path)
-
-    def mkdir(self, fs_path):
-        assert not os.path.isfile(fs_path), "{} is already a file".format(
-            fs_path)
-        os.system("mkdir -p {}".format(fs_path))
-
-    def mv(self, fs_src_path, fs_dst_path):
-        os.rename(fs_src_path, fs_dst_path)
-
-    def rmr(self, fs_path):
-        shutil.rmtree(fs_path)
-
-    def rm(self, fs_path):
-        os.remove(fs_path)
-
-    def delete(self, fs_path):
-        if not self.stat(fs_path):
-            return
-
-        if os.path.isfile(fs_path):
-            return self.rm(fs_path)
-
-        return self.rmr(fs_path)
-
-    def need_upload_download(self):
-        return False
-
-
-class BDFS(FS):
-    def __init__(self,
-                 hdfs_name,
-                 hdfs_ugi,
-                 time_out=20 * 60 * 1000,
-                 sleep_inter=1000):
-        self._base_cmd = "hadoop fs -Dfs.default.name=\"{}\" -Dhadoop.job.ugi=\"{}\"".format(
-            hdfs_name, hdfs_ugi)
-        self._time_out = time_out
-        self._sleep_inter = sleep_inter
-
-    def _run_cmd(self, cmd):
-        ret = fluid.core.run_cmd(cmd, self._time_out, self._sleep_inter)
-        if len(ret) <= 0:
-            return []
-
-        lines = ret.splitlines()
-        return lines
-
-    def list_dirs(self, fs_path):
-        if not self.stat(fs_path):
-            return []
-
-        dirs, _ = self.ls_dir(fs_path)
-        return dirs
-
-    def ls_dir(self, fs_path):
-        """
-        list directory under fs_path, and only give the pure name, not include the fs_path
-        """
-        cmd = "{} -ls {}".format(self._base_cmd, fs_path)
-        lines = self._run_cmd(cmd)
-
-        dirs = []
-        files = []
-        for line in lines:
-            arr = line.split()
-            if len(arr) != 8:
-                continue
-
-            if fs_path not in arr[7]:
-                continue
-
-            p = PurePosixPath(arr[7])
-            if arr[0][0] == 'd':
-                dirs.append(p.name)
-            else:
-                files.append(p.name)
-
-        return dirs, files
-
-    def is_dir(self, fs_path):
-        cmd = "{} -test -d {} ; echo $?".format(self._base_cmd, fs_path)
-
-        test = self._run_cmd(cmd)
-        if test[0].strip() == "0":
-            return True
-
-        return False
-
-    def stat(self, fs_path):
-        cmd = "{} -test -e {} ; echo $?".format(self._base_cmd, fs_path)
-
-        test = self._run_cmd(cmd)
-        if test[0].strip() == "0":
-            return True
-
-        return False
-
-    def upload(self, local_path, fs_path):
-        cmd = "{} -put {} {}".format(self._base_cmd, local_path, fs_path)
-        fluid.core.run_cmd(cmd, self._time_out, self._sleep_inter)
-
-    def download(self, fs_path, local_path):
-        cmd = "{} -get {} {}/".format(self._base_cmd, fs_path, local_path)
-        fluid.core.run_cmd(cmd, self._time_out, self._sleep_inter)
-
-    def mkdir(self, fs_path):
-
-        if not self.stat(fs_path):
-            cmd = "{} -mkdir {}".format(self._base_cmd, fs_path)
-            fluid.core.run_cmd(cmd, self._time_out, self._sleep_inter)
-
-    def mv(self, fs_src_path, fs_dst_path):
-        cmd = "{} -mv {} {}".format(self._base_cmd, fs_src_path, fs_dst_path)
-        fluid.core.run_cmd(cmd, self._time_out, self._sleep_inter)
-
-    def rmr(self, fs_path):
-        if not self.stat(fs_path):
-            return
-
-        cmd = "{} -rmr {}".format(self._base_cmd, fs_path)
-        return fluid.core.run_cmd(cmd, self._time_out, self._sleep_inter)
-
-    def rm(self, fs_path):
-        if not self.stat(fs_path):
-            return
-
-        cmd = "{} -rm {}".format(self._base_cmd, fs_path)
-        return fluid.core.run_cmd(cmd, self._time_out, self._sleep_inter)
-
-    def delete(self, fs_path):
-        if not self.stat(fs_path):
-            return
-
-        is_dir = self.is_dir(fs_path)
-        if is_dir:
-            return self.rmr(fs_path)
-
-        return self.rm(fs_path)
-
-    def need_upload_download(self):
-        return True
diff --git a/python/paddle/distributed/utils.py b/python/paddle/distributed/utils.py
index 511d501a2206e3ce106b7b76a5f3463b48353ff4..0bfd75b4994402359651be3bd6247847a6427ffb 100644
--- a/python/paddle/distributed/utils.py
+++ b/python/paddle/distributed/utils.py
@@ -143,7 +143,7 @@ class Trainer(object):
             return False
 
         if self.endpoint != t.endpoint or \
-                self.rank != t.rank :
+                self.rank != t.rank:
             return False
 
         for a, b in zip(self.gpus, t.gpus):
@@ -331,7 +331,9 @@ class TrainerProc(object):
     def __init__(self):
         self.proc = None
         self.log_fn = None
+        self.log_offset = None
         self.rank = None
+        self.local_rank = None
         self.cmd = None
 
 
@@ -377,7 +379,9 @@ def start_local_trainers(cluster,
         tp = TrainerProc()
         tp.proc = proc
         tp.rank = t.rank
+        tp.local_rank = idx
         tp.log_fn = fn
+        tp.log_offset = 0 if fn else None
         tp.cmd = cmd
 
         procs.append(tp)
@@ -385,6 +389,21 @@ def start_local_trainers(cluster,
     return procs
 
 
+def pull_worker_log(tp):
+    if tp.log_fn:
+        with open(tp.log_fn.name, 'r') as fin:
+            fin.seek(tp.log_offset, 0)
+            for line in fin:
+                try:
+                    sys.stdout.write(line)
+                except UnicodeEncodeError:
+                    sys.stdout.write(
+                        'UnicodeEncodeError occurs at this line. '
+                        'Please refer to the original log file "%s"\n' %
+                        tp.log_fn.name)
+            tp.log_offset = fin.tell()
+
+
 def watch_local_trainers(procs, nranks):
     try:
         error = False
@@ -392,6 +411,9 @@ def watch_local_trainers(procs, nranks):
         # wait all process finish or one error
         alive = False
         for p in procs:
+            if p.log_fn and p.local_rank == 0:
+                pull_worker_log(p)
+
             ret = p.proc.poll()
             if ret is None:
                 alive = True
diff --git a/python/paddle/fleet/__init__.py b/python/paddle/fleet/__init__.py
index 343a6ca9bd7dfd5dfb30caf77d1cb6bd10c1d090..a5a8d12ed440077714a59773e1c870848e9de229 100644
--- a/python/paddle/fleet/__init__.py
+++ b/python/paddle/fleet/__init__.py
@@ -13,16 +13,11 @@
 # limitations under the License.
 
 # TODO: define distributed api under this directory, 
-# __all__ = ['metric',
-#            'optimizer',
-#            'RoleMaker',
-#            'dataset',
-#            '	DatasetFactory',
-#            '	InMemoryDataset',
-#            '	QueueDataset',
-#            'transpiler',
-#            '	DistributeTranspiler',
-#            '	DistributeTranspilerConfig',
-#            '	HashName',
-#            '	RoundRobin',
-#            'collective']
+from .base.distributed_strategy import DistributedStrategy
+#from .base.role_maker import PaddleCloudRoleMaker, UserDefinedRoleMaker
+#from .base.fleet_base import Fleet
+
+#__all__ = [
+#    "DistributedStrategy", "PaddleCloudRoleMaker", "UserDefinedRoleMaker"
+#]
+__all__ = ['DistributedStrategy']
diff --git a/python/paddle/fleet/base/__init__.py b/python/paddle/fleet/base/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54
--- /dev/null
+++ b/python/paddle/fleet/base/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/paddle/fleet/base/distributed_strategy.py b/python/paddle/fleet/base/distributed_strategy.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ebaff3a0f70c734b97b1da509fdaa0b080c5e3f
--- /dev/null
+++ b/python/paddle/fleet/base/distributed_strategy.py
@@ -0,0 +1,514 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.fleet.proto import distributed_strategy_pb2
+from paddle.fluid.framework import Variable
+
+
+class DistributedJobInfo(object):
+    """
+    DistributedJobInfo will serialize all distributed training information
+    Just for inner use: 1) debug 2) replicate experiments
+    """
+
+    def __init__(self):
+        self.job_info = distributed_strategy_pb2.DistributedJobInfo()
+
+    def _set_worker_num(self, worker_num):
+        self.job_info.worker_num = worker_num
+
+    def _set_server_num(self, server_num):
+        self.job_info.server_num = server_num
+
+    def _set_worker_ips(self, worker_ips):
+        self.job_info.worker_ips.extend(worker_ips)
+
+    def _set_server_endpoints(self, server_endpoints):
+        self.job_info.server_endpoints.extend(server_endpoints)
+
+    def _set_origin_startup(self, origin_startup_prog):
+        self.job_info.origin_startup = str(origin_startup_prog)
+
+    def _set_origin_main(self, origin_main_prog):
+        self.job_info.origin_main = str(origin_main_prog)
+
+    def _distributed_main(self, distributed_main_prog):
+        self.job_info.distributed_main = str(distributed_main_prog)
+
+    def _optimizer_name(self, optimizer_name):
+        self.job_info.optimizer_name = optimizer_name
+
+    def _set_distributed_strategy(self, dist_strategy):
+        self.job_info.strategy = dist_strategy
+
+
+class DistributedStrategy(object):
+    def __init__(self):
+        self.strategy = distributed_strategy_pb2.DistributedStrategy()
+
+    @property
+    def amp(self):
+        return self.strategy.amp
+
+    @amp.setter
+    def amp(self, flag):
+        if isinstance(flag, bool):
+            self.strategy.amp = flag
+        else:
+            print("WARNING: amp should have value of bool type")
+
+    @property
+    def amp_loss_scaling(self):
+        return self.strategy.amp_loss_scaling
+
+    @amp_loss_scaling.setter
+    def amp_loss_scaling(self, value):
+        if isinstance(value, int):
+            self.strategy.amp_loss_scaling = value
+        else:
+            print("WARNING: amp_loss_scaling should have value of int type")
+
+    @property
+    def recompute(self):
+        return self.strategy.recompute
+
+    @recompute.setter
+    def recompute(self, flag):
+        if isinstance(flag, bool):
+            self.strategy.recompute = flag
+        else:
+            print("WARNING: recompute should have value of bool type")
+
+    @property
+    def recompute_checkpoints(self):
+        return self.strategy.recompute_checkpoints
+
+    @recompute_checkpoints.setter
+    def recompute_checkpoints(self, checkpoints):
+        if isinstance(checkpoints, list):
+            str_list = True
+            var_list = True
+            for item in checkpoints:
+                if not isinstance(item, str):
+                    str_list = False
+                if not isinstance(item, Variable):
+                    var_list = False
+
+            assert (str_list and var_list) == False
+            if str_list:
+                self.strategy.ClearField("recompute_checkpoints")
+                self.strategy.recompute_checkpoints.extend(checkpoints)
+            elif var_list:
+                names = [x.name for x in checkpoints]
+                self.strategy.ClearField("recompute_checkpoints")
+                self.strategy.recompute_checkpoints.extend(names)
+            else:
+                print(
+                    "WARNING: recompute_checkpoints should have value of list[Variable] or list[name] type"
+                )
+        else:
+            print(
+                "WARNING: recompute_checkpoints should have value of list[Variable] or list[name] type"
+            )
+
+    @property
+    def pipeline(self):
+        return self.strategy.pipeline
+
+    @pipeline.setter
+    def pipeline(self, flag):
+        if isinstance(flag, bool):
+            self.strategy.pipeline = flag
+        else:
+            print("WARNING: pipeline should have value of bool type")
+
+    @property
+    def pipeline_micro_batch(self):
+        return self.strategy.pipeline_micro_batch
+
+    @pipeline_micro_batch.setter
+    def pipeline_micro_batch(self, value):
+        if isinstance(value, int):
+            self.strategy.pipeline_micro_batch = value
+        else:
+            print("WARNING: pipeline micro batch should have value of int type")
+
+    @property
+    def localsgd(self):
+        return self.strategy.localsgd
+
+    @localsgd.setter
+    def localsgd(self, flag):
+        if isinstance(flag, bool):
+            self.strategy.localsgd = flag
+        else:
+            print("WARNING: localsgd should have value of bool type")
+
+    @property
+    def localsgd_k_step(self):
+        return self.strategy.localsgd_k_step
+
+    @localsgd_k_step.setter
+    def localsgd_k_step(self, value):
+        if isinstance(value, int):
+            self.strategy.localsgd_k_step = value
+        else:
+            print("WARNING: localsgd_k_step should have value of int type")
+
+    @property
+    def dgc(self):
+        return self.strategy.dgc
+
+    @dgc.setter
+    def dgc(self, flag):
+        if isinstance(flag, bool):
+            self.strategy.dgc = flag
+        else:
+            print("WARNING: dgc should have value of bool type")
+
+    @property
+    def hierachical_allreduce(self):
+        return self.strategy.hierachical_allreduce
+
+    @hierachical_allreduce.setter
+    def hierachical_allreduce(self, flag):
+        if isinstance(flag, bool):
+            self.strategy.hierachical_allreduce = flag
+        else:
+            print(
+                "WARNING: hierachical_allreduce should have value of bool type")
+
+    @property
+    def nccl_comm_num(self):
+        return self.strategy.nccl_comm_num
+
+    @nccl_comm_num.setter
+    def nccl_comm_num(self, value):
+        if isinstance(value, int):
+            self.strategy.nccl_comm_num = value
+        else:
+            print("WARNING: nccl_comm_num should have value of int type")
+
+    @property
+    def gradient_merge(self):
+        return self.strategy.gradient_merge
+
+    @gradient_merge.setter
+    def gradient_merge(self, flag):
+        if isinstance(flag, bool):
+            self.strategy.gradient_merge = flag
+        else:
+            print("WARNING: gradient_merge should have value of bool type")
+
+    @property
+    def gradient_merge_k_step(self):
+        return self.strategy.gradient_merge_k_step
+
+    @gradient_merge_k_step.setter
+    def gradient_merge_k_step(self, value):
+        if isinstance(value, int):
+            self.strategy.gradient_merge_k_step = value
+        else:
+            print(
+                "WARNING: gradient_merge_k_step should have value of int type")
+
+    @property
+    def sequential_execution(self):
+        return self.strategy.sequential_execution
+
+    @sequential_execution.setter
+    def sequential_execution(self, flag):
+        if isinstance(flag, bool):
+            self.strategy.sequential_execution = flag
+        else:
+            print(
+                "WARNING: sequential_execution should have value of bool type")
+
+    @property
+    def lars(self):
+        return self.strategy.lars
+
+    @lars.setter
+    def lars(self, flag):
+        if isinstance(flag, bool):
+            self.strategy.lars = flag
+        else:
+            print("WARNING: lars should have value of bool type")
+
+    @property
+    def lamb(self):
+        return self.strategy.lamb
+
+    @lamb.setter
+    def lamb(self, flag):
+        if isinstance(flag, bool):
+            self.strategy.lamb = flag
+        else:
+            print("WARNING: lamb should have value of bool type")
+
+    @property
+    def fuse_elewise_add_act_ops(self):
+        return self.strategy.fuse_elewise_add_act_ops
+
+    @fuse_elewise_add_act_ops.setter
+    def fuse_elewise_add_act_ops(self, flag):
+        if isinstance(flag, bool):
+            self.strategy.fuse_elewise_add_act_ops = flag
+        else:
+            print(
+                "WARNING: fuse_elewise_add_act_ops should have value of bool type"
+            )
+
+    @property
+    def fuse_bn_act_ops(self):
+        return self.strategy.fuse_bn_act_ops
+
+    @fuse_bn_act_ops.setter
+    def fuse_bn_act_ops(self, flag):
+        if isinstance(flag, bool):
+            self.strategy.fuse_bn_act_ops = flag
+        else:
+            print("WARNING: fuse_bn_act_ops should have value of bool type")
+
+    @property
+    def enable_auto_fusion(self):
+        return self.strategy.enable_auto_fusion
+
+    @enable_auto_fusion.setter
+    def enable_auto_fusion(self, flag):
+        if isinstance(flag, bool):
+            self.strategy.enable_auto_fusion = flag
+        else:
+            print("WARNING: enable_auto_fusion should have value of bool type")
+
+    @property
+    def fuse_relu_depthwise_conv(self):
+        return self.strategy.fuse_relu_depthwise_conv
+
+    @fuse_relu_depthwise_conv.setter
+    def fuse_relu_depthwise_conv(self, flag):
+        if isinstance(flag, bool):
+            self.strategy.fuse_relu_depthwise_conv = flag
+        else:
+            print(
+                "WARNING: fuse_relu_depthwise_conv should have value of bool type"
+            )
+
+    @property
+    def enable_inplace(self):
+        return self.strategy.enable_inplace
+
+    @enable_inplace.setter
+    def enable_inplace(self, flag):
+        if isinstance(flag, bool):
+            self.strategy.enable_inplace = flag
+        else:
+            print("WARNING: enable_inplace should have value of bool type")
+
+    @property
+    def fuse_all_reduce_ops(self):
+        return self.strategy.fuse_all_reduce_ops
+
+    @fuse_all_reduce_ops.setter
+    def fuse_all_reduce_ops(self, flag):
+        if isinstance(flag, bool):
+            self.strategy.fuse_all_reduce_ops = flag
+        else:
+            print("WARNING: fuse_all_reduce_ops should have value of bool type")
+
+    @property
+    def num_iteration_per_drop_scope(self):
+        return self.strategy.num_iteration_per_drop_scope
+
+    @num_iteration_per_drop_scope.setter
+    def num_iteration_per_drop_scope(self, flag):
+        if isinstance(flag, int):
+            self.strategy.num_iteration_per_drop_scope = flag
+        else:
+            print(
+                "WARNING: num_iteration_per_drop_scope should have value of int type"
+            )
+
+    @property
+    def sync_batch_norm(self):
+        return self.strategy.sync_batch_norm
+
+    @sync_batch_norm.setter
+    def sync_batch_norm(self, flag):
+        if isinstance(flag, bool):
+            self.strategy.sync_batch_norm = flag
+        else:
+            print("WARNING: sync_batch_norm should have value of bool type")
+
+    @property
+    def fuse_all_optimizer_ops(self):
+        return self.strategy.fuse_all_optimizer_ops
+
+    @fuse_all_optimizer_ops.setter
+    def fuse_all_optimizer_ops(self, flag):
+        if isinstance(flag, bool):
+            self.strategy.fuse_all_optimizer_ops = flag
+        else:
+            print(
+                "WARNING: fuse_all_optimizer_ops should have value of bool type")
+
+    @property
+    def sync(self):
+        return self.strategy.sync
+
+    @sync.setter
+    def sync(self, flag):
+        if isinstance(flag, bool):
+            self.strategy.sync = flag
+        else:
+            print("WARNING: sync should have value of bool type")
+
+    @property
+    def async_k_step(self):
+        return self.strategy.async_k_step
+
+    @async_k_step.setter
+    def async_k_step(self, value):
+        if isinstance(value, int):
+            self.strategy.async_k_step = value
+        else:
+            print("WARNING: async_k_step should have value of int type")
+
+    @property
+    def max_merge_var_num(self):
+        return self.strategy.max_merge_var_num
+
+    @max_merge_var_num.setter
+    def max_merge_var_num(self, value):
+        if isinstance(value, int):
+            self.strategy.max_merge_var_num = value
+        else:
+            print("WARNING: max_merge_var_num should have value of int type")
+
+    @property
+    def send_queue_size(self):
+        return self.strategy.send_queue_size
+
+    @send_queue_size.setter
+    def send_queue_size(self, value):
+        if isinstance(value, int):
+            self.strategy.send_queue_size = value
+        else:
+            print("WARNING: send_queue_size should have value of int type")
+
+    @property
+    def independent_recv_thread(self):
+        return self.strategy.independent_recv_thread
+
+    @independent_recv_thread.setter
+    def independent_recv_thread(self, value):
+        if isinstance(value, bool):
+            self.strategy.independent_recv_thread = value
+        else:
+            print(
+                "WARNING: independent_recv_thread should have value of int type")
+
+    @property
+    def min_send_grad_num_before_recv(self):
+        return self.strategy.min_send_grad_num_before_recv
+
+    @min_send_grad_num_before_recv.setter
+    def min_send_grad_num_before_recv(self, value):
+        if isinstance(value, int):
+            self.strategy.min_send_grad_num_before_recv = value
+        else:
+            print(
+                "WARNING: min_send_grad_num_before_recv should have value of int type"
+            )
+
+    @property
+    def thread_pool_size(self):
+        return self.strategy.thread_pool_size
+
+    @thread_pool_size.setter
+    def thread_pool_size(self, value):
+        if isinstance(value, int):
+            self.strategy.thread_pool_size = value
+        else:
+            print("WARNING:thread_pool_size should have value of int type")
+
+    @property
+    def send_wait_times(self):
+        return self.strategy.send_wait_times
+
+    @send_wait_times.setter
+    def send_wait_times(self, value):
+        if isinstance(value, int):
+            self.strategy.send_wait_times = value
+        else:
+            print("WARNING: send_wait_times should have value of int type")
+
+    @property
+    def runtime_split_send_recv(self):
+        return self.strategy.runtime_split_send_recv
+
+    @runtime_split_send_recv.setter
+    def runtime_split_send_recv(self, flag):
+        if isinstance(flag, bool):
+            self.strategy.runtime_split_send_recv = flag
+        else:
+            print("WARNING: runtime_split_send_recv should be bool type")
+
+    @property
+    def use_thread_barrier(self):
+        return self.strategy.use_thread_barrier
+
+    @use_thread_barrier.setter
+    def use_thread_barrier(self, flag):
+        if isinstance(flag, bool):
+            self.strategy.use_thread_barrier = flag
+        else:
+            print("WARNING: use_thread_barrier should be bool type")
+
+    @property
+    def enable_backward_optimizer_op_deps(self):
+        return self.strategy.enable_backward_optimizer_op_deps
+
+    @enable_backward_optimizer_op_deps.setter
+    def enable_backward_optimizer_op_deps(self, flag):
+        if isinstance(flag, bool):
+            self.strategy.enable_backward_optimizer_op_deps = flag
+        else:
+            print(
+                "WARNING: enable_backward_optimizer_op_deps should be bool type")
+
+    @property
+    def elastic(self):
+        return self.strategy.elastic
+
+    @elastic.setter
+    def elastic(self, flag):
+        if isinstance(flag, bool):
+            self.strategy.elastic = flag
+        else:
+            print("WARNING: elastic should have value of bool type")
+
+    @property
+    def auto(self):
+        return self.strategy.auto
+
+    @auto.setter
+    def auto(self, flag):
+        if isinstance(flag, bool):
+            self.strategy.auto = flag
+        else:
+            print("WARNING: auto should have value of bool type")
+
+    def __repr__(self):
+        return str(self.strategy)
diff --git a/python/paddle/fleet/base/fleet_base.py b/python/paddle/fleet/base/fleet_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..881044006479e074283c645c5247efa08c3b37b9
--- /dev/null
+++ b/python/paddle/fleet/base/fleet_base.py
@@ -0,0 +1,19 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+from paddle.fleet import RoleMakerBase
+from . import obj_creator
+
+# __all__ = ['Fleet']
diff --git a/python/paddle/fleet/base/obj_creator.py b/python/paddle/fleet/base/obj_creator.py
new file mode 100644
index 0000000000000000000000000000000000000000..15a403d79edcf7210863b624074827494684c38a
--- /dev/null
+++ b/python/paddle/fleet/base/obj_creator.py
@@ -0,0 +1,23 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from util_base import UtilBase
+
+
+def _create_fleet_obj_from_role_maker(role_maker):
+    pass
+
+
+def _create_fleet_util_from_role_maker(role_maker):
+    pass
diff --git a/python/paddle/fleet/base/role_maker.py b/python/paddle/fleet/base/role_maker.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6b5c8ac12e92dcbe6ca710f20d509cabaafac63
--- /dev/null
+++ b/python/paddle/fleet/base/role_maker.py
@@ -0,0 +1,16 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Defination of Role Makers."""
+
+# __all__ = ['RoleMakerBase', 'UserDefinedRoleMaker', 'PaddleCloudRoleMaker']
diff --git a/python/paddle/fleet/base/util_base.py b/python/paddle/fleet/base/util_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..7654d0bcd9cd657ab79e9acf74b8fdfb72c489de
--- /dev/null
+++ b/python/paddle/fleet/base/util_base.py
@@ -0,0 +1,64 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fleet Utils."""
+"""distributed operations"""
+"""basic collective operations in python"""
+"""remote file system"""
+
+# __all__ = ['UtilBase']
+'''
+class UtilBase(object):
+    def __init__(self, role_maker, fleet_obj):
+        self.role_maker = roke_maker
+        self.fleet_obj = fleet_obj
+
+    def set_file_system(self, fs_client):
+        self.fs_client = fs_client
+
+    def broadcast(self):
+        pass
+
+    def all_gather(self):
+        pass
+
+    def all_reduce(self):
+        pass
+
+    def reduce_scatter(self):
+        pass
+
+    def reduce(self):
+        pass
+
+    def get_file_shard(self, files):
+        pass
+
+    def feed_gen(self, batch_size, feed_vars_dims, feeded_vars_filelist):
+        pass
+
+    def save_program(program, output_dir):
+        pass
+
+    def load_program(input_dir):
+        pass
+
+    def load_var():
+        pass
+
+    def save_var():
+        pass
+
+    def print_on_rank(self):
+        pass
+'''
diff --git a/python/paddle/fleet/collective/__init__.py b/python/paddle/fleet/collective/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8647330f3290f3142cabca9a7e3fe162a9838dda
--- /dev/null
+++ b/python/paddle/fleet/collective/__init__.py
@@ -0,0 +1,12 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
diff --git a/python/paddle/fleet/dataset/__init__.py b/python/paddle/fleet/dataset/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8647330f3290f3142cabca9a7e3fe162a9838dda
--- /dev/null
+++ b/python/paddle/fleet/dataset/__init__.py
@@ -0,0 +1,12 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
diff --git a/python/paddle/fleet/metrics/__init__.py b/python/paddle/fleet/metrics/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..abf198b97e6e818e1fbe59006f98492640bcee54
--- /dev/null
+++ b/python/paddle/fleet/metrics/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/paddle/fleet/metrics/metric.py b/python/paddle/fleet/metrics/metric.py
new file mode 100644
index 0000000000000000000000000000000000000000..847ddc47ac89114f2012bc6b9990a69abfe39fb3
--- /dev/null
+++ b/python/paddle/fleet/metrics/metric.py
@@ -0,0 +1,13 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/paddle/fleet/parameter_server/__init__.py b/python/paddle/fleet/parameter_server/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..847ddc47ac89114f2012bc6b9990a69abfe39fb3
--- /dev/null
+++ b/python/paddle/fleet/parameter_server/__init__.py
@@ -0,0 +1,13 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py
index e72f7a04e6057c652f653d8901ca178b094a0de7..898c7d295641863740288e3f4e1da39266bce183 100644
--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
@@ -45,7 +45,7 @@ class ProgramStats(object):
         input_names = []
         for name in self.var_op_deps:
             if len(self.var_op_deps[name]["var_as_output_ops"]) == 0 and \
-               len(self.var_op_deps[name]["var_as_input_ops"]) > 0:
+                len(self.var_op_deps[name]["var_as_input_ops"]) > 0:
                 if self.block.var(name).persistable:
                     continue
                 input_names.append(name)
@@ -433,7 +433,7 @@ def _addup_repetitive_outputs_(op_descs, block_idx):
                         ] + arg_names[arg_idx:]
 
                     new_name = var_name + "@RENAME@block" + str(block_idx) + "@" + \
-                        str(var_rename_count[var_name])
+                               str(var_rename_count[var_name])
                     var_rename_count[var_name] += 1
                     arg_names[arg_idx] = new_name
                     op_desc.set_output(param_name, arg_names)
@@ -611,7 +611,7 @@ def _find_not_need_ops(grad_op_descs, forward_ops, input_grad_names_set):
     not_need_op_descs_set = set(not_need_op_descs)
     grad_op_descs_set = set(grad_op_descs)
     # If a backward computational graph is simply one sub-graph header, the
-    # not_need_op_descs will be whole graph, this IF clause avoids it. 
+    # not_need_op_descs will be whole graph, this IF clause avoids it.
     if grad_op_descs_set == not_need_op_descs_set:
         return set()
     return not_need_op_descs_set
@@ -662,7 +662,7 @@ def _append_backward_ops_with_checkpoints_(
     checkpoints_name = list(set(checkpoints_name))
     local_block = block.program._create_block()
     buffer_block = block.program._create_block()
-    # 0) deal with forward recomputing program descs  
+    # 0) deal with forward recomputing program descs
     program_stat = ProgramStats(block, ops)
     program_stat.modify_forward_desc_for_recompute()
     program_stat.build_stats()
@@ -797,32 +797,51 @@ def _append_backward_ops_with_checkpoints_(
     return program_stat, checkpoints_name, vars_should_be_hold, recompute_segments
 
 
-def _get_sub_block_path(sub_block, sub_block_op_desc, no_grad_set):
+def _get_sub_block_path(sub_block,
+                        sub_block_op_desc,
+                        no_grad_set,
+                        op_path_dict,
+                        sub_block_target_names=None):
     """
     Get output vars in subblock which will be assigned to parent block.
-    It is used to find the grad path in subblock
+    It is used to find the grad path in subblock.
+
+    Args:
+        sub_block(Block): The sub-block in which to get op path.
+        sub_block_op_desc: The op desc of the sub-block op such as 'while', 'conditional_block' and 'recurrent'.
+        no_grad_set(set): The set of no grad var name. no_grad_set will be changed.
+        op_path_dict(dict): op_path_dict will be changed.
+            key(int) block index
+            val(list) the op path of block(index)
+        sub_block_target_names(set): Target var names of sub-block.
+    Return:
+        The forward op path of sub-block corresponding to backward op.
     """
+
     assert sub_block_op_desc.has_attr(
         "sub_block") and sub_block.idx == sub_block_op_desc._block_attr_id(
             "sub_block")
-    # TODO(huihuangzheng): add support for recurrent op and while op
-    if sub_block_op_desc.type == "conditional_block":
-        sub_outputs = []
-        sub_assign_to_out_ops = []
-        for var in sub_block_op_desc.output_arg_names:
+    assert isinstance(sub_block_target_names, (set, type(None)))
+
+    if sub_block_target_names is None:
+        sub_block_target_names = sub_block_op_desc.output_arg_names
+
+    # TODO(huihuangzheng): add support for recurrent op.
+    if sub_block_op_desc.type in ["conditional_block", "while"]:
+        # Step1: get the output vars in sub-block
+        sub_outputs = [
+            sub_block._var_recursive(var) for var in sub_block_target_names
+        ]
+        for var in sub_block_target_names:
             for op_desc in sub_block.ops:
-                if op_desc.type == "assign" and var in op_desc.output_arg_names:
-                    sub_assign_to_out_ops.append(op_desc)
+                if var in op_desc.output_arg_names:
                     for name in op_desc.input_arg_names:
-                        if sub_block.has_var(name):
-                            sub_outputs.append(sub_block.var(name))
+                        sub_outputs.append(sub_block._var_recursive(name))
 
+        # Step2: find op path of sub-block
+        is_while = sub_block_op_desc.type in ["while"]
         sub_block_op_path = _find_op_path_(sub_block, sub_outputs, [],
-                                           no_grad_set)
-        # TODO better way than finding in list
-        for op_desc in sub_assign_to_out_ops:
-            if op_desc not in sub_block_op_path:
-                sub_block_op_path.append(op_desc)
+                                           no_grad_set, op_path_dict, is_while)
         return sub_block_op_path
     return sub_block.ops
 
@@ -846,7 +865,8 @@ def _append_backward_ops_(block,
                           no_grad_dict,
                           grad_to_var,
                           callbacks=None,
-                          input_grad_names_set=None):
+                          input_grad_names_set=None,
+                          op_path_dict=None):
     """
     Create all grad ops, and insert them into given block
 
@@ -864,6 +884,9 @@ def _append_backward_ops_(block,
         input_grad_names_set(set): this set is used to store the gradients' name which is
             generated by backward ops, and input_grad_names_set can help to prune the unnecessary
             backward ops.
+        op_path_dict(dict): op_path_dict will be changed.
+            key(int) block index
+            val(list) the op path of block(index)
     """
     if callbacks is not None:
         assert (isinstance(callbacks, list))
@@ -888,11 +911,10 @@ def _append_backward_ops_(block,
             # see follwing comments for why set None here.
             pre_input_grad_names_set = copy.copy(input_grad_names_set)
             input_grad_names_set = None
-            sub_block_path = _get_sub_block_path(sub_block, op,
-                                                 no_grad_dict[sub_block.idx])
+            sub_block_path = op_path_dict[op._block_attr_id("sub_block")]
             _append_backward_ops_(sub_block, sub_block_path, grad_sub_block,
                                   no_grad_dict, grad_to_var, callbacks,
-                                  input_grad_names_set)
+                                  input_grad_names_set, op_path_dict)
             input_grad_names_set = pre_input_grad_names_set
 
             program._rollback()
@@ -1013,7 +1035,7 @@ def _find_parent_op_(sub_block):
                     "sub_block") == sub_block_id:
                 return op
 
-    # NOTE(paddle-dev): When optimizer is added in conditional block, 
+    # NOTE(paddle-dev): When optimizer is added in conditional block,
     # sub_block may not be found.
     return None
 
@@ -1072,7 +1094,7 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
             if var != core.empty_var_name()
         ]
 
-        # If the outputs of grad op is empty, just remove it 
+        # If the outputs of grad op is empty, just remove it
         if not outputs:
             ops_to_remove.append(op_idx)
             continue
@@ -1358,7 +1380,10 @@ def append_backward(loss,
 
         block_no_grad_set = set(
             map(_strip_grad_suffix_, no_grad_dict[block_idx]))
-        op_path = _find_op_path_(block, [loss], [], block_no_grad_set)
+
+        op_path_dict = dict()
+        op_path = _find_op_path_(block, [loss], [], block_no_grad_set,
+                                 op_path_dict)
 
         no_grad_vars = _find_no_grad_vars(block, op_path, [loss],
                                           block_no_grad_set)
@@ -1371,13 +1396,13 @@ def append_backward(loss,
         # For double backward, input_grad_names is used for filtering
         # some non-used gradients op(s).
 
-        # Todo(liym27): need a better design.
+        # TODO(liym27): need a better design.
         # not support double grad in control flow sub-block now.
         if not is_in_control_flow:
             if program._appending_grad_times > 1:
                 input_grad_names_set = set([_append_grad_suffix_(loss.name)])
 
-        # Todo: support _append_backward_ops_with_checkpoints_ in
+        # TODO: support _append_backward_ops_with_checkpoints_ in
         #  sub-block (control flow)
         if checkpoints != None and \
                 isinstance(checkpoints, list) and \
@@ -1400,7 +1425,8 @@ def append_backward(loss,
                 no_grad_dict,
                 grad_to_var,
                 callbacks,
-                input_grad_names_set=input_grad_names_set)
+                input_grad_names_set=input_grad_names_set,
+                op_path_dict=op_path_dict)
 
     grad_info_map = dict()
 
@@ -1508,13 +1534,14 @@ def _get_output_names(cur_block, targets):
     """
 
     block = targets[0].block if targets else cur_block
-    prog = cur_block.program
-    if _is_ancestor_block(block, cur_block):
-        return set()
-
     current_output_names = set([out.name for out in targets])
 
-    # if `cur_block` is an ancestor of `targets[0].block`, run while loop
+    # 1. If `targets` in cur_block or the ancestral block of `cur_block`
+    if block.idx == cur_block.idx or _is_ancestor_block(block, cur_block):
+        return current_output_names
+
+    # 2. If `cur_block` is an ancestor of `targets[0].block`, run while loop
+    prog = cur_block.program
     while block.idx != cur_block.idx:
         assert block.parent_idx != -1
         parent_block = prog.block(block.parent_idx)
@@ -1554,12 +1581,32 @@ def _find_no_grad_vars(block, op_path, targets, no_grad_set):
     return set(no_grad_var)
 
 
-def _find_op_path_(block, outputs, inputs, no_grad_set):
+def _find_op_path_(block,
+                   targets,
+                   inputs,
+                   no_grad_set,
+                   op_path_dict=None,
+                   is_while=False):
     """
-    no_grad_set will also be changed
+    It is used to find the grad path in `block`.
+
+    Args:
+        block(Block): The block in which to get op path.
+        targets(list[Variable]): The target variables.
+        inputs(list[Variable]): The input variables.
+        no_grad_set(set): The set of no grad var name. no_grad_set will be changed.
+        op_path_dict(dict): op_path_dict will be changed. op_path_dict will be changed.
+            key(int) block index
+            val(list) the op path of block(index)
+        is_while(bool): Whether or not `block` is while block
+    Return:
+        The forward op path of block corresponding to backward op.
     """
+
     input_names = set([inp.name for inp in inputs])
-    output_names = _get_output_names(block, outputs)
+    output_names = _get_output_names(block, targets)
+    if op_path_dict is None:
+        op_path_dict = dict()
 
     relevant_op_flags = [True] * len(block.ops)
 
@@ -1576,6 +1623,15 @@ def _find_op_path_(block, outputs, inputs, no_grad_set):
                 relevant_op_flags[i] = False
 
     for i, op in reversed(list(enumerate(block.ops))):
+        if op.has_attr("sub_block"):
+            sub_block_id = op._block_attr_id("sub_block")
+            sub_block = block.program.block(sub_block_id)
+            sub_block_target_names = output_names & set(op.output_arg_names)
+            sub_block_path = _get_sub_block_path(sub_block, op,
+                                                 set(), op_path_dict,
+                                                 sub_block_target_names)
+            op_path_dict[sub_block_id] = sub_block_path
+
         if _some_in_set_(
                 op.desc.output_arg_names(),
                 output_names) and core.has_non_empty_grad_op_maker(op.type):
@@ -1585,6 +1641,14 @@ def _find_op_path_(block, outputs, inputs, no_grad_set):
         else:
             relevant_op_flags[i] = False
 
+    if is_while:
+        # If block is while block, dealing with op specifically again.
+        # TODO(liym27): Consider special types of ops.
+        for i, op in reversed(list(enumerate(block.ops))):
+            if relevant_op_flags[i] == False \
+                    and _some_in_set_(op.desc.output_arg_names(),output_names):
+                relevant_op_flags[i] = True
+
     op_path = [
         block.ops[i] for i in range(len(block.ops)) if relevant_op_flags[i]
     ]
@@ -1688,7 +1752,10 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None):
             raise "input must be in the same program as targets"
 
     block_no_grad_set = set(map(_strip_grad_suffix_, no_grad_dict[0]))
-    op_path = _find_op_path_(block, targets, inputs, block_no_grad_set)
+
+    op_path_dict = dict()
+    op_path = _find_op_path_(block, targets, inputs, block_no_grad_set,
+                             op_path_dict)
     no_grad_dict[0].update(list(map(_append_grad_suffix_, block_no_grad_set)))
     grad_to_var = dict()
     grad_info_map = dict()
@@ -1698,7 +1765,8 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None):
         block,
         no_grad_dict,
         grad_to_var,
-        input_grad_names_set=input_grad_names_set)
+        input_grad_names_set=input_grad_names_set,
+        op_path_dict=op_path_dict)
 
     # Because calc_gradient may be called multiple times,
     # we need rename the internal gradient variables so that they have
diff --git a/python/paddle/fluid/contrib/layers/nn.py b/python/paddle/fluid/contrib/layers/nn.py
index 273a669a1414e858920f6f5c2ad1fce8810eb829..50e6eaa80c135b24efa3844a6387278cc247af3a 100644
--- a/python/paddle/fluid/contrib/layers/nn.py
+++ b/python/paddle/fluid/contrib/layers/nn.py
@@ -35,7 +35,7 @@ __all__ = [
     'match_matrix_tensor', 'tree_conv', 'fused_embedding_seq_pool',
     'multiclass_nms2', 'search_pyramid_hash', 'shuffle_batch', 'partial_concat',
     'partial_sum', 'tdm_child', 'rank_attention', 'tdm_sampler', 'batch_fc',
-    '_pull_box_extended_sparse'
+    '_pull_box_extended_sparse', 'bilateral_slice'
 ]
 
 
@@ -1409,3 +1409,65 @@ def _pull_box_extended_sparse(input, size, extend_size=64, dtype='float32'):
     if len(outs) == 1:
         return outs[0], outs_extend[0]
     return outs, outs_extend
+
+
+def bilateral_slice(x, guide, grid, has_offset, name=None):
+    """
+    :alias_main: paddle.nn.functional.bilateral_slice
+	:alias: paddle.nn.functional.bilateral_slice,paddle.nn.functional.vision.bilateral_slice
+	:old_api: paddle.fluid.layers.bilateral_slice
+
+    This operation implements bilateral slicing on the input according to the guide map.
+    For more information of bilateral slicing, please refer to Deep Bilateral Learning for Real-Time Image Enhancement <https://groups.csail.mit.edu/graphics/hdrnet/data/hdrnet.pdf>_
+
+    Args:
+        x(Variable): The input tensor, which is a 4-D tensor with shape
+                     [N, C, H, W], N is the batch size, C is the channel
+                     number, H and W is the feature height and width.
+                     The data type is float32 and float64.
+        guide(Variable): Input grid tensor of shape [N, H, W]. The
+                        data type is float32 and float64.
+        grid(Variable): Input grid tensor of shape [N, C, D, H, W]. The
+                        data type is float32 and float64.
+        has_offset(bool): Whether to slice with affine offset.
+        name(str, optional): For detailed information, please refer
+                             to :ref:`api_guide_Name`. Usually name is no need to set and
+                             None by default.
+
+    Returns:
+        Variable: Output of shape [N, C, H, W]. The data type is same as input tensor.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle.fluid as fluid
+
+            x = fluid.data(name='x', shape=[None, 3, 101, 60], dtype='float32')
+            guide = fluid.data(name='guide', shape=[None, 101, 60], dtype='float32')
+            grid = fluid.data(name='grid', shape=[None, 12, 8, 10, 6], dtype='float32')
+
+            # without offset
+            output = fluid.layers.bilateral_slice(x, guide, grid, has_offset=False)
+            
+            # has offset
+            output = fluid.layers.bilateral_slice(x, guide, grid, has_offset=True)
+
+    """
+    helper = LayerHelper("bilateral_slice", **locals())
+
+    check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'bilateral_slice')
+    check_variable_and_dtype(guide, 'guide', ['float32', 'float64'],
+                             'bilateral_slice')
+    check_variable_and_dtype(grid, 'grid', ['float32', 'float64'],
+                             'bilateral_slice')
+
+    out = helper.create_variable_for_type_inference(x.dtype)
+    inputs = {'X': x, 'Guide': guide, 'Grid': grid}
+
+    helper.append_op(
+        type='bilateral_slice',
+        inputs=inputs,
+        attrs={'has_offset': has_offset},
+        outputs={'Out': out})
+    return out
diff --git a/python/paddle/fluid/contrib/slim/quantization/__init__.py b/python/paddle/fluid/contrib/slim/quantization/__init__.py
index ce409b0dbfc6b83b8074f198880d85518a633fc6..ee7e6536f2eff240b7a6f28407103a4f7887f074 100644
--- a/python/paddle/fluid/contrib/slim/quantization/__init__.py
+++ b/python/paddle/fluid/contrib/slim/quantization/__init__.py
@@ -20,15 +20,18 @@ from . import quantization_strategy
 from .quantization_strategy import *
 from . import mkldnn_post_training_strategy
 from .mkldnn_post_training_strategy import *
-from . import qat_int8_mkldnn_pass
-from .qat_int8_mkldnn_pass import *
-from . import qat2_int8_mkldnn_pass
-from .qat2_int8_mkldnn_pass import *
+from . import quant_int8_mkldnn_pass
+from .quant_int8_mkldnn_pass import *
+from . import quant2_int8_mkldnn_pass
+from .quant2_int8_mkldnn_pass import *
 from . import post_training_quantization
 from .post_training_quantization import *
+from . import imperative
+from .imperative import *
 
 __all__ = quantization_pass.__all__ + quantization_strategy.__all__
 __all__ += mkldnn_post_training_strategy.__all__
-__all__ += qat_int8_mkldnn_pass.__all__
-__all__ += qat2_int8_mkldnn_pass.__all__
+__all__ += quant_int8_mkldnn_pass.__all__
+__all__ += quant2_int8_mkldnn_pass.__all__
 __all__ += post_training_quantization.__all__
+__all__ += imperative.__all__
diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/__init__.py b/python/paddle/fluid/contrib/slim/quantization/imperative/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ea62b5f3246ff5e1bf001eea88a198c23faf78a
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/quantization/imperative/__init__.py
@@ -0,0 +1,25 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+from . import quant_nn
+from .quant_nn import *
+
+from . import qat
+from .qat import *
+
+__all__ = []
+__all__ += quant_nn.__all__
+__all__ += qat.__all__
diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
new file mode 100644
index 0000000000000000000000000000000000000000..c77648ac7b56e2c1a2f7bae6311fe7e5c2eceaa4
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py
@@ -0,0 +1,229 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import numpy as np
+import sys
+from paddle.fluid import dygraph
+from paddle.fluid.dygraph.nn import Conv2D
+from paddle.fluid.dygraph.nn import Linear
+from paddle.fluid.log_helper import get_logger
+from . import quant_nn
+
+__all__ = ['ImperativeQuantAware']
+
+_logger = get_logger(
+    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
+
+
+class ImperativeQuantAware(object):
+    """
+    Add the fake quant logic for given quantizable layers, namely add the quant_dequant
+    computational logic both for activation inputs and weight inputs.
+    """
+
+    def __init__(self,
+                 weight_bits=8,
+                 activation_bits=8,
+                 weight_quantize_type='abs_max',
+                 activation_quantize_type='moving_average_abs_max',
+                 moving_rate=0.9,
+                 quantizable_layer_type=['Conv2D', 'Linear']):
+        """
+        The constructor for ImperativeQuantAware.
+
+        Args:
+            weight_bits(int): quantization bit number for weights,
+                whereas the bias is not quantized.
+            activation_bits(int): quantization bit number for activations.
+            weight_quantize_type(str): quantization type for weights,
+                which supports 'abs_max' now. The 'moving_average_abs_max'
+                usually is not used for weights, since weights are fixed once the
+                model is well trained.
+            activation_quantize_type(str): quantization type for activations,
+                which supports 'abs_max' and 'moving_average_abs_max' now.
+                If using 'abs_max' mode, the quantization scale will be calculated
+                dynamically each step in both training and testing period. If using
+                'moving_average_abs_max', the static quantization scale will be calculated
+                during training and used in inference.
+            moving_rate(float): the parameter for 'moving_average_abs_max' quantization.
+            quantizable_op_type(list[str]): List the type of layers that will be quantized. 
+                Default is ['Conv2D', 'Linear']. The quantizable_op_type in
+                QuantizationFreezePass and ConvertToInt8Pass must be the same as this.
+
+
+        Examples:
+        .. code-block:: python
+
+            from paddle.fluid.contrib.slim.quantization \
+                import ImperativeQuantAware
+            from paddle.incubate.hapi.vision.models \
+                import resnet
+            
+            model = resnet.resnet50(pretrained=True)
+
+            imperative_qat = ImperativeQuantAware(
+                weight_quantize_type='abs_max',
+                activation_quantize_type='moving_average_abs_max')
+            
+            # Add the fake quant logical.
+            # The original model will be rewrite.
+            imperative_qat.quantize(model)
+
+            # Fine-tune the quantized model
+            # ...
+            
+            # Save quant model for the inference.
+            imperative_qat.save_quantized_model(
+                dirname="./resnet50_qat",
+                model=model,
+                input_shape=[(3, 224, 224)],
+                input_dtype=['float32'],
+                feed=[0],
+                fetch=[0])
+        """
+        super(ImperativeQuantAware, self).__init__()
+        self._weight_bits = weight_bits
+        self._activation_bits = activation_bits
+        self._moving_rate = moving_rate
+
+        quant_type = {'abs_max', 'moving_average_abs_max'}
+        if activation_quantize_type not in quant_type:
+            raise ValueError(
+                "Unknown activation_quantize_type : '%s'. It can only be "
+                "'abs_max' or 'moving_average_abs_max' now." %
+                (str(activation_quantize_type)))
+        if weight_quantize_type not in quant_type:
+            raise ValueError(
+                "Unknown weight_quantize_type: '%s'. It can only be "
+                "'abs_max' or 'moving_average_abs_max' now." %
+                (str(weight_quantize_type)))
+        self._activation_quantize_type = activation_quantize_type
+        self._weight_quantize_type = weight_quantize_type
+
+        self._quant_layers_map = {'Conv2D': Conv2D, 'Linear': Linear}
+        self._quantizable_layer_type = tuple(
+            self._quant_layers_map[layer]
+            if layer in self._quant_layers_map else layer
+            for layer in quantizable_layer_type)
+        for layer in self._quantizable_layer_type:
+            assert not isinstance(
+                layer, str), "{} is unspported to be quantized.".format(layer)
+
+    def quantize(self, model):
+        """
+        According to weights' and activations' quantization types, the model will be added some fake
+        quant ops, such as fake_quantize_dequantize_moving_average_abs_max, fake_quantize_dequantize_abs_max
+        and so on.
+
+        Args:
+            model(fluid.dygraph.Layer): the model to be quantized.
+        Returns:
+            None
+        """
+        for name, layer in model.named_sublayers():
+            if not isinstance(layer, self._quantizable_layer_type):
+                continue
+
+            scopes = name.split('.')
+            target = scopes[-1]
+            obj = model
+            parent = model
+            for i in range(len(scopes) - 1):
+                obj = getattr(parent, scopes[i])
+                parent = obj
+
+            quant_layer = self._get_quantized_counterpart(layer)
+            setattr(obj, target, quant_layer)
+
+    def save_quantized_model(self,
+                             dirname,
+                             model,
+                             input_shape,
+                             input_dtype,
+                             feed,
+                             fetch,
+                             append_batch_size=True):
+        """
+        Save the quantized model for the inference.
+
+        Args:
+            dirname (str): the directory to save the quantized model.
+            model(fluid.dygraph.Layer): the quantized model to be saved.
+            input_shape(list[tuple(int)]): The shape value for each input,
+                e.g. [(3, 224, 224)].
+            input_dtype(list[str]): The dtype value for each input,
+                e.g. ['float32'].
+            feed(list[int]): the indices of the input variables of the
+                imperative functions which will be saved as input variables in
+                inference model.
+            fetch(list[int]): the indices of the returned variable of the
+                imperative functions which will be saved as output variables in
+                inference model.
+            append_batch_size(bool, optional):
+                If true, it prepends an extra axis to the input_shape, meanwhile,
+                the input_shape shouldn't contain the batch size dimension.
+                Otherwise, it just uses the input_shape. Default True.
+        Returns:
+            None
+        """
+        assert isinstance(
+            input_shape, list), "The parameter `input_shape` shoubld be a list."
+        assert isinstance(
+            input_dtype, list), "The parameter `input_dtype` shoubld be a list."
+        assert isinstance(feed, list), "The parameter `feed` shoubld be a list."
+        assert isinstance(fetch,
+                          list), "The parameter `fetch` shoubld be a list."
+        assert len(input_shape) == len(
+            input_dtype
+        ), "The length of input_shape should be equal to  input_dtype's."
+        assert len(input_dtype) == len(
+            feed), "The length of input_shape should be equal to  feed's."
+
+        def _convert(model, *args):
+            return model(*args)
+
+        prog_trans = dygraph.ProgramTranslator()
+        with dygraph.guard():
+            model.eval()
+            input_vars = []
+            for shape, dtype in zip(input_shape, input_dtype):
+                raw_data = np.random.random(shape)
+                input_data = raw_data[np.newaxis, :].astype(
+                    dtype) if append_batch_size else raw_data.astype(dtype)
+                input_var = dygraph.to_variable(input_data)
+                input_vars.append(input_var)
+            prog_trans.get_output(_convert, model, *input_vars)
+        prog_trans.save_inference_model(dirname, feed, fetch)
+
+    def _get_quantized_counterpart(self, layer):
+        quant_layers = tuple(self._quant_layers_map.values())
+        quantized_counterpart = tuple('Quantized' + k
+                                      for k in self._quant_layers_map.keys())
+
+        predicate = lambda value: isinstance(layer, value)
+        index_generator = (i for i, v in enumerate(quant_layers)
+                           if predicate(v))
+
+        try:
+            index = next(index_generator)
+        except StopIteration:
+            _logger.fatal("The layer {} is unsupported to be quantized.".format(
+                layer.full_name()))
+            sys.exit(-1)
+
+        quantized_layer = quant_nn.__dict__[quantized_counterpart[index]](
+            layer, self._weight_bits, self._activation_bits, self._moving_rate,
+            self._weight_quantize_type, self._activation_quantize_type)
+        return quantized_layer
diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py b/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py
new file mode 100644
index 0000000000000000000000000000000000000000..59dd9867abb95dea74e1cdc362b671e7d4120d70
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py
@@ -0,0 +1,375 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.fluid.dygraph import layers
+from paddle.fluid import core
+from paddle.fluid import dygraph_utils
+from paddle.fluid import unique_name
+from paddle.fluid.param_attr import ParamAttr
+from paddle.fluid.framework import _varbase_creator
+from paddle.fluid.framework import in_dygraph_mode
+from paddle.fluid.initializer import Constant
+from paddle.fluid.data_feeder import check_variable_and_dtype
+
+__all__ = [
+    'FakeQuantMovingAverage', 'FakeQuantAbsMax', 'QuantizedConv2D',
+    'QuantizedLinear'
+]
+
+
+class FakeQuantMovingAverage(layers.Layer):
+    """
+    FakeQuantMovingAverage layer does the moving_average_abs_max quant and then dequant.
+    Its computational formula is described as below:
+
+    :math:`scale = (moving\_rate*accum+max(abs(x)))/(moving\_rate*state+1)`
+    :math:`range = 2^{bit\_length - 1} - 1`
+    :math:`Out = round(X / scale * range) * scale / range`
+    """
+
+    def __init__(self,
+                 name=None,
+                 moving_rate=0.9,
+                 quant_bits=8,
+                 dtype='float32'):
+        super(FakeQuantMovingAverage, self).__init__()
+        self._moving_rate = moving_rate
+        self._quant_bits = quant_bits
+
+        scale_prefix = "{}.scale".format(
+            name) if name else 'quant_dequant.scale'
+        scale_attr = ParamAttr(
+            name=unique_name.generate(scale_prefix),
+            initializer=Constant(0.001),
+            trainable=False)
+        self._scale = self.create_parameter(
+            shape=[1], attr=scale_attr, dtype=dtype)
+        self._scale.stop_gradient = True
+
+        state_prefix = "{}.state".format(
+            name) if name else 'quant_dequant.state'
+        state_attr = ParamAttr(
+            name=unique_name.generate(state_prefix),
+            initializer=Constant(1),
+            trainable=False)
+        self._state = self.create_parameter(
+            shape=[1], attr=state_attr, dtype=dtype)
+        self._state.stop_gradient = True
+
+        accum_prefix = "{}.accum".format(
+            name) if name else 'quant_dequant.accum'
+        accum_attr = ParamAttr(
+            name=unique_name.generate(accum_prefix),
+            initializer=Constant(1),
+            trainable=False)
+        self._accum = self.create_parameter(
+            shape=[1], attr=accum_attr, dtype=dtype)
+        self._accum.stop_gradient = True
+
+    def forward(self, input):
+        if in_dygraph_mode():
+            attrs = ('moving_rate', self._moving_rate, 'bit_length',
+                     self._quant_bits, 'is_test', not self.training)
+            quant_out = _varbase_creator(
+                type=input.type,
+                name="{}.quantized.dequantized".format(input.name),
+                shape=input.shape,
+                dtype=input.dtype,
+                persistable=False)
+            state = self._state if self.training else None
+            accum = self._accum if self.training else None
+
+            out, _, _, _ = core.ops.fake_quantize_dequantize_moving_average_abs_max(
+                input, self._scale, accum, state, quant_out, self._scale, state,
+                accum, *attrs)
+            return out
+
+        check_variable_and_dtype(input, 'input', ['float32'],
+                                 "FakeQuantMovingAverage")
+        attrs = {
+            'moving_rate': self._moving_rate,
+            'bit_length': self._quant_bits,
+            'is_test': not self.training
+        }
+        inputs = {"X": [input], "InScale": [self._scale]}
+        quant_out = self._helper.create_variable(
+            name="{}.quantized.dequantized".format(input.name),
+            dtype=input.dtype,
+            type=core.VarDesc.VarType.LOD_TENSOR,
+            persistable=False,
+            stop_gradient=False)
+        outputs = {"Out": [quant_out], "OutScale": [self._scale]}
+
+        if self.training:
+            inputs['InState'] = [self._state]
+            inputs['InAccum'] = [self._accum]
+            outputs['OutState'] = [self._state]
+            outputs['OutAccum'] = [self._accum]
+
+        self._helper.append_op(
+            type="fake_quantize_dequantize_moving_average_abs_max",
+            inputs=inputs,
+            outputs=outputs,
+            attrs=attrs)
+
+        return quant_out
+
+
+class FakeQuantAbsMax(layers.Layer):
+    """
+    FakeQuantAbsMax layer does the abs_max quant and then dequant.
+    Its computational formula is described as below:
+
+    :math:`scale = max(abs(X))`
+    :math:`range = 2^{bit\_length - 1} - 1`
+    :math:`Out = round(X / scale * range) * scale / range`
+    """
+
+    def __init__(self,
+                 name=None,
+                 quant_bits=8,
+                 dtype='float32',
+                 quant_on_weight=False):
+        super(FakeQuantAbsMax, self).__init__()
+        self._quant_bits = quant_bits
+        self._dtype = dtype
+        self._name = name
+        scale_prefix = "{}.scale".format(
+            name) if name else 'quant_dequant.scale'
+        self._scale_name = unique_name.generate(scale_prefix)
+        if quant_on_weight:
+            scale_attr = ParamAttr(
+                name=self._scale_name,
+                initializer=Constant(0.0),
+                trainable=False)
+            self._scale = self.create_parameter(
+                shape=[1], attr=scale_attr, dtype=self._dtype)
+            self._scale.stop_gradient = True
+        else:
+            self._scale = None
+
+    def forward(self, input):
+        if in_dygraph_mode():
+            attrs = ('bit_length', self._quant_bits)
+            quant_out = _varbase_creator(
+                type=input.type,
+                name="{}.quantized.dequantized".format(input.name),
+                shape=input.shape,
+                dtype=input.dtype,
+                persistable=False)
+            out_scale = self._scale
+            if not out_scale:
+                out_scale = _varbase_creator(
+                    type=core.VarDesc.VarType.LOD_TENSOR,
+                    name=self._scale_name,
+                    shape=[1],
+                    dtype=self._dtype,
+                    persistable=False)
+                out_scale.stop_gradient = True
+            out, _, = core.ops.fake_quantize_dequantize_abs_max(
+                input, quant_out, out_scale, *attrs)
+            return out
+
+        check_variable_and_dtype(input, 'input', ['float32'], "FakeQuantAbsMax")
+        attrs = {'bit_length': self._quant_bits}
+        inputs = {"X": [input]}
+        quant_out = self._helper.create_variable(
+            name="{}.quantized.dequantized".format(input.name),
+            dtype=input.dtype,
+            type=core.VarDesc.VarType.LOD_TENSOR,
+            persistable=False,
+            stop_gradient=False)
+        out_scale = self._scale
+        if not out_scale:
+            out_scale = self._helper.create_variable(
+                name=self._scale_name,
+                dtype=self._dtype,
+                type=core.VarDesc.VarType.LOD_TENSOR,
+                persistable=False,
+                stop_gradient=True)
+        outputs = {"Out": [quant_out], "OutScale": [out_scale]}
+
+        self._helper.append_op(
+            type="fake_quantize_dequantize_abs_max",
+            inputs=inputs,
+            outputs=outputs,
+            attrs=attrs)
+
+        return quant_out
+
+
+def _get_fake_quant_type(quant_type, name, moving_rate, quant_bits, dtype,
+                         quant_on_weight):
+    fake_quant_map = {
+        'abs_max':
+        lambda: FakeQuantAbsMax(name, quant_bits, dtype, quant_on_weight),
+        'moving_average_abs_max':
+        lambda: FakeQuantMovingAverage(name, moving_rate, quant_bits, dtype)
+    }
+    return fake_quant_map[quant_type]()
+
+
+class QuantizedConv2D(layers.Layer):
+    """
+    The computational logic of QuantizedConv2D is the same with Conv2D.
+    The only difference is that its inputs are all fake quantized.
+    """
+
+    def __init__(self,
+                 layer,
+                 weight_bits=8,
+                 activation_bits=8,
+                 moving_rate=0.9,
+                 weight_quantize_type='abs_max',
+                 activation_quantize_type='abs_max'):
+        super(QuantizedConv2D, self).__init__()
+        # For Conv2D
+        self._groups = getattr(layer, '_groups')
+        self._stride = getattr(layer, '_stride')
+        self._padding = getattr(layer, '_padding')
+        self._dilation = getattr(layer, '_dilation')
+        self._act = getattr(layer, '_act')
+        self._use_cudnn = getattr(layer, '_use_cudnn')
+        self._dtype = getattr(layer, '_dtype')
+        self._l_type = getattr(layer, '_l_type')
+        self.weight = getattr(layer, 'weight')
+        self.bias = getattr(layer, 'bias')
+        # For FakeQuant
+        self._fake_quant_weight = _get_fake_quant_type(
+            weight_quantize_type, self.weight.name, moving_rate, weight_bits,
+            self._dtype, True)
+        self._fake_quant_input = _get_fake_quant_type(
+            activation_quantize_type,
+            layer.full_name(), moving_rate, activation_bits, self._dtype, False)
+
+    def forward(self, input):
+        quant_input = self._fake_quant_input(input)
+        quant_weight = self._fake_quant_weight(self.weight)
+
+        if in_dygraph_mode() and self._l_type == 'conv2d':
+            attrs = ('strides', self._stride, 'paddings', self._padding,
+                     'dilations', self._dilation, 'groups', self._groups
+                     if self._groups else 1, 'use_cudnn', self._use_cudnn)
+            pre_bias = core.ops.conv2d(quant_input, quant_weight, *attrs)
+
+            pre_act = dygraph_utils._append_bias_in_dygraph(pre_bias, self.bias,
+                                                            1)
+            return dygraph_utils._append_activation_in_dygraph(pre_act,
+                                                               self._act)
+        check_variable_and_dtype(quant_input, 'input',
+                                 ['float16', 'float32', 'float64'],
+                                 'QuantizedConv2D')
+        attrs = {
+            'strides': self._stride,
+            'paddings': self._padding,
+            'dilations': self._dilation,
+            'groups': self._groups if self._groups else 1,
+            'use_cudnn': self._use_cudnn,
+            'use_mkldnn': False,
+        }
+        pre_bias = self._helper.create_variable_for_type_inference(
+            dtype=self._dtype)
+
+        self._helper.append_op(
+            type=self._l_type,
+            inputs={
+                'Input': quant_input,
+                'Filter': quant_weight,
+            },
+            outputs={"Output": pre_bias},
+            attrs=attrs)
+
+        if self.bias is not None:
+            pre_act = self._helper.create_variable_for_type_inference(
+                dtype=self._dtype)
+            self._helper.append_op(
+                type='elementwise_add',
+                inputs={'X': [pre_bias],
+                        'Y': [self.bias]},
+                outputs={'Out': [pre_act]},
+                attrs={'axis': 1})
+        else:
+            pre_act = pre_bias
+
+        return self._helper.append_activation(pre_act, act=self._act)
+
+
+class QuantizedLinear(layers.Layer):
+    """
+    The computational logic of QuantizedLinear is the same with Linear.
+    The only difference is that its inputs are all fake quantized.
+    """
+
+    def __init__(self,
+                 layer,
+                 weight_bits=8,
+                 activation_bits=8,
+                 moving_rate=0.9,
+                 weight_quantize_type='abs_max',
+                 activation_quantize_type='abs_max'):
+        super(QuantizedLinear, self).__init__()
+        # For Linear
+        self._act = getattr(layer, '_act')
+        self._dtype = getattr(layer, '_dtype')
+        self.weight = getattr(layer, 'weight')
+        self.bias = getattr(layer, 'bias')
+        # For FakeQuant
+        self._fake_quant_weight = _get_fake_quant_type(
+            weight_quantize_type, self.weight.name, moving_rate, weight_bits,
+            self._dtype, True)
+        self._fake_quant_input = _get_fake_quant_type(
+            activation_quantize_type,
+            layer.full_name(), moving_rate, activation_bits, self._dtype, False)
+
+    def forward(self, input):
+        quant_input = self._fake_quant_input(input)
+        quant_weight = self._fake_quant_weight(self.weight)
+        if in_dygraph_mode():
+            pre_bias = _varbase_creator(dtype=input.dtype)
+            core.ops.matmul(quant_input, quant_weight, pre_bias, 'transpose_X',
+                            False, 'transpose_Y', False, "alpha", 1)
+            pre_act = dygraph_utils._append_bias_in_dygraph(
+                pre_bias, self.bias, axis=len(input.shape) - 1)
+
+            return dygraph_utils._append_activation_in_dygraph(pre_act,
+                                                               self._act)
+
+        check_variable_and_dtype(input, 'input',
+                                 ['float16', 'float32', 'float64'],
+                                 "QuantizedLinear")
+        attrs = {
+            "transpose_X": False,
+            "transpose_Y": False,
+            "alpha": 1,
+        }
+        inputs = {"X": [quant_input], "Y": [quant_weight]}
+        mul_out = self._helper.create_variable_for_type_inference(self._dtype)
+
+        self._helper.append_op(
+            type="matmul",
+            inputs=inputs,
+            outputs={"Out": [mul_out]},
+            attrs=attrs)
+        if self.bias is not None:
+            pre_activation = self._helper.create_variable_for_type_inference(
+                dtype=self._dtype)
+            self._helper.append_op(
+                type='elementwise_add',
+                inputs={'X': [mul_out],
+                        'Y': [self.bias]},
+                outputs={'Out': [pre_activation]},
+                attrs={'axis': len(input.shape) - 1})
+        else:
+            pre_activation = mul_out
+        return self._helper.append_activation(pre_activation, act=self._act)
diff --git a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py
index 5f9b94ea82fcacc950f806803e0d04ec41c18ca2..3097e1d82a9cb5e096efa3913ea6a06bff557c94 100644
--- a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py
+++ b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py
@@ -28,6 +28,7 @@ from .quantization_pass import AddQuantDequantPass
 from .quantization_pass import _out_scale_op_list
 from .quantization_pass import _get_op_input_var_names
 from .quantization_pass import _get_op_output_var_names
+from .quantization_pass import _get_output_name_index
 
 __all__ = ['PostTrainingQuantization', 'WeightQuantization']
 
@@ -341,7 +342,8 @@ class PostTrainingQuantization(object):
             self._executor.run(program=self._program,
                                feed=data,
                                fetch_list=self._fetch_list,
-                               return_numpy=False)
+                               return_numpy=False,
+                               scope=self._scope)
             if self._algo == "KL":
                 self._sample_data(batch_id)
             else:
@@ -405,6 +407,10 @@ class PostTrainingQuantization(object):
                                     model_filename=self._model_filename,
                                     params_filename=self._params_filename)
 
+        if self._program.num_blocks > 1:
+            _logger.error("The post training quantization requires that the "
+                          "program only has one block.")
+
         if self._optimize_model:
             self._optimize_fp32_model()
 
@@ -450,6 +456,9 @@ class PostTrainingQuantization(object):
         persistable_var_names = _all_persistable_var_names(self._program)
         for op in self._program.global_block().ops:
             op_type = op.type
+            if self._is_full_quantize and \
+                op_type not in self._quantizable_op_type:
+                _logger.warning(op_type + " is not supported for quantization.")
             # For quantized ops, sample inputs and outputs
             if op_type in self._quantizable_op_type:
                 collect_var_name(
@@ -554,8 +563,9 @@ class PostTrainingQuantization(object):
             for var_name in self._quantized_act_var_name:
                 var_tensor = _load_variable_data(self._scope, var_name)
                 var_tensor = var_tensor.ravel()
-                save_path = os.path.join(self._cache_dir,
-                                         var_name + "_" + str(iter) + ".npy")
+                save_path = os.path.join(
+                    self._cache_dir,
+                    var_name.replace("/", ".") + "_" + str(iter) + ".npy")
                 np.save(save_path, var_tensor)
         else:
             for var_name in self._quantized_act_var_name:
@@ -590,7 +600,7 @@ class PostTrainingQuantization(object):
             for var_name in self._quantized_act_var_name:
                 sampling_data = []
                 filenames = [f for f in os.listdir(self._cache_dir) \
-                    if re.match(var_name + '_[0-9]+.npy', f)]
+                    if re.match(var_name.replace("/", ".")  + '_[0-9]+.npy', f)]
                 for filename in filenames:
                     file_path = os.path.join(self._cache_dir, filename)
                     sampling_data.append(np.load(file_path))
@@ -685,13 +695,25 @@ class PostTrainingQuantization(object):
                 op._set_attr("quantization_type", quantized_type)
 
         def analysis_and_save_info(op_node, out_var_name):
+            argname_index = _get_output_name_index(op_node, out_var_name)
+            assert argname_index is not None, \
+                out_var_name + " is not the output of the op"
             if self._algo == "KL":
+                # For compatibility, we save output threshold by two methods.
                 save_info(op_node, out_var_name,
                           self._quantized_var_kl_threshold, "out_threshold",
                           "post_kl")
+                save_info(
+                    op_node, out_var_name, self._quantized_var_kl_threshold,
+                    argname_index[0] + str(argname_index[1]) + "_threshold",
+                    "post_kl")
             elif self._algo == "abs_max":
                 save_info(op_node, out_var_name, self._quantized_var_abs_max,
                           "out_threshold", "post_abs_max")
+                save_info(
+                    op_node, out_var_name, self._quantized_var_abs_max,
+                    argname_index[0] + str(argname_index[1]) + "_threshold",
+                    "post_kl")
             elif self._algo == "min_max":
                 save_info(op_node, out_var_name, self._quantized_var_min,
                           "out_min", "post_min_max")
diff --git a/python/paddle/fluid/contrib/slim/quantization/qat2_int8_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py
similarity index 89%
rename from python/paddle/fluid/contrib/slim/quantization/qat2_int8_mkldnn_pass.py
rename to python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py
index 3e9a2c9858c424f1dfddf0e6fff604df74d65ce4..75e1ea43d15e432d2f6cbec271acd67624de1e01 100644
--- a/python/paddle/fluid/contrib/slim/quantization/qat2_int8_mkldnn_pass.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py
@@ -16,17 +16,17 @@ import numpy as np
 from .... import core
 from ....framework import IrGraph
 
-__all__ = ['Qat2Int8MkldnnPass']
+__all__ = ['Quant2Int8MkldnnPass']
 
 OpRole = core.op_proto_and_checker_maker.OpRole
 
 
-class Qat2Int8MkldnnPass(object):
+class Quant2Int8MkldnnPass(object):
     """
-    Transform a QAT model IrGraph into MKL-DNN supported INT8 IrGraph.
+    Transform a quant model IrGraph into MKL-DNN supported INT8 IrGraph.
     The pass consists of the following transformations:
         1. gather scale values from fake quantize/dequantize operators,
-        2. extract FP32 inference model graph from the QAT graph, i.e.
+        2. extract FP32 inference model graph from the quant graph, i.e.
             a.  remove fake quantize/dequantize operators,
             b.  dequantize conv2d and mul's weights,
         3. optimize the FP32 graph using standard FP32 optimization fuses
@@ -55,7 +55,7 @@ class Qat2Int8MkldnnPass(object):
             'fake_dequantize_max_abs', 'fake_channel_wise_dequantize_max_abs'
         ]
         self._ops_to_quantize = _ops_to_quantize
-        self._op_ids_to_skip = _op_ids_to_skip if _op_ids_to_skip != None else set(
+        self._op_ids_to_skip = _op_ids_to_skip if _op_ids_to_skip is not None else set(
             [-1])
         self._scale_immutable_ops = [
             'transpose2', 'reshape2', 'pool2d', 'scale'
@@ -67,15 +67,18 @@ class Qat2Int8MkldnnPass(object):
         self._relu_ops = ['relu', 'relu6']
         self._matmul_ops = ['matmul']
         self._weight_scales = {}
-        # Collect the Input and Output sclaes from Fake QAT models
+        # Collect the Input and Output sclaes from Fake quant models
         self._var_quant_scales = {}
         self._max_range = {}
         self._s8_max = 127
+        self._pass_idx = 0
+        self._pass_group = 'int8'
 
     def apply(self, graph):
         assert isinstance(graph,
                           IrGraph), 'graph must be the instance of IrGraph.'
 
+        self._reset_pass_idx_and_group('int8')
         graph = self._gather_weight_scales_from_fake(graph)
         graph = self._gather_output_scales_from_attr(graph)
         graph = self._gather_input_scales_from_fake(graph)
@@ -86,21 +89,24 @@ class Qat2Int8MkldnnPass(object):
         graph = self._update_relu_output_scales(graph)
         graph = self._propagate_scales(graph)
         graph = self._quantize_fp32_graph(graph)
-        graph = self._optimize_int8_graph(graph)
+        graph = self._final_optimizations(graph)
         graph = self._cleanup(graph)
         return graph
 
-    def apply_fp32(self, graph):
+    def prepare_and_optimize_fp32(self, graph):
         assert isinstance(graph,
                           IrGraph), 'graph must be the instance of IrGraph.'
 
-        graph = self._gather_weight_scales_from_fake(graph)
-        graph = self._remove_fake_ops(graph)
-        graph = self._dequantize_weights(graph)
+        self._reset_pass_idx_and_group('fp32')
         graph = self._optimize_fp32_graph(graph)
+        graph = self._final_optimizations(graph)
         graph = self._cleanup(graph)
         return graph
 
+    def _reset_pass_idx_and_group(self, group):
+        self._pass_idx = 0
+        self._pass_group = group
+
     def _convert_scale2tensor(self, scale):
         tensor = core.LoDTensor()
         tensor.set(scale, core.CPUPlace())
@@ -333,20 +339,38 @@ class Qat2Int8MkldnnPass(object):
     def _optimize_fp32_graph(self, graph):
         graph = self._update_activations(graph)
         graph = self._remove_ctrl_vars(graph)
+        graph = self._apply_pass(graph, 'attention_lstm_fuse_pass')
+        graph = self._apply_pass(graph, 'seqconv_eltadd_relu_fuse_pass')
+        #  graph = self._apply_pass(graph, 'seqpool_concat_fuse_pass')
+        graph = self._apply_pass(graph, 'seqpool_cvm_concat_fuse_pass')
+        #  graph = self._apply_pass(graph, 'embedding_fc_lstm_fuse_pass')
+        graph = self._apply_pass(graph, 'fc_lstm_fuse_pass')
+        graph = self._apply_pass(graph, 'mul_lstm_fuse_pass')
+        graph = self._apply_pass(graph, 'fc_gru_fuse_pass')
+        graph = self._apply_pass(graph, 'mul_gru_fuse_pass')
+        graph = self._apply_pass(graph, 'seq_concat_fc_fuse_pass')
+        graph = self._apply_pass(graph, 'squared_mat_sub_fuse_pass')
+        graph = self._apply_pass(graph, 'is_test_pass')
         graph = self._apply_pass(graph, 'mkldnn_placement_pass',
                                  ['mkldnn_enabled_op_types'], [set()])
         graph = self._apply_pass(graph, 'depthwise_conv_mkldnn_pass')
         graph = self._apply_pass(graph, 'conv_bn_fuse_pass')
         graph = self._apply_pass(graph, 'conv_eltwiseadd_bn_fuse_pass')
+        graph = self._apply_pass(graph, 'conv_transpose_bn_fuse_pass')
+        graph = self._apply_pass(graph,
+                                 'conv_transpose_eltwiseadd_bn_fuse_pass')
         graph = self._apply_pass(graph, 'conv_bias_mkldnn_fuse_pass')
         graph = self._apply_pass(graph, 'conv_elementwise_add_mkldnn_fuse_pass')
         graph = self._apply_pass(graph, 'conv_relu_mkldnn_fuse_pass')
         graph = self._apply_pass(graph, 'conv_relu6_mkldnn_fuse_pass')
         graph = self._apply_pass(graph, 'fc_fuse_pass',
                                  ['use_gpu', 'use_fc_padding'], [False, False])
+        graph = self._apply_pass(graph, 'repeated_fc_relu_fuse_pass')
         if self._is_fc_quantized(graph):
             graph = self._apply_pass(graph, 'fc_mkldnn_pass')
         graph = self._apply_pass(graph, 'matmul_transpose_reshape_fuse_pass')
+        # the following pass should be the last one since it will work on all fused ops.
+        graph = self._apply_pass(graph, 'runtime_context_cache_pass')
         return graph
 
     def _apply_pass(self, graph, pass_name, attrs=None, attr_values=None):
@@ -362,12 +386,13 @@ class Qat2Int8MkldnnPass(object):
                 ir_pass.set(attr, value)
         ir_pass.apply(cpp_graph)
         if self._debug:
-            graph.draw('.', 'qat_fp32_{}'.format(pass_name),
-                       graph.all_op_nodes())
+            graph.draw('.', '{}_{}_{}'.format(self._pass_group, self._pass_idx,
+                                              pass_name), graph.all_op_nodes())
         self._remove_unused_var_nodes(graph)
+        self._pass_idx += 1
         return graph
 
-    def _optimize_int8_graph(self, graph):
+    def _final_optimizations(self, graph):
         # remove dropout ops
         graph = self._apply_pass(graph, 'simplify_with_basic_ops_pass')
         # make some MKL-DNN ops working inplace
@@ -448,8 +473,7 @@ class Qat2Int8MkldnnPass(object):
                         self._var_quant_scales[out_name] = (True, tensor)
             return graph
 
-        conv_predicate = lambda op: op.attr("fuse_activation") in self._relu_ops and \
-            op.attr("fuse_residual_connection") == False
+        conv_predicate = lambda op: op.attr("fuse_activation") in self._relu_ops
         graph = _set_unsigned_scale(graph, self._conv_ops, "Output",
                                     conv_predicate)
 
@@ -465,15 +489,10 @@ class Qat2Int8MkldnnPass(object):
         return 'NHWC' if self._is_conv_quantized(graph) else 'NCHW'
 
     def _quantize_fp32_graph(self, graph):
-        ir_pass = self._core.get_pass('cpu_quantize_placement_pass')
-        cpp_graph = graph.graph
-        ir_pass.set('quantize_enabled_op_types', self._ops_to_quantize)
-        ir_pass.set('quantize_excluded_op_ids',
-                    self._find_avg_pooling_ids(graph))
-        ir_pass.apply(cpp_graph)
-        if self._debug:
-            graph.draw('.', 'qat_int8_{}'.format(ir_pass.type()),
-                       graph.all_op_nodes())
+        graph = self._apply_pass(
+            graph, 'cpu_quantize_placement_pass',
+            ['quantize_enabled_op_types', 'quantize_excluded_op_ids'],
+            [self._ops_to_quantize, self._find_avg_pooling_ids(graph)])
         graph = self._apply_pass(graph, 'scale_matmul_fuse_pass')
         graph = self._apply_pass(graph,
                                  'reshape_transpose_matmul_mkldnn_fuse_pass')
diff --git a/python/paddle/fluid/contrib/slim/quantization/qat_int8_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/quantization/quant_int8_mkldnn_pass.py
similarity index 97%
rename from python/paddle/fluid/contrib/slim/quantization/qat_int8_mkldnn_pass.py
rename to python/paddle/fluid/contrib/slim/quantization/quant_int8_mkldnn_pass.py
index 51f8497e21cd78fb6cda7f626553168844dd1215..a25abd9ff09fbab1534f6f4327983af5db52f023 100644
--- a/python/paddle/fluid/contrib/slim/quantization/qat_int8_mkldnn_pass.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quant_int8_mkldnn_pass.py
@@ -17,10 +17,10 @@ from .... import core
 from ....framework import IrGraph
 from ....framework import IrNode
 
-__all__ = ['QatInt8MkldnnPass']
+__all__ = ['QuantInt8MkldnnPass']
 
 
-class QatInt8MkldnnPass(object):
+class QuantInt8MkldnnPass(object):
     """
     Convert QuantizationFreezePass generated IrGraph to MKL-DNN supported INT8
     IrGraph. Following transformations did in this pass:
@@ -48,13 +48,13 @@ class QatInt8MkldnnPass(object):
             # The original graph will be rewrite.
             import paddle.fluid as fluid
             from paddle.fluid.contrib.slim.quantization \
-                import QatInt8MkldnnPass
+                import QuantInt8MkldnnPass
             from paddle.fluid.framework import IrGraph
             from paddle.fluid import core
 
             graph = IrGraph(core.Graph(fluid.Program().desc), for_test=False)
             place = fluid.CPUPlace()
-            mkldnn_pass = QatInt8MkldnnPass(fluid.global_scope(),
+            mkldnn_pass = QuantInt8MkldnnPass(fluid.global_scope(),
             place)
             mkldnn_pass.apply(graph)
         """
@@ -163,7 +163,7 @@ class QatInt8MkldnnPass(object):
                     'Filter': weight_var_node},
             outputs={'Output': output_var_node})
 
-        # Based on the QAT's scales to calculate the scales of MKL-DNN INT8 conv2d
+        # Based on the Quant's scales to calculate the scales of MKL-DNN INT8 conv2d
         scale_in = self._s8_max / self._in_scale[output_name]
         scale_w = []
         scale_w = [self._max_range[output_name] / self._s8_max]
@@ -207,7 +207,7 @@ class QatInt8MkldnnPass(object):
                     'Y': weight_var_node},
             outputs={'Out': output_var_node})
 
-        # Based on the QAT's scales to calculate MKL-DNN INT8 mul's scales
+        # Based on the Quant's scales to calculate MKL-DNN INT8 mul's scales
         scale_in = self._s8_max / self._in_scale[output_name]
         scale_w = []
         scale_w = [self._max_range[output_name] / self._s8_max]
diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
index 61d19e64ba4c2c4ee258eae0973c4e798a3cbe6b..c9614a1fb7770a7273e5f675380b635a1f8fd16c 100644
--- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py
@@ -127,6 +127,22 @@ def _get_op_output_var_names(op):
     return var_names
 
 
+def _get_output_name_index(op, output_var_name):
+    """Get the output name and index of the var_name in the op"""
+    assert isinstance(op, (IrNode, Operator)), \
+        "The input op should be IrNode or Operator."
+    op_name = op.name() if isinstance(op, IrNode) \
+        else op.type
+    name_list = _op_real_in_out_name[op_name][1]
+    res = None
+    for name in name_list:
+        var_name = op.output(name)
+        for index, val in enumerate(var_name):
+            if val == output_var_name:
+                res = (name, index)
+    return res
+
+
 def _init_var_node(var_node, value, scope, place):
     assert isinstance(value,
                       np.ndarray), 'The type of value should be numpy array.'
@@ -1528,13 +1544,19 @@ class OutScaleForInferencePass(object):
         op_nodes = graph.all_op_nodes()
         for op_node in op_nodes:
             if op_node.name() in self._teller_set:
-                output_var_name = _get_op_output_var_names(op_node)
-                assert len(output_var_name) == 1, "Only support collecting " \
-                    "output for op that only has an activation output for now."
-                scale_name = self._scale_name(output_var_name[0])
-                scale_v = np.array(
-                    self._scope.find_var(scale_name).get_tensor())[0]
-                op_node.op()._set_attr("out_threshold", float(scale_v))
+                var_names = _get_op_output_var_names(op_node)
+                for var_name in var_names:
+                    # For compatibility, we save output threshold by two methods.
+                    scale_name = self._scale_name(var_name)
+                    scale_v = np.array(
+                        self._scope.find_var(scale_name).get_tensor())[0]
+                    op_node.op()._set_attr("out_threshold", float(scale_v))
+
+                    argname_index = _get_output_name_index(op_node, var_name)
+                    assert argname_index is not None, \
+                        var_name + " is not the output of the op"
+                    op_node.op()._set_attr(argname_index[0] + str(argname_index[1]) \
+                        + "_threshold", float(scale_v))
         graph.resolve_hazard()
         return graph
 
diff --git a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
index 64133da70f4050c08d771b39ad9eefdd1fe1a4fb..ac4235d2e17936bd5b93fc85820b8f93361332c0 100644
--- a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
+++ b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
@@ -25,30 +25,30 @@ function(inference_analysis_python_api_int8_test_mkldnn target model_dir data_pa
     _inference_analysis_python_api_int8_test(${target} ${model_dir} ${data_path} ${filename} True)
 endfunction()
 
-function(download_qat_data install_dir data_file)
+function(download_quant_data install_dir data_file)
     if (NOT EXISTS ${install_dir}/${data_file})
 	    inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8 ${data_file})
     endif()
 endfunction()
 
-function(download_qat_model install_dir data_file)
+function(download_quant_model install_dir data_file)
     if (NOT EXISTS ${install_dir}/${data_file})
 	    inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8/QAT_models ${data_file})
     endif()
 endfunction()
 
-function(download_qat_fp32_model install_dir data_file)
+function(download_quant_fp32_model install_dir data_file)
     if (NOT EXISTS ${install_dir}/${data_file})
 	    inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8/QAT_models/fp32 ${data_file})
     endif()
 endfunction()
 
-function(inference_qat_int8_image_classification_test target qat_model_dir dataset_path)
-    py_test(${target} SRCS "${CMAKE_CURRENT_SOURCE_DIR}/qat_int8_image_classification_comparison.py"
+function(inference_quant_int8_image_classification_test target quant_model_dir dataset_path)
+    py_test(${target} SRCS "${CMAKE_CURRENT_SOURCE_DIR}/quant_int8_image_classification_comparison.py"
             ENVS FLAGS_OMP_NUM_THREADS=${CPU_NUM_THREADS_ON_CI}
                  OMP_NUM_THREADS=${CPU_NUM_THREADS_ON_CI}
                  FLAGS_use_mkldnn=true
-            ARGS --qat_model ${qat_model_dir}
+            ARGS --quant_model ${quant_model_dir}
                  --infer_data ${dataset_path}
                  --batch_size 25
                  --batch_num 2
@@ -57,60 +57,61 @@ endfunction()
 
 
 # set batch_size 10 for UT only (avoid OOM). For whole dataset, use batch_size 25 
-function(inference_qat2_int8_image_classification_test target qat_model_dir fp32_model_dir dataset_path ops_to_quantize)
-    py_test(${target} SRCS "${CMAKE_CURRENT_SOURCE_DIR}/qat2_int8_image_classification_comparison.py"
+function(inference_quant2_int8_image_classification_test target quant_model_dir fp32_model_dir dataset_path)
+    py_test(${target} SRCS "${CMAKE_CURRENT_SOURCE_DIR}/quant2_int8_image_classification_comparison.py"
             ENVS FLAGS_OMP_NUM_THREADS=${CPU_NUM_THREADS_ON_CI}
                  OMP_NUM_THREADS=${CPU_NUM_THREADS_ON_CI}
                  FLAGS_use_mkldnn=true
-            ARGS --qat_model ${qat_model_dir}
+            ARGS --quant_model ${quant_model_dir}
                  --fp32_model ${fp32_model_dir}
                  --infer_data ${dataset_path}
                  --batch_size 10
                  --batch_num 2
-                 --acc_diff_threshold 0.1
-                 --ops_to_quantize ${ops_to_quantize})
+                 --acc_diff_threshold 0.1)
 endfunction()
 
 # set batch_size 10 for UT only (avoid OOM). For whole dataset, use batch_size 20 
-function(inference_qat2_int8_nlp_test target qat_model_dir fp32_model_dir dataset_path labels_path)
-    py_test(${target} SRCS "${CMAKE_CURRENT_SOURCE_DIR}/qat2_int8_nlp_comparison.py"
+function(inference_quant2_int8_nlp_test target quant_model_dir fp32_model_dir dataset_path labels_path ops_to_quantize)
+    py_test(${target} SRCS "${CMAKE_CURRENT_SOURCE_DIR}/quant2_int8_nlp_comparison.py"
             ENVS FLAGS_OMP_NUM_THREADS=${CPU_NUM_THREADS_ON_CI}
                  OMP_NUM_THREADS=${CPU_NUM_THREADS_ON_CI}
                  FLAGS_use_mkldnn=true
-            ARGS --qat_model ${qat_model_dir}
+            ARGS --quant_model ${quant_model_dir}
 		 --fp32_model ${fp32_model_dir}
                  --infer_data ${dataset_path}
 		 --labels ${labels_path}
                  --batch_size 10
                  --batch_num 2
-                 --acc_diff_threshold 0.1)
+                 --acc_diff_threshold 0.1
+		 --ops_to_quantize ${ops_to_quantize})
 endfunction()
 
-function(download_qat_data install_dir data_file)
+function(download_quant_data install_dir data_file)
     if (NOT EXISTS ${install_dir}/${data_file})
            inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8 ${data_file})
     endif()
 endfunction()
 
-function(download_qat_model install_dir data_file)
+function(download_quant_model install_dir data_file)
     if (NOT EXISTS ${install_dir}/${data_file})
            inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8/QAT_models ${data_file})
     endif()
 endfunction()
 
-function(save_qat_ic_model_test target qat_model_dir fp32_model_save_path int8_model_save_path ops_to_quantize)
-    py_test(${target} SRCS ${CMAKE_CURRENT_SOURCE_DIR}/save_qat_model.py
-            ARGS --qat_model_path ${qat_model_dir}
+function(save_quant_ic_model_test target quant_model_dir fp32_model_save_path int8_model_save_path)
+    py_test(${target} SRCS ${CMAKE_CURRENT_SOURCE_DIR}/save_quant_model.py
+            ARGS --quant_model_path ${quant_model_dir}
 	         --fp32_model_save_path ${fp32_model_save_path}
 	         --int8_model_save_path ${int8_model_save_path}
-		 --ops_to_quantize ${ops_to_quantize})
+		 --debug)
 endfunction()
 
-function(save_qat_nlp_model_test target qat_model_dir fp32_model_save_path int8_model_save_path)
-    py_test(${target} SRCS ${CMAKE_CURRENT_SOURCE_DIR}/save_qat_model.py
-            ARGS --qat_model_path ${qat_model_dir}
+function(save_quant_nlp_model_test target quant_model_dir fp32_model_save_path int8_model_save_path ops_to_quantize)
+    py_test(${target} SRCS ${CMAKE_CURRENT_SOURCE_DIR}/save_quant_model.py
+            ARGS --quant_model_path ${quant_model_dir}
 	         --fp32_model_save_path ${fp32_model_save_path}
-	         --int8_model_save_path ${int8_model_save_path})
+	         --int8_model_save_path ${int8_model_save_path}
+		 --ops_to_quantize ${ops_to_quantize})
 endfunction()
 
 function(convert_model2dot_test target model_path save_graph_dir save_graph_name)
@@ -173,130 +174,131 @@ if(LINUX AND WITH_MKLDNN)
 		inference_analysis_python_api_int8_test(test_slim_int8_vgg19 ${INT8_VGG19_MODEL_DIR} ${IMAGENET_DATA_PATH} ${INT8_IC_TEST_FILE_PATH})
 	endif()
 
-	#### QAT FP32 & INT8 comparison python api tests
-
-	set(QAT_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/qat")
-
-	### QATv1 for image classification
-
-	# QAT ResNet50
-	set(QAT_RESNET50_MODEL_DIR "${QAT_INSTALL_DIR}/ResNet50_QAT")
-	set(QAT_RESNET50_MODEL_ARCHIVE "ResNet50_qat_model.tar.gz")
-	download_qat_model(${QAT_RESNET50_MODEL_DIR} ${QAT_RESNET50_MODEL_ARCHIVE})
-	inference_qat_int8_image_classification_test(test_qat_int8_resnet50_mkldnn ${QAT_RESNET50_MODEL_DIR}/model ${IMAGENET_DATA_PATH})
-
-	# QAT ResNet101
-	set(QAT_RESNET101_MODEL_DIR "${QAT_INSTALL_DIR}/ResNet101_QAT")
-	set(QAT_RESNET101_MODEL_ARCHIVE "ResNet101_qat_model.tar.gz")
-	download_qat_model(${QAT_RESNET101_MODEL_DIR} ${QAT_RESNET101_MODEL_ARCHIVE})
-	# inference_qat_int8_image_classification_test(test_qat_int8_resnet101_mkldnn ${QAT_RESNET101_MODEL_DIR}/model ${IMAGENET_DATA_PATH})
-
-	# QAT GoogleNet
-	set(QAT_GOOGLENET_MODEL_DIR "${QAT_INSTALL_DIR}/GoogleNet_QAT")
-	set(QAT_GOOGLENET_MODEL_ARCHIVE "GoogleNet_qat_model.tar.gz")
-	download_qat_model(${QAT_GOOGLENET_MODEL_DIR} ${QAT_GOOGLENET_MODEL_ARCHIVE})
-	inference_qat_int8_image_classification_test(test_qat_int8_googlenet_mkldnn ${QAT_GOOGLENET_MODEL_DIR}/model ${IMAGENET_DATA_PATH})
-
-	# QAT MobileNetV1
-	set(QAT_MOBILENETV1_MODEL_DIR "${QAT_INSTALL_DIR}/MobileNetV1_QAT")
-	set(QAT_MOBILENETV1_MODEL_ARCHIVE "MobileNetV1_qat_model.tar.gz")
-	download_qat_model(${QAT_MOBILENETV1_MODEL_DIR} ${QAT_MOBILENETV1_MODEL_ARCHIVE})
-	inference_qat_int8_image_classification_test(test_qat_int8_mobilenetv1_mkldnn ${QAT_MOBILENETV1_MODEL_DIR}/model ${IMAGENET_DATA_PATH})
-
-	# QAT MobileNetV2
-	set(QAT_MOBILENETV2_MODEL_DIR "${QAT_INSTALL_DIR}/MobileNetV2_QAT")
-	set(QAT_MOBILENETV2_MODEL_ARCHIVE "MobileNetV2_qat_model.tar.gz")
-	download_qat_model(${QAT_MOBILENETV2_MODEL_DIR} ${QAT_MOBILENETV2_MODEL_ARCHIVE})
-	inference_qat_int8_image_classification_test(test_qat_int8_mobilenetv2_mkldnn ${QAT_MOBILENETV2_MODEL_DIR}/model ${IMAGENET_DATA_PATH})
-
-	# QAT VGG16
-	set(QAT_VGG16_MODEL_DIR "${QAT_INSTALL_DIR}/VGG16_QAT")
-	set(QAT_VGG16_MODEL_ARCHIVE "VGG16_qat_model.tar.gz")
-	download_qat_model(${QAT_VGG16_MODEL_DIR} ${QAT_VGG16_MODEL_ARCHIVE})
-	# inference_qat_int8_image_classification_test(test_qat_int8_vgg16_mkldnn ${QAT_VGG16_MODEL_DIR}/model ${IMAGENET_DATA_PATH})
-
-	# QAT VGG19
-	set(QAT_VGG19_MODEL_DIR "${QAT_INSTALL_DIR}/VGG19_QAT")
-	set(QAT_VGG19_MODEL_ARCHIVE "VGG19_qat_model.tar.gz")
-	download_qat_model(${QAT_VGG19_MODEL_DIR} ${QAT_VGG19_MODEL_ARCHIVE})
-	# inference_qat_int8_image_classification_test(test_qat_int8_vgg19_mkldnn ${QAT_VGG19_MODEL_DIR}/model ${IMAGENET_DATA_PATH})
-
-	### QATv2 for image classification
-
-	set(QAT2_IC_OPS_TO_QUANTIZE "conv2d,pool2d")
-
-	# QAT2 ResNet50 with input/output scales in `fake_quantize_moving_average_abs_max` operators,
+	#### QUANT & INT8 comparison python api tests
+
+	set(QUANT_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/quant")
+
+	### Quant1 for image classification
+
+	# Quant ResNet50
+	set(QUANT_RESNET50_MODEL_DIR "${QUANT_INSTALL_DIR}/ResNet50_quant")
+	set(QUANT_RESNET50_MODEL_ARCHIVE "ResNet50_qat_model.tar.gz")
+	download_quant_model(${QUANT_RESNET50_MODEL_DIR} ${QUANT_RESNET50_MODEL_ARCHIVE})
+	inference_quant_int8_image_classification_test(test_quant_int8_resnet50_mkldnn ${QUANT_RESNET50_MODEL_DIR}/model ${IMAGENET_DATA_PATH})
+
+	# Quant ResNet101
+	set(QUANT_RESNET101_MODEL_DIR "${QUANT_INSTALL_DIR}/ResNet101_quant")
+	set(QUANT_RESNET101_MODEL_ARCHIVE "ResNet101_qat_model.tar.gz")
+	download_quant_model(${QUANT_RESNET101_MODEL_DIR} ${QUANT_RESNET101_MODEL_ARCHIVE})
+	# inference_quant_int8_image_classification_test(test_quant_int8_resnet101_mkldnn ${QUANT_RESNET101_MODEL_DIR}/model ${IMAGENET_DATA_PATH})
+
+	# Quant GoogleNet
+	set(QUANT_GOOGLENET_MODEL_DIR "${QUANT_INSTALL_DIR}/GoogleNet_quant")
+	set(QUANT_GOOGLENET_MODEL_ARCHIVE "GoogleNet_qat_model.tar.gz")
+	download_quant_model(${QUANT_GOOGLENET_MODEL_DIR} ${QUANT_GOOGLENET_MODEL_ARCHIVE})
+	inference_quant_int8_image_classification_test(test_quant_int8_googlenet_mkldnn ${QUANT_GOOGLENET_MODEL_DIR}/model ${IMAGENET_DATA_PATH})
+
+	# Quant MobileNetV1
+	set(QUANT_MOBILENETV1_MODEL_DIR "${QUANT_INSTALL_DIR}/MobileNetV1_quant")
+	set(QUANT_MOBILENETV1_MODEL_ARCHIVE "MobileNetV1_qat_model.tar.gz")
+	download_quant_model(${QUANT_MOBILENETV1_MODEL_DIR} ${QUANT_MOBILENETV1_MODEL_ARCHIVE})
+	inference_quant_int8_image_classification_test(test_quant_int8_mobilenetv1_mkldnn ${QUANT_MOBILENETV1_MODEL_DIR}/model ${IMAGENET_DATA_PATH})
+
+	# Quant MobileNetV2
+	set(QUANT_MOBILENETV2_MODEL_DIR "${QUANT_INSTALL_DIR}/MobileNetV2_quant")
+	set(QUANT_MOBILENETV2_MODEL_ARCHIVE "MobileNetV2_qat_model.tar.gz")
+	download_quant_model(${QUANT_MOBILENETV2_MODEL_DIR} ${QUANT_MOBILENETV2_MODEL_ARCHIVE})
+	inference_quant_int8_image_classification_test(test_quant_int8_mobilenetv2_mkldnn ${QUANT_MOBILENETV2_MODEL_DIR}/model ${IMAGENET_DATA_PATH})
+
+	# Quant VGG16
+	set(QUANT_VGG16_MODEL_DIR "${QUANT_INSTALL_DIR}/VGG16_quant")
+	set(QUANT_VGG16_MODEL_ARCHIVE "VGG16_qat_model.tar.gz")
+	download_quant_model(${QUANT_VGG16_MODEL_DIR} ${QUANT_VGG16_MODEL_ARCHIVE})
+	# inference_quant_int8_image_classification_test(test_quant_int8_vgg16_mkldnn ${QUANT_VGG16_MODEL_DIR}/model ${IMAGENET_DATA_PATH})
+
+	# Quant VGG19
+	set(QUANT_VGG19_MODEL_DIR "${QUANT_INSTALL_DIR}/VGG19_quant")
+	set(QUANT_VGG19_MODEL_ARCHIVE "VGG19_qat_model.tar.gz")
+	download_quant_model(${QUANT_VGG19_MODEL_DIR} ${QUANT_VGG19_MODEL_ARCHIVE})
+	# inference_quant_int8_image_classification_test(test_quant_int8_vgg19_mkldnn ${QUANT_VGG19_MODEL_DIR}/model ${IMAGENET_DATA_PATH})
+
+	### Quant2 for image classification
+
+	# Quant2 ResNet50 with input/output scales in `fake_quantize_moving_average_abs_max` operators,
 	# with weight scales in `fake_dequantize_max_abs` operators
-        set(QAT2_RESNET50_MODEL_DIR "${QAT_INSTALL_DIR}/ResNet50_qat_perf")
+        set(QUANT2_RESNET50_MODEL_DIR "${QUANT_INSTALL_DIR}/ResNet50_quant2")
+	set(QUANT2_RESNET50_MODEL_ARCHIVE "ResNet50_qat_perf.tar.gz")
+	download_quant_model(${QUANT2_RESNET50_MODEL_DIR} ${QUANT2_RESNET50_MODEL_ARCHIVE})
 	set(FP32_RESNET50_MODEL_DIR "${INT8_INSTALL_DIR}/resnet50")
-	set(QAT2_RESNET50_MODEL_ARCHIVE "ResNet50_qat_perf.tar.gz")
-	download_qat_model(${QAT2_RESNET50_MODEL_DIR} ${QAT2_RESNET50_MODEL_ARCHIVE})
-	inference_qat2_int8_image_classification_test(test_qat2_int8_resnet50_mkldnn ${QAT2_RESNET50_MODEL_DIR}/ResNet50_qat_perf/float ${FP32_RESNET50_MODEL_DIR}/model ${IMAGENET_DATA_PATH} ${QAT2_IC_OPS_TO_QUANTIZE})
+	inference_quant2_int8_image_classification_test(test_quant2_int8_resnet50_mkldnn ${QUANT2_RESNET50_MODEL_DIR}/ResNet50_qat_perf/float ${FP32_RESNET50_MODEL_DIR}/model ${IMAGENET_DATA_PATH})
 
-	# QAT2 ResNet50 with input/output scales in `fake_quantize_range_abs_max` operators and the `out_threshold` attributes,
+	# Quant2 ResNet50 with input/output scales in `fake_quantize_range_abs_max` operators and the `out_threshold` attributes,
 	# with weight scales in `fake_dequantize_max_abs` operators
-	set(QAT2_RESNET50_RANGE_MODEL_DIR "${QAT_INSTALL_DIR}/ResNet50_qat_range")
-	set(QAT2_RESNET50_RANGE_MODEL_ARCHIVE "ResNet50_qat_range.tar.gz")
-	download_qat_model(${QAT2_RESNET50_RANGE_MODEL_DIR} ${QAT2_RESNET50_RANGE_MODEL_ARCHIVE})
-	inference_qat2_int8_image_classification_test(test_qat2_int8_resnet50_range_mkldnn ${QAT2_RESNET50_RANGE_MODEL_DIR}/ResNet50_qat_range ${FP32_RESNET50_MODEL_DIR}/model ${IMAGENET_DATA_PATH} ${QAT2_IC_OPS_TO_QUANTIZE})
+	set(QUANT2_RESNET50_RANGE_MODEL_DIR "${QUANT_INSTALL_DIR}/ResNet50_quant2_range")
+	set(QUANT2_RESNET50_RANGE_MODEL_ARCHIVE "ResNet50_qat_range.tar.gz")
+	download_quant_model(${QUANT2_RESNET50_RANGE_MODEL_DIR} ${QUANT2_RESNET50_RANGE_MODEL_ARCHIVE})
+	inference_quant2_int8_image_classification_test(test_quant2_int8_resnet50_range_mkldnn ${QUANT2_RESNET50_RANGE_MODEL_DIR}/ResNet50_qat_range ${FP32_RESNET50_MODEL_DIR}/model ${IMAGENET_DATA_PATH})
 
-	# QAT2 ResNet50 with input/output scales in `fake_quantize_range_abs_max` operators and the `out_threshold` attributes,
+	# Quant2 ResNet50 with input/output scales in `fake_quantize_range_abs_max` operators and the `out_threshold` attributes,
 	# with weight scales in `fake_channel_wise_dequantize_max_abs` operators
-	set(QAT2_RESNET50_CHANNELWISE_MODEL_DIR "${QAT_INSTALL_DIR}/ResNet50_qat_channelwise")
-	set(QAT2_RESNET50_CHANNELWISE_MODEL_ARCHIVE "ResNet50_qat_channelwise.tar.gz")
-	download_qat_model(${QAT2_RESNET50_CHANNELWISE_MODEL_DIR} ${QAT2_RESNET50_CHANNELWISE_MODEL_ARCHIVE})
-	inference_qat2_int8_image_classification_test(test_qat2_int8_resnet50_channelwise_mkldnn ${QAT2_RESNET50_CHANNELWISE_MODEL_DIR}/ResNet50_qat_channelwise ${FP32_RESNET50_MODEL_DIR}/model ${IMAGENET_DATA_PATH} ${QAT2_IC_OPS_TO_QUANTIZE})
-
-	# QAT2 MobileNetV1
-        set(QAT2_MOBILENETV1_MODEL_DIR "${QAT_INSTALL_DIR}/MobileNet_qat_perf")
+	set(QUANT2_RESNET50_CHANNELWISE_MODEL_DIR "${QUANT_INSTALL_DIR}/ResNet50_quant2_channelwise")
+	set(QUANT2_RESNET50_CHANNELWISE_MODEL_ARCHIVE "ResNet50_qat_channelwise.tar.gz")
+	download_quant_model(${QUANT2_RESNET50_CHANNELWISE_MODEL_DIR} ${QUANT2_RESNET50_CHANNELWISE_MODEL_ARCHIVE})
+	inference_quant2_int8_image_classification_test(test_quant2_int8_resnet50_channelwise_mkldnn ${QUANT2_RESNET50_CHANNELWISE_MODEL_DIR}/ResNet50_qat_channelwise ${FP32_RESNET50_MODEL_DIR}/model ${IMAGENET_DATA_PATH})
+
+	# Quant2 MobileNetV1
+        set(QUANT2_MOBILENETV1_MODEL_DIR "${QUANT_INSTALL_DIR}/MobileNetV1_quant2")
+	set(QUANT2_MOBILENETV1_MODEL_ARCHIVE "MobileNet_qat_perf.tar.gz")
+	download_quant_model(${QUANT2_MOBILENETV1_MODEL_DIR} ${QUANT2_MOBILENETV1_MODEL_ARCHIVE})
 	set(FP32_MOBILENETV1_MODEL_DIR "${INT8_INSTALL_DIR}/mobilenetv1")
-	set(QAT2_MOBILENETV1_MODEL_ARCHIVE "MobileNet_qat_perf.tar.gz")
-	download_qat_model(${QAT2_MOBILENETV1_MODEL_DIR} ${QAT2_MOBILENETV1_MODEL_ARCHIVE})
-	inference_qat2_int8_image_classification_test(test_qat2_int8_mobilenetv1_mkldnn ${QAT2_MOBILENETV1_MODEL_DIR}/MobileNet_qat_perf/float ${FP32_MOBILENETV1_MODEL_DIR}/model ${IMAGENET_DATA_PATH} ${QAT2_IC_OPS_TO_QUANTIZE})
+	inference_quant2_int8_image_classification_test(test_quant2_int8_mobilenetv1_mkldnn ${QUANT2_MOBILENETV1_MODEL_DIR}/MobileNet_qat_perf/float ${FP32_MOBILENETV1_MODEL_DIR}/model ${IMAGENET_DATA_PATH})
 	
-	### QATv2 for NLP
+	### Quant2 for NLP
 
 	set(NLP_DATA_ARCHIVE "Ernie_dataset.tar.gz")
 	set(NLP_DATA_DIR "${INFERENCE_DEMO_INSTALL_DIR}/Ernie_dataset")
 	set(NLP_DATA_PATH "${NLP_DATA_DIR}/Ernie_dataset/1.8w.bs1")
 	set(NLP_LABLES_PATH "${NLP_DATA_DIR}/Ernie_dataset/label.xnli.dev")
-	download_qat_data(${NLP_DATA_DIR} ${NLP_DATA_ARCHIVE})
+	download_quant_data(${NLP_DATA_DIR} ${NLP_DATA_ARCHIVE})
+
+	set(QUANT2_NLP_OPS_TO_QUANTIZE "fc,reshape2,transpose2,matmul,elementwise_add")
 
-	# QAT2 Ernie
-	set(QAT2_ERNIE_MODEL_ARCHIVE "ernie_qat.tar.gz")
-	set(QAT2_ERNIE_MODEL_DIR "${QAT_INSTALL_DIR}/Ernie_qat")
-	download_qat_model(${QAT2_ERNIE_MODEL_DIR} ${QAT2_ERNIE_MODEL_ARCHIVE})
+	# Quant2 Ernie
+	set(QUANT2_ERNIE_MODEL_ARCHIVE "ernie_qat.tar.gz")
+	set(QUANT2_ERNIE_MODEL_DIR "${QUANT_INSTALL_DIR}/Ernie_quant2")
+	download_quant_model(${QUANT2_ERNIE_MODEL_DIR} ${QUANT2_ERNIE_MODEL_ARCHIVE})
 	set(FP32_ERNIE_MODEL_ARCHIVE "ernie_fp32_model.tar.gz")
-	set(FP32_ERNIE_MODEL_DIR "${QAT_INSTALL_DIR}/Ernie_float")
-	download_qat_fp32_model(${FP32_ERNIE_MODEL_DIR} ${FP32_ERNIE_MODEL_ARCHIVE})
-	inference_qat2_int8_nlp_test(test_qat2_int8_ernie_mkldnn ${QAT2_ERNIE_MODEL_DIR}/Ernie_qat/float ${FP32_ERNIE_MODEL_DIR}/ernie_fp32_model ${NLP_DATA_PATH} ${NLP_LABLES_PATH})
+	set(FP32_ERNIE_MODEL_DIR "${QUANT_INSTALL_DIR}/Ernie_float")
+	download_quant_fp32_model(${FP32_ERNIE_MODEL_DIR} ${FP32_ERNIE_MODEL_ARCHIVE})
+	inference_quant2_int8_nlp_test(test_quant2_int8_ernie_mkldnn ${QUANT2_ERNIE_MODEL_DIR}/Ernie_qat/float ${FP32_ERNIE_MODEL_DIR}/ernie_fp32_model ${NLP_DATA_PATH} ${NLP_LABLES_PATH} ${QUANT2_NLP_OPS_TO_QUANTIZE})
 
-	### Save QAT2 FP32 model or QAT2 INT8 model
+	### Save FP32 model or INT8 model from Quant model
         
-	set(QAT2_INT8_RESNET50_SAVE_PATH "${QAT_INSTALL_DIR}/ResNet50_qat2_int8")
-	set(QAT2_FP32_RESNET50_SAVE_PATH "${QAT_INSTALL_DIR}/ResNet50_qat2_fp32")
-	save_qat_ic_model_test(save_qat2_model_resnet50 ${QAT2_RESNET50_MODEL_DIR}/ResNet50_qat_perf/float ${QAT2_FP32_RESNET50_SAVE_PATH} ${QAT2_INT8_RESNET50_SAVE_PATH} ${QAT2_IC_OPS_TO_QUANTIZE})
+	set(QUANT2_INT8_RESNET50_SAVE_PATH "${QUANT_INSTALL_DIR}/ResNet50_quant2_int8")
+	set(QUANT2_FP32_RESNET50_SAVE_PATH "${QUANT_INSTALL_DIR}/ResNet50_quant2_fp32")
+	save_quant_ic_model_test(save_quant2_model_resnet50 ${QUANT2_RESNET50_MODEL_DIR}/ResNet50_qat_perf/float ${QUANT2_FP32_RESNET50_SAVE_PATH} ${QUANT2_INT8_RESNET50_SAVE_PATH})
 
-	set(QAT2_INT8_ERNIE_SAVE_PATH "${QAT_INSTALL_DIR}/Ernie_qat2_int8")
-	set(QAT2_FP32_ERNIE_SAVE_PATH "${QAT_INSTALL_DIR}/Ernie_qat2_fp32")
-	save_qat_nlp_model_test(save_qat2_model_ernie ${QAT2_ERNIE_MODEL_DIR}/Ernie_qat/float ${QAT2_FP32_ERNIE_SAVE_PATH} ${QAT2_INT8_ERNIE_SAVE_PATH})
+	set(QUANT2_INT8_ERNIE_SAVE_PATH "${QUANT_INSTALL_DIR}/Ernie_quant2_int8")
+	set(QUANT2_FP32_ERNIE_SAVE_PATH "${QUANT_INSTALL_DIR}/Ernie_quant2_fp32")
+	save_quant_nlp_model_test(save_quant2_model_ernie ${QUANT2_ERNIE_MODEL_DIR}/Ernie_qat/float ${QUANT2_FP32_ERNIE_SAVE_PATH} ${QUANT2_INT8_ERNIE_SAVE_PATH} ${QUANT2_NLP_OPS_TO_QUANTIZE})
 
-	# Convert QAT2 model to dot and pdf files 
-	set(QAT2_INT8_ERNIE_DOT_SAVE_PATH "${QAT_INSTALL_DIR}/Ernie_qat2_int8_dot_file")
-	convert_model2dot_test(convert_model2dot_ernie ${QAT2_ERNIE_MODEL_DIR}/Ernie_qat/float ${QAT2_INT8_ERNIE_DOT_SAVE_PATH} "Ernie_qat2_int8")
+	# Convert Quant2 model to dot and pdf files 
+	set(QUANT2_INT8_ERNIE_DOT_SAVE_PATH "${QUANT_INSTALL_DIR}/Ernie_quant2_int8_dot_file")
+	convert_model2dot_test(convert_model2dot_ernie ${QUANT2_ERNIE_MODEL_DIR}/Ernie_qat/float ${QUANT2_INT8_ERNIE_DOT_SAVE_PATH} "Ernie_quant2_int8")
 
 endif()
 
-# Since the tests for QAT FP32 & INT8 comparison support only testing on Linux 
+# Since the tests for Quant & INT8 comparison support only testing on Linux 
 # with MKL-DNN, we remove it here to not test it on other systems.
 list(REMOVE_ITEM TEST_OPS
 	test_mkldnn_int8_quantization_strategy
-	qat_int8_image_classification_comparison
-	qat_int8_nlp_comparison)
+	quant_int8_image_classification_comparison
+	quant_int8_nlp_comparison)
 
 #TODO(wanghaoshuang): Fix this unitest failed on GCC8.
 LIST(REMOVE_ITEM TEST_OPS test_auto_pruning)
 LIST(REMOVE_ITEM TEST_OPS test_filter_pruning)
+LIST(REMOVE_ITEM TEST_OPS test_user_defined_quantization)
 foreach(src ${TEST_OPS})
     py_test(${src} SRCS ${src}.py)
 endforeach()
diff --git a/python/paddle/fluid/contrib/slim/tests/QAT_mkldnn_int8_readme.md b/python/paddle/fluid/contrib/slim/tests/README.md
similarity index 69%
rename from python/paddle/fluid/contrib/slim/tests/QAT_mkldnn_int8_readme.md
rename to python/paddle/fluid/contrib/slim/tests/README.md
index b0665d1684c9acc794dea56bea5bb61b050037ca..169cb686168f8cf343dc3ee52adc5519da4fb8ab 100644
--- a/python/paddle/fluid/contrib/slim/tests/QAT_mkldnn_int8_readme.md
+++ b/python/paddle/fluid/contrib/slim/tests/README.md
@@ -1,16 +1,16 @@
 # SLIM Quantization-aware training (QAT) for INT8 MKL-DNN
 
-This document describes how to use [Paddle Slim](https://paddlepaddle.github.io/PaddleSlim/index.html) to convert a quantization-aware trained model into INT8 MKL-DNN quantized model and run it.
+This document describes how to use [Paddle Slim](https://paddlepaddle.github.io/PaddleSlim/index.html) to convert a quantization-aware trained model (Quant model) into INT8 MKL-DNN quantized model and run it.
 
-In **Release 1.5**, we have released the first approach to the MKL-DNN-based quantization of QAT models, called QAT1. It enabled the `conv2d` and `mul` INT8 MKL-DNN kernels for QAT trained models (GoogleNet, MobileNetV1, MobileNetV2, ResNet50, ResNet101, VGG16, and VGG19) with 0.05% accuracy diff.
+In **Release 1.5**, we have released the first approach to the MKL-DNN-based quantization of Quant models, called Quant1. It enabled the `conv2d` and `mul` INT8 MKL-DNN kernels for Quant trained models (GoogleNet, MobileNetV1, MobileNetV2, ResNet50, ResNet101, VGG16, and VGG19) with 0.05% accuracy diff.
 
-In **Release 1.6**, a new approach was introduced, called QAT2, which adds support for more performance optimizations and more INT8 MKL-DNN kernels. INT8 MKL-DNN models obtained using QAT2 have much better inference performance than using QAT1, with only a little bit bigger accuracy diff.
+In **Release 1.6**, a new approach was introduced, called Quant2, which adds support for more performance optimizations and more INT8 MKL-DNN kernels. INT8 MKL-DNN models obtained using Quant2 have much better inference performance than using Quant1, with only a little bit bigger accuracy diff.
 
-In **Release 1.7**, a support for [Ernie (NLP) QAT trained model](https://github.com/PaddlePaddle/benchmark/tree/master/Inference/c%2B%2B/ernie/mkldnn) was added to the QAT2.
+In **Release 1.7**, a support for [Ernie (NLP) Quant trained model](https://github.com/PaddlePaddle/benchmark/tree/master/Inference/c%2B%2B/ernie/mkldnn) was added to the Quant2.
 
-In **Release 2.0**, further optimizations were added to the QAT2: INT8 `matmul` kernel, inplace execution of activation and `elementwise_add` operators, and broader support for quantization aware strategy from PaddleSlim.
+In **Release 2.0**, further optimizations were added to the Quant2: INT8 `matmul` kernel, inplace execution of activation and `elementwise_add` operators, and broader support for quantization aware strategy from PaddleSlim.
 
-In this document we focus on the QAT2 approach only. 
+In this document we focus on the Quant2 approach only. 
 
 ## 0. Prerequisites
 * PaddlePaddle in version 2.0 or higher is required. For instructions on how to install it see the [installation document](https://www.paddlepaddle.org.cn/install/quick).
@@ -20,15 +20,15 @@ In this document we focus on the QAT2 approach only.
 
 ## 1. Introduction
 
-There are two forms of quantization supported in PaddlePaddle: [post-training quantization](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/api/int8_mkldnn_quantization.md) (PTQ) and quantization-aware training (QAT). Using both PTQ and QAT a user can convert models created by PaddleSlim into INT8 models and run INT8 inference on CPU. PTQ is more automatic and requires less model preparation than QAT, but usually QAT gives better accuracy with similar performance. In this document we focus on QAT2 approach to the QAT and INT8 quantization.
+There are two approaches to quantization supported in PaddlePaddle: [post-training quantization](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/api/int8_mkldnn_quantization.md) (PTQ) and quantization-aware training (QAT). Using both PTQ and QAT a user can convert models created by PaddleSlim into INT8 models and run INT8 inference on CPU. PTQ is more automatic and requires less model preparation. However, QAT usually gives better accuracy with similar performance. In this document we focus on a transformation from intermediate models obtained during the QAT process (Quant models) into MKL-DNN INT8 models. We call this procedure Quant2.
 
-## 2. How to turn an FP32 model into a QAT model?
+## 2. How to turn an FP32 model into a Quant model?
 
-A procedure on how to transform an FP32 model into a QAT model supported by the QAT2 approach is described in [this document](https://github.com/PaddlePaddle/PaddleSlim/blob/80c9fab3f419880dd19ca6ea30e0f46a2fedf6b3/demo/mkldnn_quant/quant_aware/PaddleCV_mkldnn_quantaware_tutorial.md).
+A procedure on how to transform an FP32 model into a Quant model supported by the Quant2 approach is described in [this document](https://github.com/PaddlePaddle/PaddleSlim/blob/develop/demo/mkldnn_quant/README.md).
 
-## 3. How to turn a QAT model into an INT8 MKL-DNN model?
+## 3. How to turn a Quant model into an INT8 MKL-DNN model?
 
-A QAT model can be transformed into an INT8 quantized model if it contains enough information about quantization scales for every quantized operator in the graph. The process of quantization is done by the `Qat2Int8MkldnnPass` pass which comprises several steps:
+A Quant model can be transformed into an INT8 quantized model if it contains enough information about quantization scales for every quantized operator in the graph. The process of quantization is done by the `Quant2Int8MkldnnPass` pass which comprises several steps:
 
 ### Gathering scales
 
@@ -51,7 +51,7 @@ Notes:
    ```... → input1 → conv2d → output1 → batch_norm → output2 → relu → output3 → ...```
    and we want to quantize the `conv2d` op, then after applying FP32 optimizations the sequence will become
    ```... → input1 → conv2d → output3 → ...```
-   and the quantization scales have to be collected for the `input1` and `outpu3` tensors in the QAT model.
+   and the quantization scales have to be collected for the `input1` and `outpu3` tensors in the Quant model.
 2. Quantization of the following operators is supported: `conv2d`, `depthwise_conv2d`, `mul`, `fc`, `matmul`, `pool2d`, `reshape2`, `transpose2`, `concat`.
 3. The longest sequence of consecutive quantizable operators in the model, the biggest performance boost can be achieved through quantization:
    ```... → conv2d → conv2d → pool2d → conv2d → conv2d → ...``` 
@@ -64,7 +64,7 @@ All the `fake_quantize_*` and `fake_dequantize_*` operators are being removed fr
 
 ### Dequantizing weights
 
-Weights of `conv2d`, `depthwise_conv2d` and `mul` operators are assumed to be fake-quantized (with integer values in the `int8` range, but kept as `float`s) in QAT models. Here, the information about the scale from `fake_dequantize_max_abs` and `fake_channel_wise_dequantize_max_abs` operators is used to fake-dequantize the weights back to the full float range of values. At this moment the model becomes an unoptimized clean FP32 inference model.
+Weights of `conv2d`, `depthwise_conv2d` and `mul` operators are assumed to be fake-quantized (with integer values in the `int8` range, but kept as `float`s) in Quant models. Here, the information about the scale from `fake_dequantize_max_abs` and `fake_channel_wise_dequantize_max_abs` operators is used to fake-dequantize the weights back to the full float range of values. At this moment the model becomes an unoptimized clean FP32 inference model.
 
 ### Optimizing FP32 graph
 
@@ -88,11 +88,11 @@ Having gathered all the data needed for quantization we apply the `cpu_quantize_
 
 ## 4. Code example
 
-The code snipped shows how the `Qat2Int8MkldnnPass` can be applied to a model graph:
+The code snipped shows how the `Quant2Int8MkldnnPass` can be applied to a model graph:
 
 ```python
     import paddle.fluid as fluid
-    from paddle.fluid.contrib.slim.quantization import Qat2Int8MkldnnPass
+    from paddle.fluid.contrib.slim.quantization import Quant2Int8MkldnnPass
     from paddle.fluid.framework import IrGraph
     from paddle.fluid import core	
     
@@ -100,16 +100,16 @@ The code snipped shows how the `Qat2Int8MkldnnPass` can be applied to a model gr
     graph = IrGraph(core.Graph(fluid.Program().desc), for_test=False)
     place = fluid.CPUPlace()
     # Convert the IrGraph to MKL-DNN supported INT8 IrGraph using the
-    # Qat2Int8MkldnnPass. It requires a list of operators to be quantized
-    mkldnn_pass = Qat2Int8MkldnnPass({'conv2d', 'pool2d'}, fluid.global_scope(), place, fluid.core, False)
-    # Apply Qat2Int8MkldnnPass to IrGraph
+    # Quant2Int8MkldnnPass. It requires a list of operators to be quantized
+    mkldnn_pass = Quant2Int8MkldnnPass({'conv2d', 'pool2d'}, fluid.global_scope(), place, fluid.core, False)
+    # Apply Quant2Int8MkldnnPass to IrGraph
     mkldnn_pass.apply(graph)
 
 ```
 
 ## 5. Accuracy and Performance benchmark
 
-This section contain QAT2 MKL-DNN accuracy and performance benchmark results measured on the following server:
+This section contain Quant2 MKL-DNN accuracy and performance benchmark results measured on the following server:
 
 * Intel(R) Xeon(R) Gold 6271 (with AVX512 VNNI support),
 
@@ -134,7 +134,7 @@ Performance benchmarks were run with the following environment settings:
 
 >**Intel(R) Xeon(R) Gold 6271**
 
-|    Model     | FP32 Top1 Accuracy | INT8 QAT Top1 Accuracy | Top1 Diff | FP32 Top5 Accuracy | INT8 QAT Top5 Accuracy | Top5 Diff |
+|    Model     | FP32 Top1 Accuracy | INT8 Quant Top1 Accuracy | Top1 Diff | FP32 Top5 Accuracy | INT8 Quant Top5 Accuracy | Top5 Diff |
 | :----------: | :----------------: | :--------------------: | :-------: | :----------------: | :--------------------: | :-------: |
 | MobileNet-V1 |       70.78%       |         70.71%         |  -0.07%   |       89.69%       |         89.41%         |  -0.28%   |
 | MobileNet-V2 |       71.90%       |         72.11%         |  +0.21%   |       90.56%       |         90.62%         |  +0.06%   |
@@ -150,7 +150,7 @@ Image classification models performance was measured using a single thread. The
 
 >**Intel(R) Xeon(R) Gold 6271**
 
-|    Model     | FP32 (images/s) | INT8 QAT (images/s) | Ratio (INT8/FP32)  |
+|    Model     | FP32 (images/s) | INT8 Quant (images/s) | Ratio (INT8/FP32)  |
 | :----------: | :-------------: | :-----------------: | :---------------:  |
 | MobileNet-V1 |      74.05      |       196.98        |      2.66          |
 | MobileNet-V2 |      88.60      |       187.67        |      2.12          |
@@ -169,7 +169,7 @@ Notes:
 
 >**Intel(R) Xeon(R) Gold 6271**
 
-|     Model    |  FP32 Accuracy | QAT INT8 Accuracy | Accuracy Diff |
+|     Model    |  FP32 Accuracy | Quant INT8 Accuracy | Accuracy Diff |
 |:------------:|:----------------------:|:----------------------:|:---------:|
 |   Ernie      |      80.20%            |        79.44%        |  -0.76%  |
 
@@ -179,7 +179,7 @@ Notes:
 
 >**Intel(R) Xeon(R) Gold 6271**
 
-|  Model  |     Threads  | FP32 Latency (ms) | QAT INT8 Latency (ms)    | Ratio (FP32/INT8) |
+|  Model  |     Threads  | FP32 Latency (ms) | Quant INT8 Latency (ms)    | Ratio (FP32/INT8) |
 |:------------:|:----------------------:|:-------------------:|:---------:|:---------:|
 | Ernie | 1 thread     |       237.21        |     79.26    |   2.99x    |
 | Ernie | 20 threads   |       22.08         |     12.57    |   1.76x    |
@@ -188,7 +188,7 @@ Notes:
 ## 6. How to reproduce the results
 
 The steps below show, taking ResNet50 as an example, how to reproduce the above accuracy and performance results for Image Classification models. 
-To reproduce NLP models results (Ernie), please follow [How to reproduce Ernie QAT results on MKL-DNN](https://github.com/PaddlePaddle/benchmark/tree/master/Inference/c%2B%2B/ernie/mkldnn/README.md).
+To reproduce NLP models results (Ernie), please follow [How to reproduce Ernie Quant results on MKL-DNN](https://github.com/PaddlePaddle/benchmark/tree/master/Inference/c%2B%2B/ernie/mkldnn/README.md).
 
 ### Prepare dataset
 
@@ -202,18 +202,18 @@ The converted data binary file is saved by default in `$HOME/.cache/paddle/datas
 
 ### Prepare models
 
-Run the following commands to download and extract QAT model:
+Run the following commands to download and extract Quant model:
 
 ```bash
 mkdir -p /PATH/TO/DOWNLOAD/MODEL/
 cd /PATH/TO/DOWNLOAD/MODEL/
-export QAT_MODEL_NAME=resnet50
-export QAT_MODEL_ARCHIVE=${QAT_MODEL_NAME}_quant.tar.gz
-wget http://paddle-inference-dist.bj.bcebos.com/int8/QAT2_models/${QAT_MODEL_ARCHIVE}
-mkdir ${QAT_MODEL_NAME} && tar -xvf ${QAT_MODEL_ARCHIVE} -C ${QAT_MODEL_NAME}
+export QUANT_MODEL_NAME=resnet50
+export QUANT_MODEL_ARCHIVE=${QUANT_MODEL_NAME}_quant.tar.gz
+wget http://paddle-inference-dist.bj.bcebos.com/int8/QAT2_models/${QUANT_MODEL_ARCHIVE}
+mkdir ${QUANT_MODEL_NAME} && tar -xvf ${QUANT_MODEL_ARCHIVE} -C ${QUANT_MODEL_NAME}
 ```
 
-To download other QAT models, set the `QAT_MODEL_NAME` variable in the above commands to one of the values: `resnet101`, `mobilenetv1`, `mobilenetv2`, `vgg16`, `vgg19`.
+To download other Quant models, set the `QUANT_MODEL_NAME` variable in the above commands to one of the values: `resnet101`, `mobilenetv1`, `mobilenetv2`, `vgg16`, `vgg19`.
 
 Download clean FP32 model for accuracy comparison against the INT8 model:
 
@@ -231,23 +231,23 @@ To download other FP32 models, set the `FP32_MODEL_NAME` variable to on of the v
 
 #### Accuracy benchmark commands
 
-You can use the `qat2_int8_image_classification_comparison.py` script to reproduce the accuracy result of the INT8 QAT models. The following options are required:
+You can use the `quant2_int8_image_classification_comparison.py` script to reproduce the accuracy result of the INT8 Quant models. The following options are required:
 
-* `--qat_model` - a path to a QAT model that will be transformed into INT8 model.
+* `--quant_model` - a path to a Quant model that will be transformed into INT8 model.
 * `--fp32_model` - a path to an FP32 model whose accuracy will be measured and compared to the accuracy of the INT8 model.
 * `--infer_data` - a path to the validation dataset.
 
 The following options are also accepted:
-* `--ops_to_quantize` - a comma-separated list of operator types to quantize. If the option is not used, an attempt to quantize all quantizable operators will be made, and in that case only quantizable operators which have quantization scales provided in the QAT model will be quantized. When deciding which operators to put on the list, the following have to be considered:
+* `--ops_to_quantize` - a comma-separated list of operator types to quantize. If the option is not used, an attempt to quantize all quantizable operators will be made, and in that case only quantizable operators which have quantization scales provided in the Quant model will be quantized. When deciding which operators to put on the list, the following have to be considered:
   * Only operators which support quantization will be taken into account.
   * All the quantizable operators from the list, which are present in the model, must have quantization scales provided in the model. Otherwise, quantization of the operator will be skipped with a message saying which variable is missing a quantization scale.
   * Sometimes it may be suboptimal to quantize all quantizable operators in the model (cf. *Notes* in the **Gathering scales** section above). To find the optimal configuration for this option, user can run benchmark a few times with different lists of quantized operators present in the model and compare the results. For Image Classification models mentioned above the list usually comprises of `conv2d` and `pool2d` operators.
-* `--op_ids_to_skip` - a comma-separated list of operator ids to skip in quantization. To get an id of a particular operator run the script with the `--debug` option first (see below for the description of the option), and having opened the generated file `qat_int8_cpu_quantize_placement_pass.dot` find the id number written in parentheses next to the name of the operator.
+* `--op_ids_to_skip` - a comma-separated list of operator ids to skip in quantization. To get an id of a particular operator run the script with the `--debug` option first (see below for the description of the option), and having opened the generated file `int8_<some_number>_cpu_quantize_placement_pass.dot` find the id number written in parentheses next to the name of the operator.
 * `--debug` - add this option to generate a series of `*.dot` files containing the model graphs after each step of the transformation. For a description of the DOT format see [DOT]( https://graphviz.gitlab.io/_pages/doc/info/lang.html). The files will be saved in the current location. To open the `*.dot` files use any of the Graphviz tools available on your system (e.g. `xdot` tool on Linux or `dot` tool on Windows, for documentation see [Graphviz](http://www.graphviz.org/documentation/)).
 
 ```bash
 cd /PATH/TO/PADDLE
-OMP_NUM_THREADS=28 FLAGS_use_mkldnn=true python python/paddle/fluid/contrib/slim/tests/qat2_int8_image_classification_comparison.py --qat_model=/PATH/TO/DOWNLOADED/QAT/MODEL --fp32_model=/PATH/TO/DOWNLOADED/FP32/MODEL --infer_data=$HOME/.cache/paddle/dataset/int8/download/int8_full_val.bin --batch_size=50 --batch_num=1000 --acc_diff_threshold=0.01 --ops_to_quantize="conv2d,pool2d"
+OMP_NUM_THREADS=28 FLAGS_use_mkldnn=true python python/paddle/fluid/contrib/slim/tests/quant2_int8_image_classification_comparison.py --quant_model=/PATH/TO/DOWNLOADED/QUANT/MODEL --fp32_model=/PATH/TO/DOWNLOADED/FP32/MODEL --infer_data=$HOME/.cache/paddle/dataset/int8/download/int8_full_val.bin --batch_size=50 --batch_num=1000 --acc_diff_threshold=0.01 --ops_to_quantize="conv2d,pool2d"
 ```
 
 > Notes: Due to a large amount of images in the `int8_full_val.bin` dataset (50 000), the accuracy benchmark may last long. To accelerate accuracy measuring, it is recommended to set `OMP_NUM_THREADS` to the maximum number of physical cores available on the server.
@@ -256,16 +256,16 @@ OMP_NUM_THREADS=28 FLAGS_use_mkldnn=true python python/paddle/fluid/contrib/slim
 
 To reproduce the performance results, the environment variable `OMP_NUM_THREADS=1` and `--batch_size=1` option should be set.
 
-1. Transform the QAT model into INT8 model by applying the `Qat2Int8MkldnnPass` pass and save the result. You can use the script `save_qat_model.py` for this purpose. It also accepts the option `--ops_to_quantize` with a list of operators to quantize.
+1. Transform the Quant model into INT8 model by applying the `Quant2Int8MkldnnPass` pass and save the result. You can use the script `save_quant_model.py` for this purpose. It also accepts the option `--ops_to_quantize` with a list of operators to quantize.
 
    ```bash
    cd /PATH/TO/PADDLE/build
-   python ../python/paddle/fluid/contrib/slim/tests/save_qat_model.py --qat_model_path=/PATH/TO/DOWNLOADED/QAT/MODEL --int8_model_save_path=/PATH/TO/SAVE/QAT/INT8/MODEL --ops_to_quantize="conv2d,pool2d"
+   python ../python/paddle/fluid/contrib/slim/tests/save_quant_model.py --quant_model_path=/PATH/TO/DOWNLOADED/QUANT/MODEL --int8_model_save_path=/PATH/TO/SAVE/QUANT/INT8/MODEL --ops_to_quantize="conv2d,pool2d"
    ```
 
 2. Run the C-API test for performance benchmark.
 
    ```bash
    cd /PATH/TO/PADDLE/build
-   OMP_NUM_THREADS=1 paddle/fluid/inference/tests/api/test_analyzer_qat_image_classification ARGS --enable_fp32=false --with_accuracy_layer=false --int8_model=/PATH/TO/SAVED/QAT/INT8/MODEL --infer_data=$HOME/.cache/paddle/dataset/int8/download/int8_full_val.bin --batch_size=1 --paddle_num_threads=1
+   OMP_NUM_THREADS=1 paddle/fluid/inference/tests/api/test_analyzer_quant_image_classification ARGS --enable_fp32=false --with_accuracy_layer=false --int8_model=/PATH/TO/SAVED/QUANT/INT8/MODEL --infer_data=$HOME/.cache/paddle/dataset/int8/download/int8_full_val.bin --batch_size=1 --paddle_num_threads=1
    ```
diff --git a/python/paddle/fluid/contrib/slim/tests/qat2_int8_image_classification_comparison.py b/python/paddle/fluid/contrib/slim/tests/quant2_int8_image_classification_comparison.py
similarity index 74%
rename from python/paddle/fluid/contrib/slim/tests/qat2_int8_image_classification_comparison.py
rename to python/paddle/fluid/contrib/slim/tests/quant2_int8_image_classification_comparison.py
index a6d81e06bc04c31a566b76bd3e9296142d984e25..77c925a1b111a243a2749eacf1b4d42b9cfb379b 100644
--- a/python/paddle/fluid/contrib/slim/tests/qat2_int8_image_classification_comparison.py
+++ b/python/paddle/fluid/contrib/slim/tests/quant2_int8_image_classification_comparison.py
@@ -24,7 +24,7 @@ import time
 import paddle
 import paddle.fluid as fluid
 from paddle.fluid.framework import IrGraph
-from paddle.fluid.contrib.slim.quantization import Qat2Int8MkldnnPass
+from paddle.fluid.contrib.slim.quantization import Quant2Int8MkldnnPass
 from paddle.fluid import core
 
 logging.basicConfig(format='%(asctime)s-%(levelname)s: %(message)s')
@@ -42,7 +42,7 @@ def parse_args():
         help='Number of the first minibatches to skip in performance statistics.'
     )
     parser.add_argument(
-        '--qat_model', type=str, default='', help='A path to a QAT model.')
+        '--quant_model', type=str, default='', help='A path to a Quant model.')
     parser.add_argument(
         '--fp32_model', type=str, default='', help='A path to an FP32 model.')
     parser.add_argument('--infer_data', type=str, default='', help='Data file.')
@@ -71,15 +71,15 @@ def parse_args():
     parser.add_argument(
         '--debug',
         action='store_true',
-        help='If used, the graph of QAT model is drawn.')
+        help='If used, the graph of Quant model is drawn.')
 
     test_args, args = parser.parse_known_args(namespace=unittest)
     return test_args, sys.argv[:1] + args
 
 
-class Qat2Int8ImageClassificationComparisonTest(unittest.TestCase):
+class Quant2Int8ImageClassificationComparisonTest(unittest.TestCase):
     """
-    Test for accuracy comparison of FP32 and QAT2 INT8 Image Classification inference.
+    Test for accuracy comparison of FP32 and Quant2 INT8 Image Classification inference.
     """
 
     def _reader_creator(self, data_file='data.bin'):
@@ -167,7 +167,8 @@ class Qat2Int8ImageClassificationComparisonTest(unittest.TestCase):
                  batch_size=1,
                  batch_num=1,
                  skip_batch_num=0,
-                 transform_to_int8=False):
+                 target='quant'):
+        assert target in ['quant', 'int8', 'fp32']
         place = fluid.CPUPlace()
         exe = fluid.Executor(place)
         inference_scope = fluid.executor.global_scope()
@@ -182,18 +183,20 @@ class Qat2Int8ImageClassificationComparisonTest(unittest.TestCase):
 
             graph = IrGraph(core.Graph(inference_program.desc), for_test=True)
             if (self._debug):
-                graph.draw('.', 'qat_orig', graph.all_op_nodes())
-            if (transform_to_int8):
-                transform_to_mkldnn_int8_pass = Qat2Int8MkldnnPass(
-                    self._quantized_ops,
-                    _op_ids_to_skip=self._op_ids_to_skip,
-                    _scope=inference_scope,
-                    _place=place,
-                    _core=core,
-                    _debug=self._debug)
-                graph = transform_to_mkldnn_int8_pass.apply(graph)
-            else:
+                graph.draw('.', 'quant_orig', graph.all_op_nodes())
+            quant_transform_pass = Quant2Int8MkldnnPass(
+                self._quantized_ops,
+                _op_ids_to_skip=self._op_ids_to_skip,
+                _scope=inference_scope,
+                _place=place,
+                _core=core,
+                _debug=self._debug)
+            if (target == 'quant'):
                 graph = self._prepare_for_fp32_mkldnn(graph)
+            elif (target == 'int8'):
+                graph = quant_transform_pass.apply(graph)
+            else:  # target == fp32
+                graph = quant_transform_pass.prepare_and_optimize_fp32(graph)
 
             inference_program = graph.to_program()
 
@@ -222,18 +225,7 @@ class Qat2Int8ImageClassificationComparisonTest(unittest.TestCase):
                 images = np.array(images).astype('float32')
                 labels = np.array([x[1] for x in data]).astype('int64')
 
-                if (transform_to_int8 == True):
-                    # QAT INT8 models do not have accuracy measuring layers
-                    start = time.time()
-                    out = exe.run(inference_program,
-                                  feed={feed_target_names[0]: images},
-                                  fetch_list=fetch_targets)
-                    batch_time = (time.time() - start) * 1000  # in miliseconds
-                    outputs.append(out[0])
-                    # Calculate accuracy result
-                    batch_acc1, batch_acc5 = self._get_batch_accuracy(out[0],
-                                                                      labels)
-                else:
+                if (target == 'fp32'):
                     # FP32 models have accuracy measuring layers
                     labels = labels.reshape([-1, 1])
                     start = time.time()
@@ -246,6 +238,18 @@ class Qat2Int8ImageClassificationComparisonTest(unittest.TestCase):
                     batch_time = (time.time() - start) * 1000  # in miliseconds
                     batch_acc1, batch_acc5 = out[1][0], out[2][0]
                     outputs.append(batch_acc1)
+                else:
+                    # Quant INT8 models do not have accuracy measuring layers
+                    start = time.time()
+                    out = exe.run(inference_program,
+                                  feed={feed_target_names[0]: images},
+                                  fetch_list=fetch_targets)
+                    batch_time = (time.time() - start) * 1000  # in miliseconds
+                    outputs.append(out[0])
+                    # Calculate accuracy result
+                    batch_acc1, batch_acc5 = self._get_batch_accuracy(out[0],
+                                                                      labels)
+
                 infer_accs1.append(batch_acc1)
                 infer_accs5.append(batch_acc5)
                 samples = len(data)
@@ -274,39 +278,47 @@ class Qat2Int8ImageClassificationComparisonTest(unittest.TestCase):
 
             return outputs, acc1_avg, acc5_avg, fps_avg, latency_avg
 
-    def _summarize_performance(self, fp32_fps, fp32_lat, int8_fps, int8_lat):
+    def _print_performance(self, title, fps, lat):
+        _logger.info('{0}: avg fps: {1:.2f}, avg latency: {2:.4f} ms'.format(
+            title, fps, lat))
+
+    def _print_accuracy(self, title, acc1, acc5):
+        _logger.info(
+            '{0}: avg top1 accuracy: {1:.4f}, avg top5 accuracy: {2:.4f}'.
+            format(title, acc1, acc5))
+
+    def _summarize_performance(self, int8_fps, int8_lat, fp32_fps, fp32_lat):
         _logger.info('--- Performance summary ---')
-        _logger.info('FP32: avg fps: {0:.2f}, avg latency: {1:.4f} ms'.format(
-            fp32_fps, fp32_lat))
-        _logger.info('INT8: avg fps: {0:.2f}, avg latency: {1:.4f} ms'.format(
-            int8_fps, int8_lat))
+        self._print_performance('INT8', int8_fps, int8_lat)
+        if fp32_lat >= 0:
+            self._print_performance('FP32', fp32_fps, fp32_lat)
 
-    def _compare_accuracy(self, fp32_acc1, fp32_acc5, int8_acc1, int8_acc5,
-                          threshold):
+    def _summarize_accuracy(self, quant_acc1, quant_acc5, int8_acc1, int8_acc5,
+                            fp32_acc1, fp32_acc5):
         _logger.info('--- Accuracy summary ---')
+        self._print_accuracy('Quant', quant_acc1, quant_acc5)
+        self._print_accuracy('INT8', int8_acc1, int8_acc5)
+        if fp32_acc1 >= 0:
+            self._print_accuracy('FP32', fp32_acc1, fp32_acc5)
+
+    def _compare_accuracy(self, threshold, quant_acc1, int8_acc1):
         _logger.info(
-            'Accepted top1 accuracy drop threshold: {0}. (condition: (FP32_top1_acc - IN8_top1_acc) <= threshold)'
+            'Accepted top1 accuracy drop threshold: {0}. (condition: (Quant_top1_acc - IN8_top1_acc) <= threshold && Quant_top1_acc > 0.5 && INT8_top1_acc > 0.5)'
             .format(threshold))
-        _logger.info(
-            'FP32: avg top1 accuracy: {0:.4f}, avg top5 accuracy: {1:.4f}'.
-            format(fp32_acc1, fp32_acc5))
-        _logger.info(
-            'INT8: avg top1 accuracy: {0:.4f}, avg top5 accuracy: {1:.4f}'.
-            format(int8_acc1, int8_acc5))
-        assert fp32_acc1 > 0.0
-        assert int8_acc1 > 0.0
-        assert fp32_acc1 - int8_acc1 <= threshold
+        # We assume valid accuracy to be at least 0.5
+        assert quant_acc1 > 0.5
+        assert int8_acc1 > 0.5
+        assert quant_acc1 - int8_acc1 <= threshold
 
     def test_graph_transformation(self):
         if not fluid.core.is_compiled_with_mkldnn():
             return
 
-        qat_model_path = test_case_args.qat_model
-        assert qat_model_path, 'The QAT model path cannot be empty. Please, use the --qat_model option.'
-        fp32_model_path = test_case_args.fp32_model
-        assert fp32_model_path, 'The FP32 model path cannot be empty. Please, use the --fp32_model option.'
+        quant_model_path = test_case_args.quant_model
+        assert quant_model_path, 'The Quant model path cannot be empty. Please, use the --quant_model option.'
         data_path = test_case_args.infer_data
         assert data_path, 'The dataset path cannot be empty. Please, use the --infer_data option.'
+        fp32_model_path = test_case_args.fp32_model
         batch_size = test_case_args.batch_size
         batch_num = test_case_args.batch_num
         skip_batch_num = test_case_args.skip_batch_num
@@ -323,9 +335,10 @@ class Qat2Int8ImageClassificationComparisonTest(unittest.TestCase):
             self._op_ids_to_skip = set(
                 map(int, test_case_args.op_ids_to_skip.split(',')))
 
-        _logger.info('FP32 & QAT INT8 prediction run.')
-        _logger.info('QAT model: {}'.format(qat_model_path))
-        _logger.info('FP32 model: {}'.format(fp32_model_path))
+        _logger.info('Quant & INT8 prediction run.')
+        _logger.info('Quant model: {}'.format(quant_model_path))
+        if fp32_model_path:
+            _logger.info('FP32 model: {}'.format(fp32_model_path))
         _logger.info('Dataset: {}'.format(data_path))
         _logger.info('Batch size: {}'.format(batch_size))
         _logger.info('Batch number: {}'.format(batch_num))
@@ -336,30 +349,51 @@ class Qat2Int8ImageClassificationComparisonTest(unittest.TestCase):
             map(str, self._op_ids_to_skip)) if test_case_args.op_ids_to_skip
                                                                else 'none'))
 
-        _logger.info('--- FP32 prediction start ---')
+        _logger.info('--- Quant prediction start ---')
         val_reader = paddle.batch(
             self._reader_creator(data_path), batch_size=batch_size)
-        fp32_output, fp32_acc1, fp32_acc5, fp32_fps, fp32_lat = self._predict(
+        quant_output, quant_acc1, quant_acc5, quant_fps, quant_lat = self._predict(
             val_reader,
-            fp32_model_path,
+            quant_model_path,
             batch_size,
             batch_num,
             skip_batch_num,
-            transform_to_int8=False)
-        _logger.info('--- QAT INT8 prediction start ---')
+            target='quant')
+        self._print_performance('Quant', quant_fps, quant_lat)
+        self._print_accuracy('Quant', quant_acc1, quant_acc5)
+
+        _logger.info('--- INT8 prediction start ---')
         val_reader = paddle.batch(
             self._reader_creator(data_path), batch_size=batch_size)
         int8_output, int8_acc1, int8_acc5, int8_fps, int8_lat = self._predict(
             val_reader,
-            qat_model_path,
+            quant_model_path,
             batch_size,
             batch_num,
             skip_batch_num,
-            transform_to_int8=True)
-
-        self._summarize_performance(fp32_fps, fp32_lat, int8_fps, int8_lat)
-        self._compare_accuracy(fp32_acc1, fp32_acc5, int8_acc1, int8_acc5,
-                               acc_diff_threshold)
+            target='int8')
+        self._print_performance('INT8', int8_fps, int8_lat)
+        self._print_accuracy('INT8', int8_acc1, int8_acc5)
+
+        fp32_acc1 = fp32_acc5 = fp32_fps = fp32_lat = -1
+        if fp32_model_path:
+            _logger.info('--- FP32 prediction start ---')
+            val_reader = paddle.batch(
+                self._reader_creator(data_path), batch_size=batch_size)
+            fp32_output, fp32_acc1, fp32_acc5, fp32_fps, fp32_lat = self._predict(
+                val_reader,
+                fp32_model_path,
+                batch_size,
+                batch_num,
+                skip_batch_num,
+                target='fp32')
+            self._print_performance('FP32', fp32_fps, fp32_lat)
+            self._print_accuracy('FP32', fp32_acc1, fp32_acc5)
+
+        self._summarize_performance(int8_fps, int8_lat, fp32_fps, fp32_lat)
+        self._summarize_accuracy(quant_acc1, quant_acc5, int8_acc1, int8_acc5,
+                                 fp32_acc1, fp32_acc5)
+        self._compare_accuracy(acc_diff_threshold, quant_acc1, int8_acc1)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/contrib/slim/tests/qat2_int8_nlp_comparison.py b/python/paddle/fluid/contrib/slim/tests/quant2_int8_nlp_comparison.py
similarity index 75%
rename from python/paddle/fluid/contrib/slim/tests/qat2_int8_nlp_comparison.py
rename to python/paddle/fluid/contrib/slim/tests/quant2_int8_nlp_comparison.py
index d4971511cb0681a9e7c6a85ddae924c20f7830b4..640d500152dd519393516b994ec9ab25b1e2ff54 100644
--- a/python/paddle/fluid/contrib/slim/tests/qat2_int8_nlp_comparison.py
+++ b/python/paddle/fluid/contrib/slim/tests/quant2_int8_nlp_comparison.py
@@ -17,14 +17,12 @@ import os
 import sys
 import argparse
 import logging
-import struct
-import six
 import numpy as np
 import time
 import paddle
 import paddle.fluid as fluid
 from paddle.fluid.framework import IrGraph
-from paddle.fluid.contrib.slim.quantization import Qat2Int8MkldnnPass
+from paddle.fluid.contrib.slim.quantization import Quant2Int8MkldnnPass
 from paddle.fluid import core
 
 logging.basicConfig(format='%(asctime)s-%(levelname)s: %(message)s')
@@ -42,12 +40,12 @@ def parse_args():
         help='Number of the first minibatches to skip in performance statistics.'
     )
     parser.add_argument(
-        '--qat_model', type=str, default='', help='A path to a QAT model.')
+        '--quant_model', type=str, default='', help='A path to a Quant model.')
     parser.add_argument(
         '--fp32_model',
         type=str,
         default='',
-        help='A path to an FP32 model. If empty, the QAT model will be used for FP32 inference.'
+        help='A path to an FP32 model. If empty, the Quant model will be used for FP32 inference.'
     )
     parser.add_argument('--infer_data', type=str, default='', help='Data file.')
     parser.add_argument(
@@ -77,16 +75,16 @@ def parse_args():
     parser.add_argument(
         '--debug',
         action='store_true',
-        help='If used, the graph of QAT model is drawn.')
+        help='If used, the graph of Quant model is drawn.')
 
     test_args, args = parser.parse_known_args(namespace=unittest)
 
     return test_args, sys.argv[:1] + args
 
 
-class QatInt8NLPComparisonTest(unittest.TestCase):
+class QuantInt8NLPComparisonTest(unittest.TestCase):
     """
-    Test for accuracy comparison of QAT FP32 and INT8 NLP inference.
+    Test for accuracy comparison of Quant FP32 and INT8 NLP inference.
     """
 
     def _reader_creator(self, data_file=None, labels_file=None):
@@ -143,7 +141,8 @@ class QatInt8NLPComparisonTest(unittest.TestCase):
                  batch_size=1,
                  batch_num=1,
                  skip_batch_num=0,
-                 transform_to_int8=False):
+                 target='quant'):
+        assert target in ['quant', 'int8', 'fp32']
         place = fluid.CPUPlace()
         exe = fluid.Executor(place)
         inference_scope = fluid.executor.global_scope()
@@ -158,16 +157,20 @@ class QatInt8NLPComparisonTest(unittest.TestCase):
 
             graph = IrGraph(core.Graph(inference_program.desc), for_test=True)
             if (self._debug):
-                graph.draw('.', 'qat_orig', graph.all_op_nodes())
-            if (transform_to_int8):
-                transform_to_mkldnn_int8_pass = Qat2Int8MkldnnPass(
+                graph.draw('.', 'quant_orig', graph.all_op_nodes())
+            if (target != 'quant'):
+                quant_transform_pass = Quant2Int8MkldnnPass(
                     self._quantized_ops,
                     _op_ids_to_skip=self._op_ids_to_skip,
                     _scope=inference_scope,
                     _place=place,
                     _core=core,
                     _debug=self._debug)
-                graph = transform_to_mkldnn_int8_pass.apply(graph)
+                if (target == 'int8'):
+                    graph = quant_transform_pass.apply(graph)
+                else:  # target == fp32
+                    graph = quant_transform_pass.prepare_and_optimize_fp32(
+                        graph)
 
             inference_program = graph.to_program()
 
@@ -223,36 +226,45 @@ class QatInt8NLPComparisonTest(unittest.TestCase):
 
             return acc_avg, pps_avg, latency_avg
 
-    def _summarize_performance(self, fp32_pps, fp32_lat, int8_pps, int8_lat):
-        _logger.info('--- Performance summary ---')
-        _logger.info(
-            'FP32: avg predictions per sec: {0:.2f}, avg latency: {1:.4f} ms'.
-            format(fp32_pps, fp32_lat))
+    def _print_performance(self, title, pps, lat):
         _logger.info(
-            'INT8: avg predictions per sec: {0:.2f}, avg latency: {1:.4f} ms'.
-            format(int8_pps, int8_lat))
+            '{0}: avg predictions per sec: {1:.2f}, avg latency: {2:.4f} ms'.
+            format(title, pps, lat))
+
+    def _print_accuracy(self, title, acc):
+        _logger.info('{0}: avg accuracy: {1:.6f}'.format(title, acc))
+
+    def _summarize_performance(self, int8_pps, int8_lat, fp32_pps, fp32_lat):
+        _logger.info('--- Performance summary ---')
+        self._print_performance('INT8', int8_pps, int8_lat)
+        if fp32_lat >= 0:
+            self._print_performance('FP32', fp32_pps, fp32_lat)
 
-    def _compare_accuracy(self, fp32_acc, int8_acc, threshold):
+    def _summarize_accuracy(self, quant_acc, int8_acc, fp32_acc):
         _logger.info('--- Accuracy summary ---')
+        self._print_accuracy('Quant', quant_acc)
+        self._print_accuracy('INT8', int8_acc)
+        if fp32_acc >= 0:
+            self._print_accuracy('FP32', fp32_acc)
+
+    def _compare_accuracy(self, threshold, quant_acc, int8_acc):
         _logger.info(
-            'Accepted accuracy drop threshold: {0}. (condition: (FP32_acc - INT8_acc) <= threshold)'
+            'Accepted accuracy drop threshold: {0}. (condition: (Quant_acc - INT8_acc) <= threshold)'
             .format(threshold))
-        _logger.info('FP32: avg accuracy: {0:.6f}'.format(fp32_acc))
-        _logger.info('INT8: avg accuracy: {0:.6f}'.format(int8_acc))
         # Random outputs give accuracy about 0.33, we assume valid accuracy to be at least 0.5
-        assert fp32_acc > 0.5
+        assert quant_acc > 0.5
         assert int8_acc > 0.5
-        assert fp32_acc - int8_acc <= threshold
+        assert quant_acc - int8_acc <= threshold
 
     def test_graph_transformation(self):
         if not fluid.core.is_compiled_with_mkldnn():
             return
 
-        qat_model_path = test_case_args.qat_model
-        assert qat_model_path, 'The QAT model path cannot be empty. Please, use the --qat_model option.'
-        fp32_model_path = test_case_args.fp32_model if test_case_args.fp32_model else qat_model_path
+        quant_model_path = test_case_args.quant_model
+        assert quant_model_path, 'The Quant model path cannot be empty. Please, use the --quant_model option.'
         data_path = test_case_args.infer_data
         assert data_path, 'The dataset path cannot be empty. Please, use the --infer_data option.'
+        fp32_model_path = test_case_args.fp32_model
         labels_path = test_case_args.labels
         batch_size = test_case_args.batch_size
         batch_num = test_case_args.batch_num
@@ -270,9 +282,10 @@ class QatInt8NLPComparisonTest(unittest.TestCase):
             self._op_ids_to_skip = set(
                 map(int, test_case_args.op_ids_to_skip.split(',')))
 
-        _logger.info('FP32 & QAT INT8 prediction run.')
-        _logger.info('QAT model: {}'.format(qat_model_path))
-        _logger.info('FP32 model: {}'.format(fp32_model_path))
+        _logger.info('Quant & INT8 prediction run.')
+        _logger.info('Quant model: {}'.format(quant_model_path))
+        if fp32_model_path:
+            _logger.info('FP32 model: {}'.format(fp32_model_path))
         _logger.info('Dataset: {}'.format(data_path))
         _logger.info('Labels: {}'.format(labels_path))
         _logger.info('Batch size: {}'.format(batch_size))
@@ -284,31 +297,51 @@ class QatInt8NLPComparisonTest(unittest.TestCase):
             map(str, self._op_ids_to_skip)) if test_case_args.op_ids_to_skip
                                                                else 'none'))
 
-        _logger.info('--- FP32 prediction start ---')
+        _logger.info('--- Quant prediction start ---')
         val_reader = paddle.batch(
             self._reader_creator(data_path, labels_path), batch_size=batch_size)
-        fp32_acc, fp32_pps, fp32_lat = self._predict(
+        quant_acc, quant_pps, quant_lat = self._predict(
             val_reader,
-            fp32_model_path,
+            quant_model_path,
             batch_size,
             batch_num,
             skip_batch_num,
-            transform_to_int8=False)
-        _logger.info('FP32: avg accuracy: {0:.6f}'.format(fp32_acc))
-        _logger.info('--- QAT INT8 prediction start ---')
+            target='quant')
+        self._print_performance('Quant', quant_pps, quant_lat)
+        self._print_accuracy('Quant', quant_acc)
+
+        _logger.info('--- INT8 prediction start ---')
         val_reader = paddle.batch(
             self._reader_creator(data_path, labels_path), batch_size=batch_size)
         int8_acc, int8_pps, int8_lat = self._predict(
             val_reader,
-            qat_model_path,
+            quant_model_path,
             batch_size,
             batch_num,
             skip_batch_num,
-            transform_to_int8=True)
-        _logger.info('INT8: avg accuracy: {0:.6f}'.format(int8_acc))
+            target='int8')
+        self._print_performance('INT8', int8_pps, int8_lat)
+        self._print_accuracy('INT8', int8_acc)
+
+        fp32_acc = fp32_pps = fp32_lat = -1
+        if fp32_model_path:
+            _logger.info('--- FP32 prediction start ---')
+            val_reader = paddle.batch(
+                self._reader_creator(data_path, labels_path),
+                batch_size=batch_size)
+            fp32_acc, fp32_pps, fp32_lat = self._predict(
+                val_reader,
+                fp32_model_path,
+                batch_size,
+                batch_num,
+                skip_batch_num,
+                target='fp32')
+            self._print_performance('FP32', fp32_pps, fp32_lat)
+            self._print_accuracy('FP32', fp32_acc)
 
-        self._summarize_performance(fp32_pps, fp32_lat, int8_pps, int8_lat)
-        self._compare_accuracy(fp32_acc, int8_acc, acc_diff_threshold)
+        self._summarize_performance(int8_pps, int8_lat, fp32_pps, fp32_lat)
+        self._summarize_accuracy(quant_acc, int8_acc, fp32_acc)
+        self._compare_accuracy(acc_diff_threshold, quant_acc, int8_acc)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/contrib/slim/tests/qat_int8_image_classification_comparison.py b/python/paddle/fluid/contrib/slim/tests/quant_int8_image_classification_comparison.py
similarity index 92%
rename from python/paddle/fluid/contrib/slim/tests/qat_int8_image_classification_comparison.py
rename to python/paddle/fluid/contrib/slim/tests/quant_int8_image_classification_comparison.py
index 3d09f762912fd97665779948b3dfae5c60e3eac2..5f0a8f2d6fa9818481096249aaf74da27a852531 100644
--- a/python/paddle/fluid/contrib/slim/tests/qat_int8_image_classification_comparison.py
+++ b/python/paddle/fluid/contrib/slim/tests/quant_int8_image_classification_comparison.py
@@ -24,7 +24,7 @@ import time
 import paddle
 import paddle.fluid as fluid
 from paddle.fluid.framework import IrGraph
-from paddle.fluid.contrib.slim.quantization import QatInt8MkldnnPass
+from paddle.fluid.contrib.slim.quantization import QuantInt8MkldnnPass
 from paddle.fluid import core
 
 logging.basicConfig(format='%(asctime)s-%(levelname)s: %(message)s')
@@ -44,9 +44,9 @@ def parse_args():
     parser.add_argument(
         '--debug',
         action='store_true',
-        help='If used, the graph of QAT model is drawn.')
+        help='If used, the graph of Quant model is drawn.')
     parser.add_argument(
-        '--qat_model', type=str, default='', help='A path to a QAT model.')
+        '--quant_model', type=str, default='', help='A path to a Quant model.')
     parser.add_argument('--infer_data', type=str, default='', help='Data file.')
     parser.add_argument(
         '--batch_num',
@@ -64,9 +64,9 @@ def parse_args():
     return test_args, sys.argv[:1] + args
 
 
-class QatInt8ImageClassificationComparisonTest(unittest.TestCase):
+class QuantInt8ImageClassificationComparisonTest(unittest.TestCase):
     """
-    Test for accuracy comparison of QAT FP32 and INT8 Image Classification inference.
+    Test for accuracy comparison of Quant FP32 and INT8 Image Classification inference.
     """
 
     def _reader_creator(self, data_file='data.bin'):
@@ -169,9 +169,9 @@ class QatInt8ImageClassificationComparisonTest(unittest.TestCase):
 
             graph = IrGraph(core.Graph(inference_program.desc), for_test=True)
             if (self._debug):
-                graph.draw('.', 'qat_orig', graph.all_op_nodes())
+                graph.draw('.', 'quant_orig', graph.all_op_nodes())
             if (transform_to_int8):
-                mkldnn_int8_pass = QatInt8MkldnnPass(
+                mkldnn_int8_pass = QuantInt8MkldnnPass(
                     _scope=inference_scope, _place=place)
                 graph = mkldnn_int8_pass.apply(graph)
             else:
@@ -264,8 +264,8 @@ class QatInt8ImageClassificationComparisonTest(unittest.TestCase):
         if not fluid.core.is_compiled_with_mkldnn():
             return
 
-        qat_model_path = test_case_args.qat_model
-        assert qat_model_path, 'The QAT model path cannot be empty. Please, use the --qat_model option.'
+        quant_model_path = test_case_args.quant_model
+        assert quant_model_path, 'The Quant model path cannot be empty. Please, use the --quant_model option.'
         data_path = test_case_args.infer_data
         assert data_path, 'The dataset path cannot be empty. Please, use the --infer_data option.'
         batch_size = test_case_args.batch_size
@@ -274,29 +274,29 @@ class QatInt8ImageClassificationComparisonTest(unittest.TestCase):
         acc_diff_threshold = test_case_args.acc_diff_threshold
         self._debug = test_case_args.debug
 
-        _logger.info('QAT FP32 & INT8 prediction run.')
-        _logger.info('QAT model: {0}'.format(qat_model_path))
+        _logger.info('Quant FP32 & INT8 prediction run.')
+        _logger.info('Quant model: {0}'.format(quant_model_path))
         _logger.info('Dataset: {0}'.format(data_path))
         _logger.info('Batch size: {0}'.format(batch_size))
         _logger.info('Batch number: {0}'.format(batch_num))
         _logger.info('Accuracy drop threshold: {0}.'.format(acc_diff_threshold))
 
-        _logger.info('--- QAT FP32 prediction start ---')
+        _logger.info('--- Quant FP32 prediction start ---')
         val_reader = paddle.batch(
             self._reader_creator(data_path), batch_size=batch_size)
         fp32_output, fp32_acc1, fp32_acc5, fp32_fps, fp32_lat = self._predict(
             val_reader,
-            qat_model_path,
+            quant_model_path,
             batch_size,
             batch_num,
             skip_batch_num,
             transform_to_int8=False)
-        _logger.info('--- QAT INT8 prediction start ---')
+        _logger.info('--- Quant INT8 prediction start ---')
         val_reader = paddle.batch(
             self._reader_creator(data_path), batch_size=batch_size)
         int8_output, int8_acc1, int8_acc5, int8_fps, int8_lat = self._predict(
             val_reader,
-            qat_model_path,
+            quant_model_path,
             batch_size,
             batch_num,
             skip_batch_num,
diff --git a/python/paddle/fluid/contrib/slim/tests/save_qat_model.py b/python/paddle/fluid/contrib/slim/tests/save_quant_model.py
similarity index 72%
rename from python/paddle/fluid/contrib/slim/tests/save_qat_model.py
rename to python/paddle/fluid/contrib/slim/tests/save_quant_model.py
index 6f029eb9d9ab0f40a73f013bbe4692469c3b8611..dab4b63cda4cca8036b4236d44cb54660258c0d4 100644
--- a/python/paddle/fluid/contrib/slim/tests/save_qat_model.py
+++ b/python/paddle/fluid/contrib/slim/tests/save_quant_model.py
@@ -24,19 +24,17 @@ import time
 import paddle
 import paddle.fluid as fluid
 from paddle.fluid.framework import IrGraph
-from paddle.fluid.contrib.slim.quantization import Qat2Int8MkldnnPass
+from paddle.fluid.contrib.slim.quantization import Quant2Int8MkldnnPass
 from paddle.fluid import core
 
 
 def parse_args():
     parser = argparse.ArgumentParser()
     parser.add_argument(
-        '--qat_model_path', type=str, default='', help='A path to a QAT model.')
-    parser.add_argument(
-        '--fp32_model_save_path',
+        '--quant_model_path',
         type=str,
         default='',
-        help='Saved optimized fp32 model')
+        help='A path to a Quant model.')
     parser.add_argument(
         '--int8_model_save_path',
         type=str,
@@ -56,13 +54,13 @@ def parse_args():
     parser.add_argument(
         '--debug',
         action='store_true',
-        help='If used, the graph of QAT model is drawn.')
+        help='If used, the graph of Quant model is drawn.')
 
     test_args, args = parser.parse_known_args(namespace=unittest)
     return test_args, sys.argv[:1] + args
 
 
-def transform_and_save_model(original_path, save_path, save_type):
+def transform_and_save_int8_model(original_path, save_path):
     place = fluid.CPUPlace()
     exe = fluid.Executor(place)
     inference_scope = fluid.executor.global_scope()
@@ -85,34 +83,26 @@ def transform_and_save_model(original_path, save_path, save_type):
 
         graph = IrGraph(core.Graph(inference_program.desc), for_test=True)
         if (test_args.debug):
-            graph.draw('.', 'qat_orig', graph.all_op_nodes())
-        transform_to_mkldnn_int8_pass = Qat2Int8MkldnnPass(
+            graph.draw('.', 'quant_orig', graph.all_op_nodes())
+        transform_to_mkldnn_int8_pass = Quant2Int8MkldnnPass(
             ops_to_quantize,
             _op_ids_to_skip=op_ids_to_skip,
             _scope=inference_scope,
             _place=place,
             _core=core,
             _debug=test_args.debug)
-
-        graph = IrGraph(core.Graph(inference_program.desc), for_test=True)
-        if save_type == 'FP32':
-            graph = transform_to_mkldnn_int8_pass.apply_fp32(graph)
-        elif save_type == 'INT8':
-            graph = transform_to_mkldnn_int8_pass.apply(graph)
+        graph = transform_to_mkldnn_int8_pass.apply(graph)
         inference_program = graph.to_program()
         with fluid.scope_guard(inference_scope):
             fluid.io.save_inference_model(save_path, feed_target_names,
                                           fetch_targets, exe, inference_program)
-        print("Success! Transformed QAT_{0} model can be found at {1}\n".format(
-            save_type, save_path))
+        print(
+            "Success! INT8 model obtained from the Quant model can be found at {}\n"
+            .format(save_path))
 
 
 if __name__ == '__main__':
     global test_args
     test_args, remaining_args = parse_args()
-    if test_args.fp32_model_save_path:
-        transform_and_save_model(test_args.qat_model_path,
-                                 test_args.fp32_model_save_path, 'FP32')
-    if test_args.int8_model_save_path:
-        transform_and_save_model(test_args.qat_model_path,
-                                 test_args.int8_model_save_path, 'INT8')
+    transform_and_save_int8_model(test_args.quant_model_path,
+                                  test_args.int8_model_save_path)
diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py
new file mode 100644
index 0000000000000000000000000000000000000000..79b0bbd6a4dd3850f49aa0b5124e9be86d4e6ee3
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py
@@ -0,0 +1,425 @@
+#   copyright (c) 2018 paddlepaddle authors. all rights reserved.
+#
+# licensed under the apache license, version 2.0 (the "license");
+# you may not use this file except in compliance with the license.
+# you may obtain a copy of the license at
+#
+#     http://www.apache.org/licenses/license-2.0
+#
+# unless required by applicable law or agreed to in writing, software
+# distributed under the license is distributed on an "as is" basis,
+# without warranties or conditions of any kind, either express or implied.
+# see the license for the specific language governing permissions and
+# limitations under the license.
+
+from __future__ import print_function
+
+import os
+import numpy as np
+import random
+import unittest
+import logging
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import core
+from paddle.fluid.optimizer import AdamOptimizer
+from paddle.fluid.framework import IrGraph
+from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware
+from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass
+from paddle.fluid.dygraph.container import Sequential
+from paddle.fluid.dygraph.nn import Conv2D
+from paddle.fluid.dygraph.nn import Pool2D
+from paddle.fluid.dygraph.nn import Linear
+from paddle.fluid.log_helper import get_logger
+
+os.environ["CPU_NUM"] = "1"
+if core.is_compiled_with_cuda():
+    fluid.set_flags({"FLAGS_cudnn_deterministic": True})
+
+_logger = get_logger(
+    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
+
+
+def StaticLenet(data, num_classes=10, classifier_activation='softmax'):
+    conv2d_w1_attr = fluid.ParamAttr(name="conv2d_w_1")
+    conv2d_w2_attr = fluid.ParamAttr(name="conv2d_w_2")
+    fc_w1_attr = fluid.ParamAttr(name="fc_w_1")
+    fc_w2_attr = fluid.ParamAttr(name="fc_w_2")
+    fc_w3_attr = fluid.ParamAttr(name="fc_w_3")
+    conv2d_b1_attr = fluid.ParamAttr(name="conv2d_b_1")
+    conv2d_b2_attr = fluid.ParamAttr(name="conv2d_b_2")
+    fc_b1_attr = fluid.ParamAttr(name="fc_b_1")
+    fc_b2_attr = fluid.ParamAttr(name="fc_b_2")
+    fc_b3_attr = fluid.ParamAttr(name="fc_b_3")
+    conv1 = fluid.layers.conv2d(
+        data,
+        num_filters=6,
+        filter_size=3,
+        stride=1,
+        padding=1,
+        param_attr=conv2d_w1_attr,
+        bias_attr=conv2d_b1_attr)
+    pool1 = fluid.layers.pool2d(
+        conv1, pool_size=2, pool_type='max', pool_stride=2)
+    conv2 = fluid.layers.conv2d(
+        pool1,
+        num_filters=16,
+        filter_size=5,
+        stride=1,
+        padding=0,
+        param_attr=conv2d_w2_attr,
+        bias_attr=conv2d_b2_attr)
+    pool2 = fluid.layers.pool2d(
+        conv2, pool_size=2, pool_type='max', pool_stride=2)
+
+    fc1 = fluid.layers.fc(input=pool2,
+                          size=120,
+                          param_attr=fc_w1_attr,
+                          bias_attr=fc_b1_attr)
+    fc2 = fluid.layers.fc(input=fc1,
+                          size=84,
+                          param_attr=fc_w2_attr,
+                          bias_attr=fc_b2_attr)
+    fc3 = fluid.layers.fc(input=fc2,
+                          size=num_classes,
+                          act=classifier_activation,
+                          param_attr=fc_w3_attr,
+                          bias_attr=fc_b3_attr)
+
+    return fc3
+
+
+class ImperativeLenet(fluid.dygraph.Layer):
+    def __init__(self, num_classes=10, classifier_activation='softmax'):
+        super(ImperativeLenet, self).__init__()
+        conv2d_w1_attr = fluid.ParamAttr(name="conv2d_w_1")
+        conv2d_w2_attr = fluid.ParamAttr(name="conv2d_w_2")
+        fc_w1_attr = fluid.ParamAttr(name="fc_w_1")
+        fc_w2_attr = fluid.ParamAttr(name="fc_w_2")
+        fc_w3_attr = fluid.ParamAttr(name="fc_w_3")
+        conv2d_b1_attr = fluid.ParamAttr(name="conv2d_b_1")
+        conv2d_b2_attr = fluid.ParamAttr(name="conv2d_b_2")
+        fc_b1_attr = fluid.ParamAttr(name="fc_b_1")
+        fc_b2_attr = fluid.ParamAttr(name="fc_b_2")
+        fc_b3_attr = fluid.ParamAttr(name="fc_b_3")
+        self.features = Sequential(
+            Conv2D(
+                num_channels=1,
+                num_filters=6,
+                filter_size=3,
+                stride=1,
+                padding=1,
+                param_attr=conv2d_w1_attr,
+                bias_attr=conv2d_b1_attr),
+            Pool2D(
+                pool_size=2, pool_type='max', pool_stride=2),
+            Conv2D(
+                num_channels=6,
+                num_filters=16,
+                filter_size=5,
+                stride=1,
+                padding=0,
+                param_attr=conv2d_w2_attr,
+                bias_attr=conv2d_b2_attr),
+            Pool2D(
+                pool_size=2, pool_type='max', pool_stride=2))
+
+        self.fc = Sequential(
+            Linear(
+                input_dim=400,
+                output_dim=120,
+                param_attr=fc_w1_attr,
+                bias_attr=fc_b1_attr),
+            Linear(
+                input_dim=120,
+                output_dim=84,
+                param_attr=fc_w2_attr,
+                bias_attr=fc_b2_attr),
+            Linear(
+                input_dim=84,
+                output_dim=num_classes,
+                act=classifier_activation,
+                param_attr=fc_w3_attr,
+                bias_attr=fc_b3_attr))
+
+    def forward(self, inputs):
+        x = self.features(inputs)
+
+        x = fluid.layers.flatten(x, 1)
+        x = self.fc(x)
+        return x
+
+
+class TestImperativeQat(unittest.TestCase):
+    """
+    QAT = quantization-aware training
+    """
+
+    def test_qat_save(self):
+        imperative_qat = ImperativeQuantAware(
+            weight_quantize_type='abs_max',
+            activation_quantize_type='moving_average_abs_max')
+
+        with fluid.dygraph.guard():
+            lenet = ImperativeLenet()
+            imperative_qat.quantize(lenet)
+            adam = AdamOptimizer(
+                learning_rate=0.001, parameter_list=lenet.parameters())
+            train_reader = paddle.batch(
+                paddle.dataset.mnist.train(), batch_size=32, drop_last=True)
+            test_reader = paddle.batch(
+                paddle.dataset.mnist.test(), batch_size=32)
+
+            epoch_num = 1
+            for epoch in range(epoch_num):
+                lenet.train()
+                for batch_id, data in enumerate(train_reader()):
+                    x_data = np.array([x[0].reshape(1, 28, 28)
+                                       for x in data]).astype('float32')
+                    y_data = np.array(
+                        [x[1] for x in data]).astype('int64').reshape(-1, 1)
+
+                    img = fluid.dygraph.to_variable(x_data)
+                    label = fluid.dygraph.to_variable(y_data)
+
+                    out = lenet(img)
+                    acc = fluid.layers.accuracy(out, label)
+                    loss = fluid.layers.cross_entropy(out, label)
+                    avg_loss = fluid.layers.mean(loss)
+                    avg_loss.backward()
+                    adam.minimize(avg_loss)
+                    lenet.clear_gradients()
+                    if batch_id % 100 == 0:
+                        _logger.info(
+                            "Train | At epoch {} step {}: loss = {:}, acc= {:}".
+                            format(epoch, batch_id,
+                                   avg_loss.numpy(), acc.numpy()))
+
+                lenet.eval()
+                for batch_id, data in enumerate(test_reader()):
+                    x_data = np.array([x[0].reshape(1, 28, 28)
+                                       for x in data]).astype('float32')
+                    y_data = np.array(
+                        [x[1] for x in data]).astype('int64').reshape(-1, 1)
+
+                    img = fluid.dygraph.to_variable(x_data)
+                    label = fluid.dygraph.to_variable(y_data)
+
+                    out = lenet(img)
+                    acc_top1 = fluid.layers.accuracy(
+                        input=out, label=label, k=1)
+                    acc_top5 = fluid.layers.accuracy(
+                        input=out, label=label, k=5)
+
+                    if batch_id % 100 == 0:
+                        _logger.info(
+                            "Test | At epoch {} step {}: acc1 = {:}, acc5 = {:}".
+                            format(epoch, batch_id,
+                                   acc_top1.numpy(), acc_top5.numpy()))
+
+            # save weights
+            model_dict = lenet.state_dict()
+            fluid.save_dygraph(model_dict, "save_temp")
+
+            # test the correctness of `save_quantized_model`
+            data = next(test_reader())
+            test_data = np.array([x[0].reshape(1, 28, 28)
+                                  for x in data]).astype('float32')
+            test_img = fluid.dygraph.to_variable(test_data)
+            lenet.eval()
+            before_save = lenet(test_img)
+
+        # save inference quantized model
+        path = "./mnist_infer_model"
+        imperative_qat.save_quantized_model(
+            dirname=path,
+            model=lenet,
+            input_shape=[(1, 28, 28)],
+            input_dtype=['float32'],
+            feed=[0],
+            fetch=[0])
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+        else:
+            place = core.CPUPlace()
+        exe = fluid.Executor(place)
+        [inference_program, feed_target_names, fetch_targets] = (
+            fluid.io.load_inference_model(
+                dirname=path, executor=exe))
+        after_save, = exe.run(inference_program,
+                              feed={feed_target_names[0]: test_data},
+                              fetch_list=fetch_targets)
+
+        self.assertTrue(
+            np.allclose(after_save, before_save.numpy()),
+            msg='Failed to save the inference quantized model.')
+
+    def test_qat_acc(self):
+        def _build_static_lenet(main, startup, is_test=False, seed=1000):
+            with fluid.unique_name.guard():
+                with fluid.program_guard(main, startup):
+                    main.random_seed = seed
+                    startup.random_seed = seed
+                    img = fluid.layers.data(
+                        name='image', shape=[1, 28, 28], dtype='float32')
+                    label = fluid.layers.data(
+                        name='label', shape=[1], dtype='int64')
+                    prediction = StaticLenet(img)
+                    if not is_test:
+                        loss = fluid.layers.cross_entropy(
+                            input=prediction, label=label)
+                        avg_loss = fluid.layers.mean(loss)
+                    else:
+                        avg_loss = prediction
+            return img, label, avg_loss
+
+        reader = paddle.batch(
+            paddle.dataset.mnist.test(), batch_size=32, drop_last=True)
+        weight_quantize_type = 'abs_max'
+        activation_quant_type = 'moving_average_abs_max'
+        param_init_map = {}
+        seed = 1000
+        lr = 0.1
+
+        # imperative train
+        _logger.info(
+            "--------------------------dynamic graph qat--------------------------"
+        )
+        imperative_qat = ImperativeQuantAware(
+            weight_quantize_type=weight_quantize_type,
+            activation_quantize_type=activation_quant_type)
+
+        with fluid.dygraph.guard():
+            np.random.seed(seed)
+            fluid.default_main_program().random_seed = seed
+            fluid.default_startup_program().random_seed = seed
+            lenet = ImperativeLenet()
+            fixed_state = {}
+            for name, param in lenet.named_parameters():
+                p_shape = param.numpy().shape
+                p_value = param.numpy()
+                if name.endswith("bias"):
+                    value = np.zeros_like(p_value).astype('float32')
+                else:
+                    value = np.random.normal(
+                        loc=0.0, scale=0.01, size=np.product(p_shape)).reshape(
+                            p_shape).astype('float32')
+                fixed_state[name] = value
+                param_init_map[param.name] = value
+            lenet.set_dict(fixed_state)
+
+            imperative_qat.quantize(lenet)
+            adam = AdamOptimizer(
+                learning_rate=lr, parameter_list=lenet.parameters())
+            dynamic_loss_rec = []
+            lenet.train()
+            for batch_id, data in enumerate(reader()):
+                x_data = np.array([x[0].reshape(1, 28, 28)
+                                   for x in data]).astype('float32')
+                y_data = np.array(
+                    [x[1] for x in data]).astype('int64').reshape(-1, 1)
+
+                img = fluid.dygraph.to_variable(x_data)
+                label = fluid.dygraph.to_variable(y_data)
+
+                out = lenet(img)
+                loss = fluid.layers.cross_entropy(out, label)
+                avg_loss = fluid.layers.mean(loss)
+                avg_loss.backward()
+                adam.minimize(avg_loss)
+                lenet.clear_gradients()
+                dynamic_loss_rec.append(avg_loss.numpy()[0])
+                if batch_id % 100 == 0:
+                    _logger.info('{}: {}'.format('loss', avg_loss.numpy()))
+
+        imperative_qat.save_quantized_model(
+            dirname="./dynamic_mnist",
+            model=lenet,
+            input_shape=[(1, 28, 28)],
+            input_dtype=['float32'],
+            feed=[0],
+            fetch=[0])
+
+        # static graph train
+        _logger.info(
+            "--------------------------static graph qat--------------------------"
+        )
+        static_loss_rec = []
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+        else:
+            place = core.CPUPlace()
+        exe = fluid.Executor(place)
+
+        main = fluid.Program()
+        infer = fluid.Program()
+        startup = fluid.Program()
+        static_img, static_label, static_loss = _build_static_lenet(
+            main, startup, False, seed)
+        infer_img, _, infer_pre = _build_static_lenet(infer, startup, True,
+                                                      seed)
+        with fluid.unique_name.guard():
+            with fluid.program_guard(main, startup):
+                opt = AdamOptimizer(learning_rate=lr)
+                opt.minimize(static_loss)
+
+        scope = core.Scope()
+        with fluid.scope_guard(scope):
+            exe.run(startup)
+        for param in main.all_parameters():
+            param_tensor = scope.var(param.name).get_tensor()
+            param_tensor.set(param_init_map[param.name], place)
+
+        main_graph = IrGraph(core.Graph(main.desc), for_test=False)
+        infer_graph = IrGraph(core.Graph(infer.desc), for_test=True)
+        transform_pass = QuantizationTransformPass(
+            scope=scope,
+            place=place,
+            activation_quantize_type=activation_quant_type,
+            weight_quantize_type=weight_quantize_type,
+            quantizable_op_type=['conv2d', 'depthwise_conv2d', 'mul'])
+        transform_pass.apply(main_graph)
+        transform_pass.apply(infer_graph)
+        build_strategy = fluid.BuildStrategy()
+        build_strategy.fuse_all_reduce_ops = False
+        binary = fluid.CompiledProgram(main_graph.graph).with_data_parallel(
+            loss_name=static_loss.name, build_strategy=build_strategy)
+
+        feeder = fluid.DataFeeder(
+            feed_list=[static_img, static_label], place=place)
+        with fluid.scope_guard(scope):
+            for batch_id, data in enumerate(reader()):
+                loss_v, = exe.run(binary,
+                                  feed=feeder.feed(data),
+                                  fetch_list=[static_loss])
+                static_loss_rec.append(loss_v[0])
+                if batch_id % 100 == 0:
+                    _logger.info('{}: {}'.format('loss', loss_v))
+
+        save_program = infer_graph.to_program()
+        with fluid.scope_guard(scope):
+            fluid.io.save_inference_model("./static_mnist", [infer_img.name],
+                                          [infer_pre], exe, save_program)
+        rtol = 1e-05
+        atol = 1e-08
+        for i, (loss_d,
+                loss_s) in enumerate(zip(dynamic_loss_rec, static_loss_rec)):
+            diff = np.abs(loss_d - loss_s)
+            if diff > (atol + rtol * np.abs(loss_s)):
+                _logger.info(
+                    "diff({}) at {}, dynamic loss = {}, static loss = {}".
+                    format(diff, i, loss_d, loss_s))
+                break
+
+        self.assertTrue(
+            np.allclose(
+                np.array(dynamic_loss_rec),
+                np.array(static_loss_rec),
+                rtol=rtol,
+                atol=atol,
+                equal_nan=True),
+            msg='Failed to do the imperative qat.')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/contrib/slim/tests/test_qat2_int8_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quant2_int8_mkldnn_pass.py
similarity index 94%
rename from python/paddle/fluid/contrib/slim/tests/test_qat2_int8_mkldnn_pass.py
rename to python/paddle/fluid/contrib/slim/tests/test_quant2_int8_mkldnn_pass.py
index 16cbfdd99d3e7c4ee2612c2e19bb75ffadccec3f..fcbb1b66ad1fd73a152b9128fa75a152baecd223 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_qat2_int8_mkldnn_pass.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_quant2_int8_mkldnn_pass.py
@@ -17,10 +17,10 @@ import numpy as np
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 from paddle.fluid.framework import IrGraph
-from paddle.fluid.contrib.slim.quantization import Qat2Int8MkldnnPass
+from paddle.fluid.contrib.slim.quantization import Quant2Int8MkldnnPass
 
 
-class TestQat2Int8MkldnnPass(unittest.TestCase):
+class TestQuant2Int8MkldnnPass(unittest.TestCase):
     def setUp(self):
         self.scope = fluid.Scope()
         self.place = fluid.CPUPlace()
@@ -109,20 +109,20 @@ class TestQat2Int8MkldnnPass(unittest.TestCase):
             if op.op().has_attr("fuse_brelu") and op.op().attr("fuse_brelu"):
                 self.assertTrue(op.op().attr("fuse_activation") == "relu6")
 
-    def test_qat_update_activation(self):
+    def test_quant_update_activation(self):
         program = fluid.Program()
         with fluid.program_guard(program):
             self.prepare_program(program)
             graph = IrGraph(core.Graph(program.desc), for_test=True)
             graph = self.remove_fuse_activation_attribute(graph)
             self.check_graph_before_pass(graph)
-            qat2_int8_mkldnn_pass = Qat2Int8MkldnnPass(
+            quant2_int8_mkldnn_pass = Quant2Int8MkldnnPass(
                 self.quantized_ops,
                 _scope=self.scope,
                 _place=self.place,
                 _core=core,
                 _debug=False)
-            graph = qat2_int8_mkldnn_pass._update_activations(graph)
+            graph = quant2_int8_mkldnn_pass._update_activations(graph)
             self.check_graph_after_pass(graph)
 
 
diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantization_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quantization_mkldnn_pass.py
index eb75070e45c9d62830bc5c66a41f54afc5a0ff5d..3acbd8974195854da014990b13f3b1ba38e4c2c1 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_quantization_mkldnn_pass.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_quantization_mkldnn_pass.py
@@ -22,7 +22,7 @@ import paddle
 from paddle.fluid.framework import IrGraph
 from paddle.fluid.contrib.slim.quantization import QuantizationFreezePass
 from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass
-from paddle.fluid.contrib.slim.quantization import QatInt8MkldnnPass
+from paddle.fluid.contrib.slim.quantization import QuantInt8MkldnnPass
 from paddle.fluid import core
 
 os.environ["CPU_NUM"] = "1"
@@ -90,7 +90,7 @@ class TestMKLDNNTransformBasedFreezePass(unittest.TestCase):
                                   seed,
                                   activation_quant_type,
                                   weight_quant_type='abs_max',
-                                  qat_perf=False,
+                                  quant_perf=False,
                                   for_ci=False):
         random.seed(0)
         np.random.seed(0)
@@ -109,7 +109,7 @@ class TestMKLDNNTransformBasedFreezePass(unittest.TestCase):
         scope = fluid.Scope()
         with fluid.scope_guard(scope):
             exe.run(startup)
-        # Apply the QAT QuantizationTransformPass
+        # Apply the QuantizationTransformPass
         transform_pass = QuantizationTransformPass(
             scope=scope,
             place=place,
@@ -149,7 +149,7 @@ class TestMKLDNNTransformBasedFreezePass(unittest.TestCase):
         freeze_pass.apply(test_graph)
 
         # Transform quantized graph for MKL-DNN INT8 inference
-        mkldnn_int8_pass = QatInt8MkldnnPass(_scope=scope, _place=place)
+        mkldnn_int8_pass = QuantInt8MkldnnPass(_scope=scope, _place=place)
         mkldnn_int8_pass.apply(test_graph)
         dev_name = '_cpu_'
         if not for_ci:
@@ -169,7 +169,7 @@ class TestMKLDNNTransformBasedFreezePass(unittest.TestCase):
         self.assertFalse(self.isinteger(np.sum(conv_w_mkldnn)))
         self.assertFalse(self.isinteger(np.sum(mul_w_mkldnn)))
 
-        # Check if the conv2d output and mul output are correctly linked to fake_dequantize's 
+        # Check if the conv2d output and mul output are correctly linked to fake_dequantize's
         # output
         self.check_program(mkldnn_program)
         if not for_ci:
diff --git a/python/paddle/fluid/core.py b/python/paddle/fluid/core.py
index c3fbb7b51b5ad2b0f2701f325a0d81df0b0ede79..d87363abf14cdfc3e29567bd41dbac387b882f76 100644
--- a/python/paddle/fluid/core.py
+++ b/python/paddle/fluid/core.py
@@ -17,6 +17,8 @@ from __future__ import print_function
 import site
 import sys
 import os
+import warnings
+import platform
 
 core_suffix = 'so'
 if os.name == 'nt':
@@ -62,7 +64,6 @@ def avx_supported():
     """
     Whether current system(Linux, MacOS, Windows) is supported with AVX.
     """
-    import platform
     from .. import compat as cpt
     sysstr = platform.system().lower()
     has_avx = False
@@ -160,6 +161,76 @@ def avx_supported():
         return False
 
 
+def run_shell_command(cmd):
+    import subprocess
+    out, err = subprocess.Popen(
+        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
+        shell=True).communicate()
+    if err:
+        return None
+    else:
+        return out.decode('utf-8').strip()
+
+
+def get_dso_path(core_so, dso_name):
+    if core_so and dso_name:
+        return run_shell_command("ldd %s|grep %s|awk '{print $3}'" %
+                                 (core_so, dso_name))
+    else:
+        return None
+
+
+def load_dso(dso_absolute_path):
+    if dso_absolute_path:
+        try:
+            from ctypes import cdll
+            cdll.LoadLibrary(dso_absolute_path)
+        except:
+            warnings.warn("Load {} failed".format(dso_absolute_path))
+
+
+def pre_load(dso_name):
+    if has_avx_core:
+        core_so = current_path + os.sep + 'core_avx.' + core_suffix
+    elif has_noavx_core:
+        core_so = current_path + os.sep + 'core_noavx.' + core_suffix
+    else:
+        core_so = None
+    dso_path = get_dso_path(core_so, dso_name)
+    load_dso(dso_path)
+
+
+def get_glibc_ver():
+    return run_shell_command("ldd --version | awk '/ldd/{print $NF}'").strip()
+
+
+def less_than_ver(a, b):
+    import re
+    import operator
+
+    def to_list(s):
+        s = re.sub('(\.0+)+$', '', s)
+        return [int(x) for x in s.split('.')]
+
+    return operator.lt(to_list(a), to_list(b))
+
+
+# NOTE(zhiqiu): An error may occurs when import paddle in linux platform with glibc < 2.22, 
+# the error message of which is "dlopen: cannot load any more object with static TLS".
+# This happens when:
+# (1) the number of dynamic shared librarys (DSO) loaded > 14,
+# (2) after that, load a dynamic shared library (DSO) with static TLS.
+# For paddle, the problem is that 'libgomp' is a DSO with static TLS, and it is loaded after 14 DSOs.
+# So, here is a tricky way to solve the problem by pre load 'libgomp' before 'core_avx.so'.
+# The final solution is to upgrade glibc to > 2.22 on the target system.
+if platform.system().lower() == 'linux' and less_than_ver(get_glibc_ver(),
+                                                          '2.23'):
+    try:
+        pre_load('libgomp')
+    except Exception as e:
+        # NOTE(zhiqiu): do not abort if failed, since it may success when import core_avx.so
+        sys.stderr.write('Error: Can not preload libgomp.so')
+
 load_noavx = False
 
 if avx_supported():
diff --git a/python/paddle/fluid/dataloader/dataloader_iter.py b/python/paddle/fluid/dataloader/dataloader_iter.py
index ac6e05248f72d5a0499138586b25f6f35c4822af..214cd772af6b1fa6e24ec972c0f0644dc1c09f95 100644
--- a/python/paddle/fluid/dataloader/dataloader_iter.py
+++ b/python/paddle/fluid/dataloader/dataloader_iter.py
@@ -38,7 +38,27 @@ from ..multiprocess_utils import CleanupFuncRegistrar, _cleanup_mmap, _set_SIGCH
 MP_INDICES_CHECK_INTERVAL = 5
 
 
-def _default_collate_fn(batch):
+def default_collate_fn(batch):
+    """
+    Default batch collating function for :code:`fluid.io.DataLoader`,
+    batch should be a list of samples, and each sample should be a list
+    of fields as follows:
+    
+    [[filed1, filed2, ...], [filed1, filed2, ...], ...]
+    
+    This default collate function zipped each filed together and stack
+    each filed as the batch field as follows:
+
+    [batch_filed1, batch_filed2, ...]
+
+    Args:  
+        batch(list of list of numpy array): the batch data, each fields
+              should be a numpy array, each sample should be a list of
+              fileds, and batch should be a list of sample.
+    
+    Returns:
+        a list of numpy array: collated batch
+    """
     sample = batch[0]
     # dataset has only 1 field
     if isinstance(sample, np.ndarray):
@@ -82,7 +102,7 @@ class _DataLoaderIterBase(object):
         self._return_list = loader.return_list
         self._batch_sampler = loader.batch_sampler
         self._sampler_iter = iter(loader.batch_sampler)
-        self._collate_fn = loader.collate_fn or _default_collate_fn
+        self._collate_fn = loader.collate_fn or default_collate_fn
         self._num_workers = loader.num_workers
         self._use_buffer_reader = loader.use_buffer_reader
         self._use_shared_memory = loader.use_shared_memory
@@ -128,8 +148,10 @@ class _DataLoaderIterSingleProcess(_DataLoaderIterBase):
         self._need_check_feed = [
             v.desc.need_check_feed() for v in self._feed_list
         ]
+        # if only 1 place, do not need to keep order
         self._blocking_queue = core.init_lod_tensor_blocking_queue(
-            core.Variable(), self._blocking_queue_capacity, True)
+            core.Variable(), self._blocking_queue_capacity,
+            len(self._places) > 1)
         self._reader = core.create_py_reader(
             self._blocking_queue, self._var_names, self._shapes, self._dtypes,
             self._need_check_feed, self._places, self._use_buffer_reader, True)
@@ -280,8 +302,9 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase):
         self._need_check_feed = [
             v.desc.need_check_feed() for v in self._feed_list
         ]
+        # if only 1 place, do not need to keep order
         self._blocking_queue = core.init_lod_tensor_blocking_queue(
-            core.Variable(), self._outstanding_capacity, True)
+            core.Variable(), self._outstanding_capacity, len(self._places) > 1)
         self._reader = core.create_py_reader(
             self._blocking_queue, self._var_names, self._shapes, self._dtypes,
             self._need_check_feed, self._places, self._use_buffer_reader, True)
@@ -442,6 +465,11 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase):
                 #    get data again
                 data = self._data_queue.get(timeout=self._timeout)
             except Exception as e:
+                # check if thread done event set when waiting data
+                if self._thread_done_event.is_set():
+                    continue
+
+                # check failed workers
                 failed_workers = []
                 for i, w in enumerate(self._workers):
                     if self._worker_status[i] and not w.is_alive():
diff --git a/python/paddle/fluid/device_worker.py b/python/paddle/fluid/device_worker.py
index ebc304eab7d6a8b59a81fa2cc4244fb81bf3b1a4..72e0351ec36c028593eb8f099a4e39aa314aac37 100644
--- a/python/paddle/fluid/device_worker.py
+++ b/python/paddle/fluid/device_worker.py
@@ -403,11 +403,8 @@ class Section(DeviceWorker):
         trainer_desc.device_worker_name = "SectionWorker"
         pipeline_opt = self._program._pipeline_opt
         section_param = trainer_desc.section_param
-        section_param.queue_size = pipeline_opt["queue_size"]
-        section_param.sync_steps = pipeline_opt["sync_steps"]
+        section_param.num_microbatches = pipeline_opt["num_microbatches"]
         section_param.start_cpu_core_id = pipeline_opt["start_cpu_core_id"]
-        for e in pipeline_opt["param_need_sync"]:
-            section_param.param_need_sync.append(e)
         for i, program in enumerate(pipeline_opt["section_program_list"]):
             cfg = section_param.section_config.add()
             cfg.program_desc.ParseFromString(program["program"]._get_desc()
@@ -415,6 +412,7 @@ class Section(DeviceWorker):
             # TODO: why does not work
             # cfg.program_desc.CopyFrom(program.program._get_desc())
             place = pipeline_opt["place_list"][i]
+            place_id = pipeline_opt["place_id_list"][i]
             if isinstance(place, core.CPUPlace):
                 cfg.place = cfg.CPUPlace
             elif isinstance(place, core.CUDAPlace):
@@ -425,12 +423,7 @@ class Section(DeviceWorker):
                 raise NotImplementedError(
                     "SectionWorker only supports CPUPlace, CUDAPlace and CUDAPinnedPlace now."
                 )
-
-            cfg.concurrency = pipeline_opt["concurrency_list"][i]
-            for var in program["input_set"]:
-                cfg.section_in_var_names.append(var)
-            for var in program["output_set"]:
-                cfg.section_out_var_names.append(var)
+            cfg.place_id = place_id
 
 
 class DeviceWorkerFactory(object):
diff --git a/python/paddle/fluid/dygraph/__init__.py b/python/paddle/fluid/dygraph/__init__.py
index 22de864dd696100cd7859e33ad935cd6bb10b9f5..f990d02342be78fe998cebfa40ed8b348cf54b2a 100644
--- a/python/paddle/fluid/dygraph/__init__.py
+++ b/python/paddle/fluid/dygraph/__init__.py
@@ -44,6 +44,9 @@ from .backward_strategy import *
 from . import jit
 from .jit import *
 
+from . import io
+from .io import *
+
 from . import static_runner
 from .static_runner import StaticModelRunner
 
@@ -63,5 +66,6 @@ __all__ += checkpoint.__all__
 __all__ += learning_rate_scheduler.__all__
 __all__ += backward_strategy.__all__
 __all__ += jit.__all__
+__all__ += io.__all__
 __all__ += rnn.__all__
 __all__ += ['ProgramTranslator']
diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py
index c04f4b7b59e340320449746a7f3a58fb27df1747..7d972cbbd09b95e5d7476837cb3f3318526deed8 100644
--- a/python/paddle/fluid/dygraph/base.py
+++ b/python/paddle/fluid/dygraph/base.py
@@ -23,6 +23,7 @@ from paddle.fluid.multiprocess_utils import CleanupFuncRegistrar
 from .tracer import Tracer
 import logging
 import objgraph
+from ..data_feeder import convert_dtype
 
 __all__ = [
     'no_grad',
@@ -64,17 +65,26 @@ _functional_dygraph_context_manager = None
 
 @signature_safe_contextmanager
 def param_guard(parameters):
-    # Note: parameters is a reference of self._parameters
+    # Note: parameters is a reference of self._parameters or self._buffers
     if not framework.in_dygraph_mode() and parameters:
         origin_parameters = parameters.copy()
         for name, var_base in parameters.items():
             if isinstance(var_base, core.VarBase):
-                new_var = framework.Parameter(
-                    var_base.block,
-                    var_base.shape,
-                    var_base.dtype,
-                    var_base.type,
-                    name=var_base.name)
+                # Convert ParamBase into Parameter with same attributes in dy2stat.
+                if isinstance(var_base, framework.ParamBase):
+                    new_var = var_base._to_static_var(to_parameter=True)
+                else:
+                    # Check whether has been created before.
+                    if var_base.name in var_base.block.vars:
+                        new_var = var_base.block.vars[var_base.name]
+                    # Note(Aurelius84): Convert VarBase in self._buffers into Variabe with
+                    # same attributes and set persistable=True to allow saving this var.
+                    # Because users can create a VarBase in `__init__`  like a
+                    # `mask` Tensor or `hidden_0` in RNN layers, which is equivalent to a Parameter
+                    # and necessary for inferring. It will be pruned if it's not necessary for inferring.
+                    else:
+                        new_var = var_base._to_static_var(
+                            to_parameter=False, persistable=True)
                 parameters[name] = new_var
         yield
         parameters.update(origin_parameters)
@@ -530,28 +540,34 @@ def grad(outputs,
 
 
 @framework.dygraph_only
-def to_variable(value, name=None, zero_copy=None):
+def to_variable(value, name=None, zero_copy=None, dtype=None):
     """
     :api_attr: imperative
 
     The API will create a ``Variable`` or ``ComplexVariable`` object from 
-    numpy\.ndarray, Variable or ComplexVariable object.
+    tuple, list, numpy\.ndarray, Variable or ComplexVariable object.
 
     Parameters:
-        value(ndarray|Variable|Tensor|ComplexVariable): The numpy\.ndarray, Variable 
-            Tensor or ComplexVariable object that needs to be converted, it can be 
-            multi-dimension, and the data type is one of numpy\.{float16, 
-            float32, float64, int16, int32, int64, uint8, uint16, complex64, 
-            complex128}.
+        value(tuple|list|ndarray|Variable|Tensor|ComplexVariable): Initial data. 
+            Can be a list, tuple, NumPy ndarray, Variable, Tensor, ComplexVariable. 
+            The shape can be multi-dimensional. The data type is one of 
+            numpy\.{float16, float32, float64, int16, int32, int64, 
+            uint8, uint16, complex64, complex128}.
         name(str, optional): The default value is None. Normally there is no 
             need for user to set this property. For more information, please 
             refer to :ref:`api_guide_Name` .
         zero_copy(bool, optional): Whether to share memory with the input numpy 
             array. This parameter only works with CPUPlace and will be set to 
             True when it is None. Default: None.
+        dtype(str, optional): The desired data type of returned ``Variable`` .
+            Can be 'bool' , 'float16' , 'float32' , 'float64' , 'int8' , 'int16' , 
+            'int32' , 'int64' , 'uint8' . Default: None.
 
     Returns:
-        Variable or ComplexVariable: If ``value`` is a numpy\.ndarray object, return ``Tensor`` created from the specified numpy\.ndarray object, which has same data type and shape with ``value``. If ``value`` is a Variable or ComplexVariable object, just return ``value``.
+        Variable or ComplexVariable: If ``value`` is a tuple/list/numpy\.ndarray object, 
+            return ``Tensor`` created from the corresponding numpy\.ndarray object, which has 
+            same data type and shape with ``value``. If ``value`` is a Variable or ComplexVariable 
+            object, just return ``value``.
 
 
     Examples:
@@ -573,17 +589,41 @@ def to_variable(value, name=None, zero_copy=None):
             z = fluid.dygraph.to_variable(c)
             z.numpy() # array([2.+1.j, 2.+0.j])
             z.dtype # 'complex128'
+
+            y = fluid.dygraph.to_variable([[0.1, 1.2], [2.2, 3.1], [4.9, 5.2]])
+            y.shape     # [3L, 2L]
+
+            y = fluid.dygraph.to_variable(((0.1, 1.2), (2.2, 3.1), (4.9, 5.2)), dtype='int32')
+            y.shape     # [3L, 2L]
+
     """
-    if isinstance(value, np.ndarray):
-        assert framework.in_dygraph_mode(
-        ), "to_variable could only be called in dygraph mode"
+    support_type = (list, tuple, np.ndarray, core.VarBase, framework.Variable,
+                    framework.ComplexVariable, core.Tensor, core.LoDTensor)
+    if not isinstance(value, support_type):
+        raise TypeError(
+            "The type of 'value' in fluid.dygraph.to_variable must be %s, but received %s."
+            % (support_type, type(value)))
+    if isinstance(value, (core.VarBase, framework.Variable,
+                          framework.ComplexVariable)):
+        return value
+    elif isinstance(value, (core.Tensor, core.LoDTensor)):
+        return core.VarBase(value)
+    else:
         if isinstance(framework._current_expected_place(),
                       framework.core.CPUPlace):
             if zero_copy is None:
                 zero_copy = True
         else:
             assert not zero_copy, "zero_copy mode can only be used with CPUPlace"
-            zero_copy = False
+
+        if not isinstance(value, np.ndarray):
+            value = np.array(value)
+
+        if dtype is not None:
+            dtype = convert_dtype(dtype)
+            if value.dtype != dtype:
+                value = value.astype(dtype)
+
         if np.iscomplexobj(value):
             if not name:
                 name = framework.unique_name.generate('_generated_var')
@@ -608,12 +648,3 @@ def to_variable(value, name=None, zero_copy=None):
                 zero_copy=zero_copy,
                 name=name if name else '')
             return py_var
-    elif isinstance(value, (core.VarBase, framework.Variable,
-                            framework.ComplexVariable)):
-        return value
-    elif isinstance(value, (core.Tensor, core.LoDTensor)):
-        return core.VarBase(value)
-    else:
-        raise TypeError(
-            "The type of input value is invalid, expected type is 'ndarray', "
-            "'Variable' or 'ComplexVariable', but received %s." % type(value))
diff --git a/python/paddle/fluid/dygraph/checkpoint.py b/python/paddle/fluid/dygraph/checkpoint.py
index 7bc10a97768a4f9451b4ed7e01df1c82cd796b84..e020507af418baf21066a099357f253237e25e79 100644
--- a/python/paddle/fluid/dygraph/checkpoint.py
+++ b/python/paddle/fluid/dygraph/checkpoint.py
@@ -80,9 +80,9 @@ def save_dygraph(state_dict, model_path):
     for k, v in state_dict.items():
         if isinstance(v, (Variable, core.VarBase)):
             model_dict[k] = v.numpy()
+            name_table[k] = v.name
         else:
             model_dict[k] = v
-        name_table[k] = v.name
     model_dict["StructuredToParameterName@@"] = name_table
 
     file_name = model_path + suffix
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/assert_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/assert_transformer.py
index 61ff82f5be860061274d1e87f688a5f2315a966c..73dba66d3fca4f64bbe7e5ec763e541d664b9804 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/assert_transformer.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/assert_transformer.py
@@ -17,12 +17,12 @@ from __future__ import print_function
 import gast
 
 from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper
-from paddle.fluid.dygraph.dygraph_to_static.static_analysis import StaticAnalysisVisitor
+from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code
 
 
 class AssertTransformer(gast.NodeTransformer):
     """
-    A class transforms python assert to fluid.layers.Assert.
+    A class transforms python assert to convert_assert.
     """
 
     def __init__(self, wrapper_root):
@@ -32,21 +32,15 @@ class AssertTransformer(gast.NodeTransformer):
         self.wrapper_root = wrapper_root
         self.root = wrapper_root.node
 
-        self.static_analysis_visitor = StaticAnalysisVisitor(self.root)
-
     def transform(self):
         self.visit(self.root)
 
     def visit_Assert(self, node):
-        if not self.static_analysis_visitor.is_tensor_node(node.test):
-            return node
-        cast_node = gast.Call(
-            func=gast.parse("fluid.layers.cast").body[0].value,
-            args=[node.test, gast.Constant(
-                value="bool", kind=None)],
-            keywords=[])
-        assert_node = gast.Call(
-            func=gast.parse("fluid.layers.Assert").body[0].value,
-            args=[cast_node],
-            keywords=[])
-        return gast.Expr(value=assert_node)
+        convert_assert_node = gast.parse(
+            'fluid.dygraph.dygraph_to_static.convert_operators.convert_assert({test}, {msg})'.
+            format(
+                test=ast_to_source_code(node.test),
+                msg=ast_to_source_code(node.msg)
+                if node.msg else "")).body[0].value
+
+        return gast.Expr(value=convert_assert_node)
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py
index 0815f61432f189bb8153ce692e68c3194206b55e..f859d40050c73d276bf9940d904c656debd35c82 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py
@@ -21,14 +21,16 @@ from __future__ import print_function
 import gast
 
 from paddle.fluid.dygraph.dygraph_to_static.assert_transformer import AssertTransformer
-from paddle.fluid.dygraph.dygraph_to_static.call_transformer import CallTransformer
 from paddle.fluid.dygraph.dygraph_to_static.basic_api_transformer import BasicApiTransformer
 from paddle.fluid.dygraph.dygraph_to_static.break_continue_transformer import BreakContinueTransformer
+from paddle.fluid.dygraph.dygraph_to_static.call_transformer import CallTransformer
+from paddle.fluid.dygraph.dygraph_to_static.cast_transformer import CastTransformer
 from paddle.fluid.dygraph.dygraph_to_static.ifelse_transformer import IfElseTransformer
 from paddle.fluid.dygraph.dygraph_to_static.list_transformer import ListTransformer
 from paddle.fluid.dygraph.dygraph_to_static.logical_transformer import LogicalTransformer
 from paddle.fluid.dygraph.dygraph_to_static.loop_transformer import LoopTransformer
 from paddle.fluid.dygraph.dygraph_to_static.print_transformer import PrintTransformer
+from paddle.fluid.dygraph.dygraph_to_static.return_transformer import ReturnTransformer
 from paddle.fluid.dygraph.dygraph_to_static.tensor_shape_transformer import TensorShapeTransformer
 
 from paddle.fluid.dygraph.dygraph_to_static.static_analysis import StaticAnalysisVisitor
@@ -71,6 +73,9 @@ class DygraphToStaticAst(gast.NodeTransformer):
         # Transform break/continue in loops
         BreakContinueTransformer(node_wrapper).transform()
 
+        # Transform return in functions
+        ReturnTransformer(node_wrapper).transform()
+
         # Transform logical and/or/not
         LogicalTransformer(node_wrapper).transform()
 
@@ -89,6 +94,9 @@ class DygraphToStaticAst(gast.NodeTransformer):
         # Transform call recursively
         CallTransformer(node_wrapper).transform()
 
+        # Transform python type casting statement
+        CastTransformer(node_wrapper).transform()
+
     def visit_FunctionDef(self, node):
         if self.decorate_func_name is None:
             self.decorate_func_name = node.name
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/break_continue_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/break_continue_transformer.py
index 9b25ff07ec4c06f8b61ea31d5f824df7108df922..c78f6e8f403196fc098914c4cc58c8a16a4d885c 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/break_continue_transformer.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/break_continue_transformer.py
@@ -49,7 +49,7 @@ class ForToWhileTransformer(gast.NodeTransformer):
                 new_stmts = self.get_for_stmt_nodes(body_list[i])
                 body_list[i:i + 1] = new_stmts
                 i += len(new_stmts)
-                return
+                return new_stmts
         if hasattr(self.parent_node, 'orelse'):
             body_list = self.parent_node.orelse
             i = index_in_list(body_list, self.loop_node)
@@ -57,7 +57,7 @@ class ForToWhileTransformer(gast.NodeTransformer):
                 new_stmts = self.get_for_stmt_nodes(body_list[i])
                 body_list[i:i + 1] = new_stmts
                 i += len(new_stmts)
-                return
+                return new_stmts
         raise ValueError(
             "parent_node doesn't contain the loop_node in ForToWhileTransformer")
 
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/cast_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/cast_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..71cb999eab0eb27935f876ad953a948881da267f
--- /dev/null
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/cast_transformer.py
@@ -0,0 +1,47 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import gast
+
+from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper
+from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code
+
+
+class CastTransformer(gast.NodeTransformer):
+    """
+    This class transforms type casting into Static Graph Ast.
+    """
+
+    def __init__(self, wrapper_root):
+        assert isinstance(
+            wrapper_root, AstNodeWrapper
+        ), "Input non-AstNodeWrapper node for the initialization of CastTransformer."
+        self._root = wrapper_root.node
+        self._castable_type = {'bool', 'int', 'float'}
+
+    def transform(self):
+        self.visit(self._root)
+
+    def visit_Call(self, node):
+        self.generic_visit(node)
+        func_str = ast_to_source_code(node.func).strip()
+        if func_str in self._castable_type and len(node.args) > 0:
+            args_str = ast_to_source_code(node.args[0]).strip()
+            new_func_str = "fluid.dygraph.dygraph_to_static.convert_operators.convert_var_dtype({}, '{}')".format(
+                args_str, func_str)
+            new_node = gast.parse(new_func_str).body[0].value
+            return new_node
+
+        return node
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py b/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py
index c05173a28e25832ef2741c5268ce8a39d07c753e..1291be60c692950a68e8f4db636dc0a3e9cb5876 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py
@@ -15,7 +15,7 @@
 from paddle.fluid.data_feeder import convert_dtype
 from paddle.fluid.dygraph.dygraph_to_static.variable_trans_func import to_static_variable
 from paddle.fluid.framework import Variable, core
-from paddle.fluid.layers import cast, control_flow, logical_and, logical_not, logical_or, nn
+from paddle.fluid.layers import Assert, cast, control_flow, logical_and, logical_not, logical_or, nn
 
 
 def convert_while_loop(cond, body, loop_vars):
@@ -238,3 +238,36 @@ def cast_bool_if_necessary(var):
     if convert_dtype(var.dtype) not in ['bool']:
         var = cast(var, dtype="bool")
     return var
+
+
+def convert_var_dtype(var, dtype):
+    if isinstance(var, Variable):
+        src_dtype = convert_dtype(var.dtype)
+        assert src_dtype in [
+            'bool', 'float16', 'float32', 'float64', 'int32', 'int64', 'uint8'
+        ], "The dtype of var {} is {}, which is not supported in the cast op.".format(
+            var.name, src_dtype)
+        assert dtype in [
+            'bool', 'int', 'float'
+        ], "The casted target dtype is {}, which is not supported in type casting.".format(
+            dtype)
+        cast_map = {
+            'bool': 'bool',
+            'int': 'int32',
+            'float': 'float32',
+        }
+        return cast(var, dtype=cast_map[dtype])
+    else:
+        return eval('{}(var)'.format(dtype))
+
+
+def convert_assert(cond, message=""):
+    """
+    A function representation of a Python ``assert`` statement.
+    """
+    if isinstance(cond, Variable):
+        cond = cast(cond, "bool")
+        # NOTE: message is not used because Paddle Assert has no corresponding parameter to use.
+        return Assert(cond)
+    else:
+        assert cond, message
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py
index 32c36bc38161cec059ad635f137541f8a31bd2f7..0a9e66a5bb0b1b7f15928891f8eefcbc67ebffb5 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py
@@ -19,9 +19,10 @@ import logging
 from paddle.fluid import log_helper
 from paddle.fluid import framework, backward, core
 from paddle.fluid.dygraph import layers
+from paddle.fluid.dygraph.base import switch_to_static_graph
+from paddle.fluid.dygraph.dygraph_to_static.return_transformer import RETURN_NO_VALUE_MAGIC_NUM
 from paddle.fluid.layers.utils import flatten
 from paddle.fluid.layers.utils import pack_sequence_as
-from paddle.fluid.dygraph.base import switch_to_static_graph
 import paddle.compat as cpt
 
 _logger = log_helper.get_logger(
@@ -111,14 +112,7 @@ class PartialProgramLayer(layers.Layer):
         self._outputs = NestSequence(outputs, need_check=True)
         self._params = parameters if parameters is not None else []
 
-        # Check all params from main program can be found in self._params:
-        # 1. parameter in self._params should be type `framework.ParamBase` which are created in dygraph.
-        # 2. parameter from transformed program shall be found in self._params.
-        #    Because they share same data with ParamBase of original dygraph.
-        self._check_params_all_inited(main_program)
-        self._prune_unused_params(main_program)
-
-        self._infer_program = main_program
+        self._infer_program = self._verify_program(main_program)
         self._train_program = self._append_backward_desc()
         # Switch infer or train by train() and eval()
         self._trace_program = None
@@ -127,6 +121,20 @@ class PartialProgramLayer(layers.Layer):
         # Set default mode to train
         self.train()
 
+    def _verify_program(self, main_program):
+        """
+        Verify that the program parameter is initialized, prune some unused params,
+        and remove redundant op callstack.
+        """
+        # 1. Check all params from main program can be found in self._params
+        self._check_params_all_inited(main_program)
+        # 2. Prune the parameters not used anywhere in the program.
+        self._prune_unused_params(main_program)
+        # 3. Remove op's python call stack with redundant low-level error messages.
+        main_program = self._remove_op_call_stack(main_program)
+
+        return main_program
+
     @switch_to_static_graph
     def _append_backward_desc(self):
         program = self._infer_program.clone()
@@ -184,7 +192,8 @@ class PartialProgramLayer(layers.Layer):
                 'is_test': not self.training
             })
 
-        return self._restore_out(out_vars)
+        restored_nest_out = self._restore_out(out_vars)
+        return self._remove_no_value(restored_nest_out)
 
     def _prepare(self, inputs):
         """
@@ -239,11 +248,44 @@ class PartialProgramLayer(layers.Layer):
         for i, idx in enumerate(self._outputs.var_ids):
             flatten_outputs[idx] = out_vars[i]
         outs = self._outputs.restore(flatten_outputs)
-        if len(outs) == 1:
+        if outs is not None and len(outs) == 1:
             outs = outs[0]
 
         return outs
 
+    def _is_no_value(self, var):
+        if isinstance(var, core.VarBase):
+            if var.shape == [1] and var.numpy()[0] == RETURN_NO_VALUE_MAGIC_NUM:
+                return True
+        return False
+
+    def _remove_no_value(self, out_vars):
+        """
+        Removes invalid value for various-length return statement
+        """
+        if isinstance(out_vars, core.VarBase):
+            if self._is_no_value(out_vars):
+                return None
+            return out_vars
+        elif isinstance(out_vars, (tuple, list)):
+            if isinstance(out_vars, tuple):
+                res = tuple(
+                    var for var in out_vars if not self._is_no_value(var))
+            else:
+                # isinstance(out_vars, list)
+                res = [var for var in out_vars if not self._is_no_value(var)]
+
+            has_removed = (len(out_vars) > len(res))
+            # len(out_vars) > len(res) means we have removed var. This is
+            # preventing out_vars is empty or just one element at the beginning
+            if len(res) == 0 and has_removed:
+                return None
+            elif len(res) == 1 and has_removed:
+                return res[0]
+            return res
+
+        return out_vars
+
     def _set_grad_type(self, params):
         # NOTE: if user set sparse gradient mode, the param's gradient
         # will be SelectedRows, not LoDTensor. But tracer will just
@@ -260,6 +302,19 @@ class PartialProgramLayer(layers.Layer):
                 continue
             param._set_grad_type(grad_var.type())
 
+    def _remove_op_call_stack(self, main_program):
+        """
+        Remove op's python call stack with redundant low-level error messages related to
+        transforamtions to avoid confusing users.
+        """
+        assert isinstance(main_program, framework.Program)
+        for block in main_program.blocks:
+            for op in block.ops:
+                if op.has_attr("op_callstack"):
+                    op._remove_attr("op_callstack")
+
+        return main_program
+
     def _check_params_all_inited(self, main_program):
         """
         Check all params from main program are already initialized, see details as follows:
@@ -272,18 +327,19 @@ class PartialProgramLayer(layers.Layer):
                 "Type of self._params in PartialProgramLayer should be list or tuple, but received %s."
                 % type(self._params))
 
-        params_name_set = set()
-        for i, param in enumerate(self._params):
-            if not isinstance(param, framework.ParamBase):
+        param_and_buffer_names_set = set()
+        for i, var in enumerate(self._params):
+            # self._params constains parameters and buffers with persistable=True.
+            if not isinstance(var, core.VarBase):
                 raise TypeError(
-                    'Type of self._params[{}] in PartialProgramLayer should be framework.ParamBase, but received {}.'.
-                    format(i, type(param)))
-            params_name_set.add(param.name)
+                    'Type of self._params[{}] in PartialProgramLayer should be Parameter or Variable, but received {}.'.
+                    format(i, type(var)))
+            param_and_buffer_names_set.add(var.name)
 
         for block in main_program.blocks:
             for name, var in block.vars.items():
                 if isinstance(var, framework.Parameter):
-                    if name not in params_name_set:
+                    if name not in param_and_buffer_names_set:
                         raise ValueError(
                             "\n\tWe don't support to define layer with parameters in the function "
                             "decorated by `@declarative`.\n\tBecause that will re-defined parameters "
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py
index 463a968e56afa08c5d8159fbb35f6222c862e1ff..9701ebd7b4fccf21afa3af161a99b63fbe8f847b 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py
@@ -15,7 +15,7 @@
 from __future__ import print_function
 import gast
 import inspect
-import logging
+import warnings
 import textwrap
 import threading
 import collections
@@ -36,11 +36,10 @@ from paddle.fluid.wrapped_decorator import signature_safe_contextmanager
 from paddle.fluid.dygraph.base import param_guard
 from paddle.fluid.data_feeder import check_type
 from paddle.fluid.dygraph.dygraph_to_static.partial_program import partial_program_from
+from paddle.fluid.annotations import deprecated
 
 __all__ = ['ProgramTranslator', 'convert_to_static']
 
-logger = logging.getLogger("fluid")
-
 
 class FunctionCache(object):
     """
@@ -127,20 +126,44 @@ class FunctionSpec(object):
         self._args = args
         self._kwargs = kwargs
 
+        dyfunc = getattr(func, '__wrapped__', func)
+        self._dyfunc_code = inspect.getsource(dyfunc)
+
     def is_method(self):
         return self._args and isinstance(self._args[0], layers.Layer)
 
     def parameters(self, include_sublayer=True):
+        """
+        Returns parameters of decorated layers. If set `include_sublayer` True,
+        the parameters created in sub layers will be added.
+        """
         params = collections.OrderedDict()
         if self.is_method():
+            layer_instance = self._args[0]
             if include_sublayer:
-                params = self._args[0].parameters()
+                params = layer_instance.parameters()
                 names = [p.name for p in params]
                 params = collections.OrderedDict(zip(names, params))
             else:
-                params = self._args[0]._parameters
+                params = layer_instance._parameters
         return params
 
+    def buffers(self, include_sublayer=True):
+        """
+        Returns Variable buffers of decorated layers. If set `include_sublayer` True,
+        the Variable buffers created in sub layers will be added.
+        """
+        buffers = collections.OrderedDict()
+        if self.is_method():
+            layer_instance = self._args[0]
+            if include_sublayer:
+                buffers = layer_instance.buffers()
+                names = [buffer.name for buffer in buffers]
+                buffers = collections.OrderedDict(zip(names, buffers))
+            else:
+                buffers = layer_instance._buffers
+        return buffers
+
     @switch_to_static_graph
     def to_static_inputs(self, main_program):
         inputs = []
@@ -179,7 +202,9 @@ class FunctionSpec(object):
         # Note: if dygraph function is a method of class,
         # consider instance info as hash key.
         if self.is_method():
-            return self._dyfunc, self._args[0]
+            # NOTE: we can use Layer's (instance + function code) as hash key.
+            # An instance will not hold two identical methods 
+            return self._dyfunc_code, self._args[0]
         else:
             return self._dyfunc
 
@@ -251,19 +276,22 @@ class ConcreteProgram(object):
                 # 1. Adds `fluid.data` layers for input if needed
                 inputs = func_spec.to_static_inputs(main_program)
 
-                # 2. Gets all ParamBases in the function
-                all_parameters = list(func_spec.parameters().values())
+                # 2. Gets all ParamBases and buffered VarBases in the function
+                all_parameters_and_buffers = list(func_spec.parameters().values(
+                )) + list(func_spec.buffers().values())
 
                 # 3. Builds program only once and returns the output Variables.
-                with param_guard(func_spec.parameters(False)):
+                with param_guard(func_spec.parameters(False)), param_guard(
+                        func_spec.buffers(False)):
                     outputs = static_func(*inputs)
-                if not isinstance(outputs, (tuple, list)):
-                    outputs = [outputs] if outputs else []
+                if not isinstance(outputs,
+                                  (tuple, list)) and outputs is not None:
+                    outputs = [outputs]
 
         return ConcreteProgram(
             inputs=inputs,
             outputs=outputs,
-            parameters=all_parameters,
+            parameters=all_parameters_and_buffers,
             func=dygraph_function,
             main_program=main_program,
             startup_program=startup_program)
@@ -290,6 +318,17 @@ class ProgramCache(object):
             self._caches[item] = self._build_once(item)
         return self._caches[item]
 
+    def get_program(self, item):
+        if not isinstance(item, FunctionSpec):
+            raise ValueError(
+                "Input item's type should be FunctionSpec, but received %s" %
+                type(item))
+        if item not in self._caches:
+            raise RuntimeError(
+                "Failed to find program for input item, please decorate input function by `@declarative`."
+            )
+        return self._caches[item]
+
     def last(self):
         assert len(
             self._caches) >= 1, "No valid cached program in ProgramCache."
@@ -439,7 +478,7 @@ class ProgramTranslator(object):
             dygraph_func
         ), "Input dygraph_func is not a callable in ProgramTranslator.get_output"
         if not self.enable_declarative:
-            logger.info(
+            warnings.warn(
                 "The ProgramTranslator.get_output doesn't work when setting ProgramTranslator.enable = False. "
                 "We will just return dygraph output.")
             return dygraph_func(*args, **kwargs)
@@ -490,7 +529,7 @@ class ProgramTranslator(object):
             dygraph_func
         ), "Input dygraph_func is not a callable in ProgramTranslator.get_func"
         if not self.enable_declarative:
-            logger.info(
+            warnings.warn(
                 "The ProgramTranslator.get_func doesn't work when setting ProgramTranslator.enable=False. We will "
                 "just return dygraph output.")
             return dygraph_func
@@ -543,7 +582,7 @@ class ProgramTranslator(object):
             dygraph_func
         ), "Input dygraph_func is not a callable in ProgramTranslator.get_program"
         if not self.enable_declarative:
-            logger.info(
+            warnings.warn(
                 "The ProgramTranslator.get_program doesn't work when setting ProgramTranslator.enable=False."
                 "We will just return dygraph output.")
             return dygraph_func(*args, **kwargs)
@@ -611,6 +650,7 @@ class ProgramTranslator(object):
         source_code = ast_to_source_code(root_wrapper.node)
         return source_code
 
+    @deprecated(since='2.0', instead="paddle.imperative.jit.save")
     @switch_to_static_graph
     def save_inference_model(self, dirname, feed=None, fetch=None):
         """
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/return_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/return_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef03e63dbbbb6253e6a4a337f5d1375476165a38
--- /dev/null
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/return_transformer.py
@@ -0,0 +1,430 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import gast
+
+from paddle.fluid import unique_name
+from paddle.fluid.dygraph.dygraph_to_static.utils import index_in_list
+from paddle.fluid.dygraph.dygraph_to_static.break_continue_transformer import ForToWhileTransformer
+from paddle.fluid.dygraph.dygraph_to_static.variable_trans_func import create_fill_constant_node
+
+__all__ = [
+    'RETURN_NO_VALUE_MAGIC_NUM', 'RETURN_NO_VALUE_VAR_NAME', 'ReturnTransformer'
+]
+
+# Constant for the name of the variable which stores the boolean state that we
+# should return
+RETURN_PREFIX = '__return'
+
+# Constant for the name of the variable which stores the final return value
+RETURN_VALUE_PREFIX = '__return_value'
+
+# Constant for the name of variables to initialize the __return_value
+RETURN_VALUE_INIT_NAME = '__return_value_init'
+
+# Constant magic number representing returning no value. This constant amis to
+# support returning various lengths of variables. Static graph must have fixed
+# size of fetched output while dygraph can have flexible lengths of output, to
+# solve it in dy2stat, we put float64 value with this magic number at Static
+# graph as a place holder to indicate the returning placeholder means no value
+# should return.
+RETURN_NO_VALUE_MAGIC_NUM = 1.77113e+279
+RETURN_NO_VALUE_VAR_NAME = "__no_value_return_var"
+
+
+def get_return_size(return_node):
+    assert isinstance(return_node, gast.Return), "Input is not gast.Return node"
+    return_length = 0
+    if return_node.value is not None:
+        if isinstance(return_node.value, gast.Tuple):
+            return_length = len(return_node.value.elts)
+        else:
+            return_length = 1
+    return return_length
+
+
+class ReplaceReturnNoneTransformer(gast.NodeTransformer):
+    """
+    Replace 'return None' to  'return' because 'None' cannot be a valid input
+    in control flow. In ReturnTransformer single 'Return' will be appended no
+    value placeholder
+    """
+
+    def __init__(self, root_node):
+        self.root = root_node
+
+    def transform(self):
+        self.visit(self.root)
+
+    def visit_Return(self, node):
+        if isinstance(node.value, gast.Name) and node.value.id == 'None':
+            node.value = None
+            return node
+        if isinstance(node.value, gast.Constant) and node.value.value == None:
+            node.value = None
+            return node
+        return node
+
+
+class ReturnAnalysisVisitor(gast.NodeVisitor):
+    """
+    Visits gast Tree and analyze the information about 'return'.
+    """
+
+    def __init__(self, root_node):
+        self.root = root_node
+
+        # A list to store where the current function is.
+        self.function_def = []
+
+        # Mapping from gast.FunctionDef node to the number of return statements
+        # Python allows define function inside function so we have to handle it
+        self.count_return = {}
+
+        # Mapping from gast.FunctionDef node to the maximum number of variables
+        # returned by the function's return statement
+        self.max_return_length = {}
+        self.visit(self.root)
+
+    def visit_FunctionDef(self, node):
+        self.function_def.append(node)
+        self.count_return[node] = 0
+        self.max_return_length[node] = 0
+        self.generic_visit(node)
+        self.function_def.pop()
+        return node
+
+    def visit_Return(self, node):
+        assert len(
+            self.function_def) > 0, "Found 'return' statement out of function."
+        cur_func = self.function_def[-1]
+        if cur_func in self.count_return:
+            self.count_return[cur_func] += 1
+        else:
+            self.count_return[cur_func] = 1
+
+        return_length = get_return_size(node)
+        if cur_func in self.max_return_length:
+            self.max_return_length[cur_func] = max(
+                self.max_return_length[cur_func], return_length)
+        else:
+            self.max_return_length[cur_func] = return_length
+
+        self.generic_visit(node)
+
+    def get_func_return_count(self, func_node):
+        return self.count_return[func_node]
+
+    def get_func_max_return_length(self, func_node):
+        return self.max_return_length[func_node]
+
+
+class ReturnTransformer(gast.NodeTransformer):
+    """
+    Transforms return statements into equivalent python statements containing
+    only one return statement at last. The basics idea is using a return value
+    variable to store the early return statements and boolean states with
+    if-else to skip the statements after the return.
+
+    """
+
+    def __init__(self, wrapper_root):
+        self.wrapper_root = wrapper_root
+        self.root = wrapper_root.node
+
+        pre_transformer = ReplaceReturnNoneTransformer(self.root)
+        pre_transformer.transform()
+
+        self.ancestor_nodes = []
+        # The name of the variable which stores the final return value
+        # Mapping from FunctionDef node to string
+        self.return_value_name = {}
+        # The names of the variable which stores the boolean state that skip
+        # statments. Mapping from FunctionDef node to list
+        self.return_name = {}
+        # The names of the variable which is placeholder to handle various-
+        # length return. Mapping from FunctionDef node to list
+        self.return_no_value_name = {}
+        # A list of FunctionDef to store where the current function is.
+        self.function_def = []
+
+        self.pre_analysis = None
+
+    def transform(self):
+        self.visit(self.root)
+
+    def generic_visit(self, node):
+        # Because we change ancestor nodes during visit_Return, not current
+        # node, original generic_visit of NodeTransformer will visit node
+        # which may be deleted. To prevent that node being added into
+        # transformed AST, We self-write a generic_visit and visit
+        for field, value in gast.iter_fields(node):
+            if isinstance(value, list):
+                for item in value:
+                    if isinstance(item, gast.AST):
+                        self.visit(item)
+            elif isinstance(value, gast.AST):
+                self.visit(value)
+
+    def visit(self, node):
+        """
+        Self-defined visit for appending ancestor
+        """
+        self.ancestor_nodes.append(node)
+        method = 'visit_' + node.__class__.__name__
+        visitor = getattr(self, method, self.generic_visit)
+        ret = visitor(node)
+        self.ancestor_nodes.pop()
+        return ret
+
+    def visit_FunctionDef(self, node):
+        self.function_def.append(node)
+        self.return_value_name[node] = None
+        self.return_name[node] = []
+        self.return_no_value_name[node] = []
+
+        self.pre_analysis = ReturnAnalysisVisitor(node)
+        max_return_length = self.pre_analysis.get_func_max_return_length(node)
+        while self.pre_analysis.get_func_return_count(node) > 1:
+            self.generic_visit(node)
+            self.pre_analysis = ReturnAnalysisVisitor(node)
+
+        if max_return_length == 0:
+            self.function_def.pop()
+            return node
+
+        # Prepend initialization of final return and append final return statement
+        value_name = self.return_value_name[node]
+        if value_name is not None:
+            node.body.append(
+                gast.Return(value=gast.Name(
+                    id=value_name,
+                    ctx=gast.Load(),
+                    annotation=None,
+                    type_comment=None)))
+            init_names = [
+                unique_name.generate(RETURN_VALUE_INIT_NAME)
+                for i in range(max_return_length)
+            ]
+            assign_zero_nodes = [
+                create_fill_constant_node(iname, 0.0) for iname in init_names
+            ]
+            if len(init_names) == 1:
+                return_value_nodes = gast.Name(
+                    id=init_names[0],
+                    ctx=gast.Load(),
+                    annotation=None,
+                    type_comment=None)
+            else:
+                # We need to initialize return value as a tuple because control
+                # flow requires some inputs or outputs have same structure
+                return_value_nodes = gast.Tuple(
+                    elts=[
+                        gast.Name(
+                            id=iname,
+                            ctx=gast.Load(),
+                            annotation=None,
+                            type_comment=None) for iname in init_names
+                    ],
+                    ctx=gast.Load())
+            assign_return_value_node = gast.Assign(
+                targets=[
+                    gast.Name(
+                        id=value_name,
+                        ctx=gast.Store(),
+                        annotation=None,
+                        type_comment=None)
+                ],
+                value=return_value_nodes)
+            node.body.insert(0, assign_return_value_node)
+            node.body[:0] = assign_zero_nodes
+        # Prepend control flow boolean nodes such as '__return@1 = False'
+        for name in self.return_name[node]:
+            assign_false_node = create_fill_constant_node(name, False)
+            node.body.insert(0, assign_false_node)
+        # Prepend no value placeholders
+        for name in self.return_no_value_name[node]:
+            assign_no_value_node = create_fill_constant_node(
+                name, RETURN_NO_VALUE_MAGIC_NUM)
+            node.body.insert(0, assign_no_value_node)
+
+        self.function_def.pop()
+        return node
+
+    def visit_Return(self, node):
+        cur_func_node = self.function_def[-1]
+        return_name = unique_name.generate(RETURN_PREFIX)
+        self.return_name[cur_func_node].append(return_name)
+        max_return_length = self.pre_analysis.get_func_max_return_length(
+            cur_func_node)
+        for ancestor_index in reversed(range(len(self.ancestor_nodes) - 1)):
+            ancestor = self.ancestor_nodes[ancestor_index]
+            cur_node = self.ancestor_nodes[ancestor_index + 1]
+            if hasattr(ancestor,
+                       "body") and index_in_list(ancestor.body, cur_node) != -1:
+                if cur_node == node:
+                    self._replace_return_in_stmt_list(
+                        ancestor.body, cur_node, return_name, max_return_length)
+                self._replace_after_node_to_if_in_stmt_list(
+                    ancestor.body, cur_node, return_name)
+            elif hasattr(ancestor, "orelse") and index_in_list(ancestor.orelse,
+                                                               cur_node) != -1:
+                if cur_node == node:
+                    self._replace_return_in_stmt_list(ancestor.orelse, cur_node,
+                                                      return_name,
+                                                      max_return_length)
+                self._replace_after_node_to_if_in_stmt_list(
+                    ancestor.orelse, cur_node, return_name)
+
+            if isinstance(ancestor, gast.While):
+                cond_var_node = gast.UnaryOp(
+                    op=gast.Not(),
+                    operand=gast.Name(
+                        id=return_name,
+                        ctx=gast.Load(),
+                        annotation=None,
+                        type_comment=None))
+                ancestor.test = gast.BoolOp(
+                    op=gast.And(), values=[ancestor.test, cond_var_node])
+                continue
+
+            if isinstance(ancestor, gast.For):
+                cond_var_node = gast.UnaryOp(
+                    op=gast.Not(),
+                    operand=gast.Name(
+                        id=return_name,
+                        ctx=gast.Load(),
+                        annotation=None,
+                        type_comment=None))
+                parent_node = self.ancestor_nodes[ancestor_index - 1]
+                for_to_while = ForToWhileTransformer(parent_node, ancestor,
+                                                     cond_var_node)
+                new_stmts = for_to_while.transform()
+                while_node = new_stmts[-1]
+                self.ancestor_nodes[ancestor_index] = while_node
+
+            if ancestor == cur_func_node:
+                break
+        # return_node is replaced so we shouldn't return here
+
+    def _replace_return_in_stmt_list(self, stmt_list, return_node, return_name,
+                                     max_return_length):
+        assert max_return_length >= 0, "Input illegal max_return_length"
+        i = index_in_list(stmt_list, return_node)
+        if i == -1:
+            return False
+        assign_nodes = [create_fill_constant_node(return_name, True)]
+        cur_func_node = self.function_def[-1]
+        return_length = get_return_size(return_node)
+        if return_length < max_return_length:
+            # In this case we should append RETURN_NO_VALUE placeholder
+            #
+            # max_return_length must be >= 1 here because return_length will be
+            # 0 at least.
+            if self.return_value_name[cur_func_node] is None:
+                self.return_value_name[cur_func_node] = unique_name.generate(
+                    RETURN_VALUE_PREFIX)
+
+            no_value_names = [
+                unique_name.generate(RETURN_NO_VALUE_VAR_NAME)
+                for j in range(max_return_length - return_length)
+            ]
+            self.return_no_value_name[cur_func_node].extend(no_value_names)
+
+            # Handle tuple/non-tuple case
+            if max_return_length == 1:
+                assign_nodes.append(
+                    gast.Assign(
+                        targets=[
+                            gast.Name(
+                                id=self.return_value_name[cur_func_node],
+                                ctx=gast.Store(),
+                                annotation=None,
+                                type_comment=None)
+                        ],
+                        value=gast.Name(
+                            id=no_value_names[0],
+                            ctx=gast.Load(),
+                            annotation=None,
+                            type_comment=None)))
+            else:
+                # max_return_length > 1 which means we should assign tuple
+                fill_tuple = [
+                    gast.Name(
+                        id=n,
+                        ctx=gast.Load(),
+                        annotation=None,
+                        type_comment=None) for n in no_value_names
+                ]
+                if return_node.value is not None:
+                    if isinstance(return_node.value, gast.Tuple):
+                        fill_tuple[:0] = return_node.value.elts
+                    else:
+                        fill_tuple.insert(0, return_node.value)
+
+                assign_nodes.append(
+                    gast.Assign(
+                        targets=[
+                            gast.Name(
+                                id=self.return_value_name[cur_func_node],
+                                ctx=gast.Store(),
+                                annotation=None,
+                                type_comment=None)
+                        ],
+                        value=gast.Tuple(
+                            elts=fill_tuple, ctx=gast.Load())))
+        else:
+            # In this case we should NOT append RETURN_NO_VALUE placeholder
+            if return_node.value is not None:
+                cur_func_node = self.function_def[-1]
+                if self.return_value_name[cur_func_node] is None:
+                    self.return_value_name[
+                        cur_func_node] = unique_name.generate(
+                            RETURN_VALUE_PREFIX)
+
+                assign_nodes.append(
+                    gast.Assign(
+                        targets=[
+                            gast.Name(
+                                id=self.return_value_name[cur_func_node],
+                                ctx=gast.Store(),
+                                annotation=None,
+                                type_comment=None)
+                        ],
+                        value=return_node.value))
+
+        stmt_list[i:] = assign_nodes
+        return True
+
+    def _replace_after_node_to_if_in_stmt_list(self, stmt_list, node,
+                                               return_name):
+        i = index_in_list(stmt_list, node)
+        if i < 0 or i >= len(stmt_list):
+            return False
+        if i == len(stmt_list) - 1:
+            # No need to add, we consider this as added successfully
+            return True
+        if_stmt = gast.If(test=gast.UnaryOp(
+            op=gast.Not(),
+            operand=gast.Name(
+                id=return_name,
+                ctx=gast.Store(),
+                annotation=None,
+                type_comment=None)),
+                          body=stmt_list[i + 1:],
+                          orelse=[])
+        stmt_list[i + 1:] = [if_stmt]
+        return True
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py
index 4b489c7d2847dc49df86645f69c61573a4adfbcb..bb5b2843c92e2b1ed88b002bb1511c07ddd61f37 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py
@@ -369,6 +369,8 @@ def ast_to_func(ast_root, dyfunc, delete_on_exit=True):
     function, the other inner functions are invisible for the decorated function.
     """
     source = ast_to_source_code(ast_root)
+    import_fluid = "import paddle.fluid as fluid\n"
+    source = import_fluid + source
     if six.PY2:
         source = source.encode('utf-8')
         f = tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False)
diff --git a/python/paddle/fluid/dygraph/io.py b/python/paddle/fluid/dygraph/io.py
new file mode 100644
index 0000000000000000000000000000000000000000..38e4e517836ed8ddbeb36fb68a0c34fa9826f233
--- /dev/null
+++ b/python/paddle/fluid/dygraph/io.py
@@ -0,0 +1,772 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import os
+import six
+import pickle
+import numpy as np
+
+from paddle import compat as cpt
+from paddle.fluid import core
+from paddle.fluid import framework
+from paddle.fluid import backward
+from paddle.fluid.dygraph import layers
+from paddle.fluid.layers import nn
+from paddle.fluid.dygraph.base import switch_to_static_graph
+
+__all__ = ['TranslatedLayer']
+
+VARIABLE_FILENAME = "__variables__"
+EXTRA_VAR_INFO_FILENAME = "__variables.info__"
+
+
+def _load_program_desc(model_file_path):
+    # 1. parse program desc
+    with open(model_file_path, "rb") as f:
+        program_desc_str = f.read()
+
+    program_desc = core.ProgramDesc(program_desc_str)
+    if not core._is_program_version_supported(program_desc._version()):
+        raise ValueError("Unsupported program version: %d\n" %
+                         program_desc._version())
+
+    return program_desc
+
+
+def _is_persistable(var_desc):
+    if var_desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
+            var_desc.type() == core.VarDesc.VarType.FETCH_LIST or \
+            var_desc.type() == core.VarDesc.VarType.READER or \
+            var_desc.type() == core.VarDesc.VarType.RAW:
+        return False
+    return var_desc.persistable()
+
+
+def _is_parameter(persistable_var_desc, program_desc):
+    # 1. firstly, param should be input of op
+    input_ops = []  # op can be repeated
+    for block_idx in six.moves.range(program_desc.num_blocks()):
+        block = program_desc.block(block_idx)
+        for op_idx in six.moves.range(block.op_size()):
+            op = block.op(op_idx)
+            # NOTE: parameter is the input of a certain op
+            if persistable_var_desc.name() in op.input_arg_names():
+                input_ops.append(op)
+    # 2. secondly, param should not be output of op or be same op's output
+    for block_idx in six.moves.range(program_desc.num_blocks()):
+        block = program_desc.block(block_idx)
+        for op_idx in six.moves.range(block.op_size()):
+            op = block.op(op_idx)
+            if persistable_var_desc.name() in op.output_arg_names():
+                # such as batch_norm_op
+                if op in input_ops:
+                    continue
+                else:
+                    return False
+    return True
+
+
+def _get_persistable_vars(program_desc):
+    persistable_vars = []
+    for i in six.moves.range(program_desc.num_blocks()):
+        block = program_desc.block(i)
+        persistable_vars.extend(list(filter(_is_persistable, block.all_vars())))
+    return persistable_vars
+
+
+def _get_persistable_var_names(program_desc):
+    """
+    Get all persistable variable names in ProgramDesc.
+    """
+    var_names = []
+    persistable_vars = _get_persistable_vars(program_desc)
+    for var in persistable_vars:
+        var_names.append(var.name())
+    return var_names
+
+
+def _get_all_var_names(program_desc):
+    all_var_names = set()
+    for i in six.moves.range(program_desc.num_blocks()):
+        block = program_desc.block(i)
+        for var in block.all_vars():
+            all_var_names.add(var.name())
+    return all_var_names
+
+
+def _append_loaded_suffix(name):
+    """
+    Append loaded suffix to the given variable name
+    e.g. x ==> x@LOADED
+    """
+    suffix = core.loaded_var_suffix()
+    name = cpt.to_text(name)
+    if suffix not in name:
+        name = name + suffix
+    return name
+
+
+def _remove_loaded_suffix(name):
+    """
+    Remove loaded suffix to the given variable name
+    e.g. x@LOADED ==> x
+    """
+    suffix = core.loaded_var_suffix()
+    name = cpt.to_text(name)
+    return name.replace(suffix, '')
+
+
+def _append_loaded_suffix_to_var(program_desc):
+    persistable_vars = _get_persistable_vars(program_desc)
+    for var_desc in persistable_vars:
+        old_name = var_desc.name()
+        new_name = _append_loaded_suffix(var_desc.name())
+        var_desc.set_name(new_name)
+        for block_idx in six.moves.range(program_desc.num_blocks()):
+            block = program_desc.block(block_idx)
+            for op_idx in six.moves.range(block.op_size()):
+                op = block.op(op_idx)
+                op._rename_input(old_name, new_name)
+                op._rename_output(old_name, new_name)
+
+
+@switch_to_static_graph
+def _build_program_by_desc(program_desc):
+    prog = framework.Program()
+    prog.desc = program_desc
+    prog.blocks = [
+        framework.Block(prog, i)
+        for i in six.moves.range(prog.desc.num_blocks())
+    ]
+    prog._sync_with_cpp()
+    return prog
+
+
+def _change_is_test_status(program_desc, is_test):
+    # change all `is_test` attributes
+    for i in six.moves.range(program_desc.num_blocks()):
+        block = program_desc.block(i)
+        for j in six.moves.range(block.op_size()):
+            op = block.op(j)
+            if op.has_attr('is_test'):
+                op._set_attr('is_test', is_test)
+
+
+class _ProgramHolder(object):
+    """
+    Holds the execution information of a Program.
+
+    _ProgramHolder is the execution unit of TranslatedLayer, 
+    if TranslatedLayer contains multiple _ProgramHolder, 
+    it can execute multiple methods
+
+    _ProgramHolder is an internal concept.
+    """
+
+    def __init__(self, program_desc):
+        super(_ProgramHolder, self).__init__()
+
+        # input, output, persistable var info
+        self._input_names = []
+        self._persistable_names = []
+        self._output_descs = []
+
+        # execution scope
+        self._inner_scope = core.Scope()
+
+        # forward program
+        self._infer_program_desc = self._preprocess(program_desc)
+        # forward + backward program
+        self._train_program_desc = self._append_backward_desc(
+            self._infer_program_desc)
+
+    @property
+    def infer_program(self):
+        return self._infer_program_desc
+
+    @property
+    def train_program(self):
+        return self._train_program_desc
+
+    @property
+    def input_names(self):
+        return self._input_names
+
+    @property
+    def output_decs(self):
+        return self._output_descs
+
+    @property
+    def persistable_names(self):
+        return self._persistable_names
+
+    @property
+    def scope(self):
+        return self._inner_scope
+
+    def _preprocess(self, program_desc):
+        # 1. Prune original program
+        # remove feed, fetch and scale-1 op, remove op_callstack attr
+        ops_to_remove = []
+        root_block = program_desc.block(0)
+        for i in six.moves.range(root_block.op_size()):
+            op = root_block.op(i)
+            if op.type() == 'feed':
+                ops_to_remove.append(i)
+                feed_var_name = cpt.to_bytes(op.input('X')[0])
+                root_block._remove_var(feed_var_name)
+                self._input_names.append(cpt.to_bytes(op.output('Out')[0]))
+            elif op.type() == 'scale' and op.output('Out')[0].startswith(
+                    'save_infer_model/scale_'):
+                ops_to_remove.append(i)
+                out_var_name = cpt.to_bytes(op.output('Out')[0])
+                root_block._remove_var(out_var_name)
+                self._output_descs.append(
+                    root_block.find_var(cpt.to_bytes(op.input('X')[0])))
+            elif op.type() == 'fetch':
+                ops_to_remove.append(i)
+                fetch_var_name = cpt.to_bytes(op.output('Out')[0])
+                root_block._remove_var(fetch_var_name)
+                # NOTE: some old pre-train models have no extra scale_op
+                if not op.input('X')[0].startswith('save_infer_model/scale_'):
+                    self._output_descs.append(
+                        root_block.find_var(cpt.to_bytes(op.input('X')[0])))
+            else:
+                if op.has_attr("op_callstack"):
+                    op.remove_attr("op_callstack")
+
+        for op_idx in reversed(ops_to_remove):
+            root_block._remove_op(op_idx, op_idx + 1)
+
+        # 2. Input processing, reverse feed vars
+        self._input_names.reverse()
+
+        # 3. Output processing, add scale for outputs
+        tmp_program = _build_program_by_desc(program_desc)
+        # NOTE: [why need append scale for outputs]
+        # When dealing with some more complex pre-training models, there 
+        # will be situations where the pre-training model has multiple 
+        # fetch outputs. In the scenario of multiple fetch outputs, 
+        # there is a special case where multiple outputs of the model 
+        # may be on the same branch. According to the user's subsequent 
+        # use, multiple outputs may be associated with multiple branches.
+        # These subsequent operations are added in TranslatedLayer is 
+        # agnostic during initialization, which results in subsequent 
+        # gradient accumulation operations that are required on the 
+        # output node in the middle of the branch will not be performed, 
+        # resulting in error, details see pull request:
+        # [https://github.com/PaddlePaddle/Paddle/pull/24627]
+        self._append_scale_to_output(tmp_program)
+
+        # 4. Persistable vars processing
+        # - append @LOADED suffix to persistable vars
+        # NOTE: [why need to append suffix to persistable vars]
+        # Dygraph and static graph mode use the same naming mechanism. 
+        # If users want to load the model fine-tune, it is possible 
+        # to add the existing Layer in the loaded model to enhance 
+        # the network. For example, the original saved model has linear, 
+        # and later after loading, a new linear is added. At this time, 
+        # there will be a problem of duplicate names, so here is unified 
+        # to add the LOADED suffix to the parameters of the model loaded
+        # during training. And in order to avoid multiple @LOADED suffix
+        # are appended to variable name, we only append @LOADED suffix to
+        # the variable that not contains @LOADED suffix.
+        _append_loaded_suffix_to_var(program_desc)
+        # - get persistable var
+        self._persistable_names = _get_persistable_var_names(program_desc)
+
+        return program_desc
+
+    @switch_to_static_graph
+    def _append_scale_to_output(self, program):
+        # 1. append scale & save var
+        scale_output_vars = []
+        with framework.program_guard(program):
+            for i, out in enumerate(self._output_descs):
+                var = program.global_block().var(out.name())
+                var = nn.scale(
+                    var, 1., name="static_model_runner/scale_{}".format(i))
+                scale_output_vars.append(var)
+        # 2. update output names & descs
+        for i, var in enumerate(scale_output_vars):
+            self._output_descs[i] = var.desc
+
+    @switch_to_static_graph
+    def _append_backward_desc(self, infer_program_desc):
+        program_desc_copy = core.ProgramDesc(infer_program_desc)
+
+        # 1. set all `is_test` attributes to False
+        _change_is_test_status(program_desc_copy, False)
+
+        # 2. prepare program and related var
+        # NOTE: To reuse backward interfaces, build Program firstly.
+        # Originally, there is no need to build a program, but need to almost
+        # rewrite a series of methods for append_backward for program_desc. 
+        # Therefore, in order to reuse the method of backward.py, build the program here.
+        program = _build_program_by_desc(program_desc_copy)
+
+        targets = []
+        for out in self._output_descs:
+            targets.append(program.global_block().var(out.name()))
+
+        # 3. append backward
+        backward.gradients(targets=targets, inputs=[])
+        return program.desc
+
+
+# [ TranslatedLayer : Run program in imperative mode ]
+# 
+# DESIGN IDEA: using an special operator `RunProgram`, execute program inside operator.
+#
+# Op's Inputs:
+#   - the input variable of the user feed
+#   - the necessary parameters of the network
+# Op's Outputs:
+#   - the output variable of fetch
+# 
+# This op receives a complete program desc, internally creates scope
+# and executor, executes this program. Key points:
+#
+# 1. Data Sharing: 
+#   The varBase of the dynamic graph is not in the scope, so before the op
+#   executes the program internally, create persistent variables with the
+#   same name as feed, parameters, and fetch in the scope, and share the
+#   LoDTensor of the op input.
+# 
+# 2. Forward and Backward Separation:
+#   Because the dynamic graph op performs the forward and backward separately,
+#   in the forward op RunProgram, we only execute the forward part of whole program,
+#   and in the backward op RunProgramGrad, we execute the backward part of program.
+#   We can not separate the program into forward and backward part, which will 
+#   make some control flow execution logic wrong.
+
+
+# NOTE: [compatible] deal with model saved by save_inference_model,
+# which need get var info from program desc
+def _load_persistable_vars_by_program(model_path,
+                                      program_holder,
+                                      params_filename=None):
+    # make sure the path has been checked
+    persistable_vars = _get_persistable_vars(program_holder.infer_program)
+    load_var_dict = {}
+    for each_var in persistable_vars:
+        orig_each_name = _remove_loaded_suffix(each_var.name())
+        if _is_parameter(each_var, program_holder.infer_program):
+            # create output varbase
+            new_var = framework.ParamBase(
+                shape=each_var.shape(),
+                dtype=each_var.dtype(),
+                name=each_var.name(),
+                type=each_var.type(),
+                persistable=True)
+        else:
+            new_var = framework._varbase_creator(
+                type=each_var.type(),
+                name=each_var.name(),
+                shpae=each_var.shape(),
+                dtype=each_var.dtype(),
+                persistable=True)
+        if params_filename is None:
+            framework._dygraph_tracer().trace_op(
+                type='load',
+                inputs={},
+                outputs={'Out': new_var},
+                attrs={'file_path': os.path.join(model_path, orig_each_name)})
+        new_var.stop_gradient = False
+        load_var_dict[each_var.name()] = new_var
+
+    if params_filename is not None:
+        load_var_list = []
+        for name in sorted(load_var_dict.keys()):
+            load_var_list.append(load_var_dict[name])
+
+        framework._dygraph_tracer().trace_op(
+            type='load_combine',
+            inputs={},
+            outputs={'Out': load_var_list},
+            attrs={'file_path': os.path.join(model_path, params_filename)})
+
+        for each_var in persistable_vars:
+            if not _is_parameter(each_var, program_holder.infer_program):
+                continue
+            param = load_var_dict[each_var.name()]
+            param.stop_gradient = False
+
+    # NOTE: [Recovery stop gradient information based on the program]
+    # After loading the model, the stop_gradient information 
+    # of the original variable is lost, but if a parameter does not
+    # have a corresponding @GRAD variable in the backward program,
+    # it can be said that it is also stop_gradient
+    all_var_names = _get_all_var_names(program_holder.train_program)
+    for var_name in load_var_dict:
+        grad_var_name = var_name + core.grad_var_suffix()
+        if grad_var_name not in all_var_names:
+            load_var_dict[var_name].stop_gradient = True
+
+    return load_var_dict
+
+
+def _load_persistable_vars(model_path,
+                           var_info_path,
+                           separate_params=False,
+                           params_filename=None):
+    # 1. load extra var info
+    with open(var_info_path, 'rb') as f:
+        extra_var_info = pickle.load(f) if six.PY2 else pickle.load(
+            f, encoding='latin1')
+
+    # 2. construct var dict
+    load_var_dict = dict()
+    load_var_list = []
+    # NOTE: some var may not be Parameter
+    for name in sorted(extra_var_info):
+        # append suffix, see [why need to append suffix to persistable vars]
+        new_name = _append_loaded_suffix(name)
+        # create output varbase
+        if extra_var_info[name].get('trainable', None) is not None:
+            # use default shape and dtype
+            new_var = framework.ParamBase(
+                shape=[1],  # only to pass check, this shape is not meaningful
+                dtype=core.VarDesc.VarType.FP32,
+                name=new_name,
+                persistable=True)
+        else:
+            new_var = framework._varbase_creator(
+                name=new_name, persistable=True)
+
+        # load separate vars
+        if separate_params is True:
+            framework._dygraph_tracer().trace_op(
+                type='load',
+                inputs={},
+                outputs={'Out': new_var},
+                attrs={'file_path': os.path.join(model_path, name)})
+
+        new_var.stop_gradient = extra_var_info[name]['stop_gradient']
+        load_var_dict[new_name] = new_var
+        load_var_list.append(new_var)
+
+    # 3. load all vars
+    if separate_params is False:
+        if params_filename is not None:
+            var_file_path = os.path.join(model_path, params_filename)
+        else:
+            var_file_path = os.path.join(model_path, VARIABLE_FILENAME)
+        framework._dygraph_tracer().trace_op(
+            type='load_combine',
+            inputs={},
+            outputs={'Out': load_var_list},
+            attrs={'file_path': var_file_path})
+
+    return load_var_dict
+
+
+def _construct_program_holders(model_path, model_filename=None):
+    # make sure the path has been checked
+    program_holder_dict = dict()
+
+    if model_filename is not None:
+        # [compatible] if assign model_filename, only can load one program as Layer.forward
+        model_filename = os.path.basename(model_filename)
+        model_file_path = os.path.join(model_path, model_filename)
+        program_holder_dict['forward'] = _ProgramHolder(
+            _load_program_desc(model_file_path))
+    else:
+        for _, _, file_names in os.walk(model_path):
+            for name in file_names:
+                if 'model' in name:
+                    model_file_path = os.path.join(model_path, name)
+                    method_name = name.strip('_')
+                    if method_name == 'model':
+                        method_name = 'forward'
+                    else:
+                        method_name.replace('model', '')
+                    program_holder_dict[method_name] = _ProgramHolder(
+                        _load_program_desc(model_file_path))
+
+    return program_holder_dict
+
+
+def _construct_params_and_buffers(model_path,
+                                  programs,
+                                  separate_params=False,
+                                  params_filename=None):
+    var_info_path = os.path.join(model_path, EXTRA_VAR_INFO_FILENAME)
+    if os.path.exists(var_info_path):
+        var_dict = _load_persistable_vars(model_path, var_info_path,
+                                          separate_params, params_filename)
+    else:
+        var_dict = _load_persistable_vars_by_program(
+            model_path, programs['forward'], params_filename)
+    return var_dict
+
+
+class TranslatedLayer(layers.Layer):
+    """
+    TranslatedLayer is a imperative Layer for holding the model loaded by 
+    :ref:`api_imperative_jit_load` . It can be used like a general Layer 
+    object in eval or train mode.
+    
+    .. note:
+        The TranslatedLayer objects should not be created by constructor, it only can be loaded and constructed by :ref:`api_imperative_jit_load` .
+
+    Examples:
+        .. code-block:: python
+
+            import numpy as np
+            import paddle.fluid as fluid
+            from paddle.fluid.dygraph import Linear
+            from paddle.fluid.dygraph import declarative
+
+            BATCH_SIZE = 32
+            BATCH_NUM = 20
+
+            def random_batch_reader():
+                def _get_random_images_and_labels(image_shape, label_shape):
+                    image = np.random.random(size=image_shape).astype('float32')
+                    label = np.random.random(size=label_shape).astype('int64')
+                    return image, label
+
+                def __reader__():
+                    for _ in range(BATCH_NUM):
+                        batch_image, batch_label = _get_random_images_and_labels(
+                            [BATCH_SIZE, 784], [BATCH_SIZE, 1])
+                        yield batch_image, batch_label
+
+                return __reader__
+
+            class LinearNet(fluid.dygraph.Layer):
+                def __init__(self, in_size, out_size):
+                    super(LinearNet, self).__init__()
+                    self._linear = Linear(in_size, out_size)
+
+                @declarative
+                def forward(self, x):
+                    return self._linear(x)
+
+            # enable dygraph mode
+            fluid.enable_dygraph() 
+
+            # 1. train & save model.
+            # create network
+            net = LinearNet(784, 1)
+            adam = fluid.optimizer.AdamOptimizer(learning_rate=0.1, parameter_list=net.parameters())
+            # create data loader
+            train_loader = fluid.io.DataLoader.from_generator(capacity=5)
+            train_loader.set_batch_generator(random_batch_reader())
+            # train
+            for data in train_loader():
+                img, label = data
+                label.stop_gradient = True
+
+                cost = net(img)
+
+                loss = fluid.layers.cross_entropy(cost, label)
+                avg_loss = fluid.layers.mean(loss)
+
+                avg_loss.backward()
+                adam.minimize(avg_loss)
+                net.clear_gradients()
+
+            model_path = "linear.example.model"
+            fluid.dygraph.jit.save(
+                layer=net,
+                model_path=model_path,
+                input_spec=[img])
+
+            # 2. load model as TranslatedLayer
+            translated_layer = fluid.dygraph.jit.load(model_path)
+            # inference
+            translated_layer.eval()
+            x = fluid.dygraph.to_variable(np.random.random((1, 784)).astype('float32'))
+            pred = translated_layer(x)
+            # fine-tune
+            translated_layer.train()
+            adam = fluid.optimizer.AdamOptimizer(learning_rate=0.1, parameter_list=translated_layer.parameters())
+            train_loader = fluid.io.DataLoader.from_generator(capacity=5)
+            train_loader.set_batch_generator(random_batch_reader())
+            for data in train_loader():
+                img, label = data
+                label.stop_gradient = True
+
+                cost = translated_layer(img)
+
+                loss = fluid.layers.cross_entropy(cost, label)
+                avg_loss = fluid.layers.mean(loss)
+
+                avg_loss.backward()
+                adam.minimize(avg_loss)
+                translated_layer.clear_gradients()
+    """
+
+    def __init__(self, programs, persistable_vars):
+        super(TranslatedLayer, self).__init__()
+
+        if not isinstance(programs, dict):
+            raise TypeError(
+                "TranslatedLayer need to use _ProgramHolder's dict for initialization."
+            )
+        if not isinstance(persistable_vars, dict):
+            raise TypeError(
+                "TranslatedLayer need to use persisatbale variable dict for initialization."
+            )
+
+        self._program_holder_dict = programs
+
+        for name, var in persistable_vars.items():
+            if isinstance(var, framework.ParamBase):
+                self.add_parameter(name, var)
+            elif isinstance(var, core.VarBase):
+                self.register_buffer(name, var)
+            else:
+                raise TypeError(
+                    "Adding persistent variable which  to layer is not supported now"
+                )
+
+        self._is_test = True
+
+    @staticmethod
+    @framework.dygraph_only
+    def _construct(model_path, configs=None):
+        # 0. dir and filename check
+        model_path = os.path.normpath(model_path)
+        if not os.path.isdir(model_path):
+            raise ValueError("There is no directory named '%s'" % model_path)
+        model_filename = None
+        params_filename = None
+        separate_params = False
+        if configs is not None:
+            model_filename = configs.model_filename
+            params_filename = configs.params_filename
+            separate_params = configs.separate_params
+
+        # 1. load program desc & construct _ProgramHolder
+        programs = _construct_program_holders(model_path, model_filename)
+
+        # 2. load layer parameters & parameter attirbutes
+        persistable_vars = _construct_params_and_buffers(
+            model_path, programs, separate_params, params_filename)
+
+        # 3. construct TranslatedLayer object
+        translated_layer = TranslatedLayer(programs, persistable_vars)
+
+        # 4. create TranslatedLayer's execution method
+        for method_name, program_holder in programs.items():
+            setattr(TranslatedLayer, method_name,
+                    TranslatedLayer._execution_method_creator(method_name,
+                                                              program_holder))
+
+        # 5. set TranslatedLayer's default mode to eval
+        translated_layer.eval()
+
+        return translated_layer
+
+    @staticmethod
+    def _execution_method_creator(method_name, program_holder):
+        def __impl__(self, *input):
+            # 1. prepare inputs, outputs, attrs
+            input_vars = []
+            for i, value in enumerate(input):
+                if not isinstance(value, (np.ndarray, core.VarBase)):
+                    raise TypeError(
+                        "The type of input in TranslatedLayer must be numpy array or Variable(VarBase), but received %s."
+                        % type(value))
+                # NOTE: In order to unify the API, firstly convert the input to VarBase
+                if isinstance(value, np.ndarray):
+                    var = core.VarBase(
+                        value=value,
+                        name=program_holder.input_names[i],
+                        persistable=False,
+                        place=framework._current_expected_place(),
+                        zero_copy=True)
+                else:
+                    var = value
+                    # NOTE: we changed var name here, 
+                    # but it may be an important name set by user
+                    var.name = program_holder.input_names[i]
+                input_vars.append(var)
+
+            persistable_vars = []
+            for var_name in program_holder.persistable_names:
+                if var_name in self._parameters:
+                    persistable_vars.append(self._parameters[var_name])
+                elif var_name in self._buffers:
+                    persistable_vars.append(self._buffers[var_name])
+                else:
+                    raise ValueError(
+                        "The persistable variable %s is not exists in current TranslatedLayer."
+                        % var_name)
+
+            output_vars = []
+            for var_desc in program_holder.output_decs:
+                var = core.VarBase(var_desc.dtype(),
+                                   var_desc.shape(),
+                                   var_desc.name(), var_desc.type(), False)
+                output_vars.append(var)
+
+            # hold forward variables
+            tmp_scope_vec = core.VarBase(core.VarDesc.VarType.FP32, [],
+                                         "program_out_scope",
+                                         core.VarDesc.VarType.STEP_SCOPES, True)
+            tmp_scope_vec.value().set_scope(program_holder.scope)
+
+            # 2. run prorgam by op
+            trace_program = program_holder.infer_program if self._is_test else program_holder.train_program
+            end_op_index = program_holder.infer_program.block(0).op_size()
+            framework._dygraph_tracer().trace_op(
+                type='run_program',
+                inputs={'X': input_vars,
+                        'Params': persistable_vars},
+                outputs={'Out': output_vars,
+                         'OutScope': tmp_scope_vec},
+                attrs={
+                    'global_block': trace_program.block(0),
+                    'start_op_index': 0,
+                    'end_op_index': end_op_index,
+                    'is_test': self._is_test
+                })
+
+            # NOTE: [ why need set param's gradient type here ]
+            # if user set sparse gradient mode, the param's gradient
+            # will be SelectedRows, not LoDTensor. But tracer will just
+            # set param grad VarBase by forward VarBase(LoDTensor)
+            # If we don't change grad_var type here, RunProgramOp need
+            # transform SelectedRows to LoDTensor forcely, it may not
+            # be user wanted result.
+            for persistable_var in persistable_vars:
+                grad_var_name = var.name + core.grad_var_suffix()
+                grad_var = trace_program.block(0).find_var(
+                    cpt.to_bytes(grad_var_name))
+                # NOTE: cannot find var desc maybe not problem, 
+                # such as in batch_norm
+                if grad_var is None:
+                    continue
+                persistable_var._set_grad_type(grad_var.type())
+
+            # 3. prepare output, keep same form with inputs
+            outs = output_vars
+            if len(output_vars) == 1:
+                outs = output_vars[0]
+            return outs
+
+        __impl__.__name__ = method_name
+        return __impl__
+
+    def train(self):
+        self._is_test = False
+
+    def eval(self):
+        self._is_test = True
diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py
index a60e03f8a881685e91145bf680324023882ca93d..bd468b55d812e76841cd946d30e5e9a9503c2a65 100644
--- a/python/paddle/fluid/dygraph/jit.py
+++ b/python/paddle/fluid/dygraph/jit.py
@@ -14,19 +14,22 @@
 
 from __future__ import print_function
 
-__all__ = ['TracedLayer', 'declarative', 'dygraph_to_static_func']
+import os
+import six
+import pickle
 
-import logging
+import warnings
 from paddle.fluid import core
 from paddle.fluid.compiler import CompiledProgram
 from paddle.fluid.dygraph.base import program_desc_tracing_guard, switch_to_static_graph
-from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTranslator
+from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTranslator, FunctionSpec
 from paddle.fluid.dygraph.layers import Layer
 from paddle.fluid.executor import Executor, scope_guard
-from paddle.fluid.framework import Program, Block, Variable, _dygraph_tracer, dygraph_only, _dygraph_guard, _current_expected_place, in_dygraph_mode
+from paddle.fluid.framework import Program, Block, Variable, ParamBase, _dygraph_tracer, dygraph_only, _dygraph_guard, _current_expected_place, in_dygraph_mode
 from paddle.fluid.wrapped_decorator import wrap_decorator
+from paddle.fluid.dygraph.io import TranslatedLayer, VARIABLE_FILENAME, EXTRA_VAR_INFO_FILENAME
 
-logger = logging.getLogger("fluid")
+__all__ = ['TracedLayer', 'declarative', 'dygraph_to_static_func']
 
 
 def create_program_from_desc(program_desc):
@@ -104,7 +107,7 @@ def _dygraph_to_static_func_(dygraph_func):
     def __impl__(*args, **kwargs):
         program_translator = ProgramTranslator()
         if in_dygraph_mode() or not program_translator.enable_declarative:
-            logger.info(
+            warnings.warn(
                 "The decorator 'dygraph_to_static_func' doesn't work in "
                 "dygraph mode or set ProgramTranslator.enable to False. "
                 "We will just return dygraph output.")
@@ -156,7 +159,7 @@ def _declarative_(dygraph_func):
     def __impl__(*args, **kwargs):
         program_translator = ProgramTranslator()
         if not program_translator.enable_declarative:
-            logger.info(
+            warnings.warn(
                 "The decorator 'declarative' doesn't work when setting ProgramTranslator.enable=False. "
                 "We will just return dygraph output.")
             return dygraph_func(*args, **kwargs)
@@ -168,6 +171,802 @@ def _declarative_(dygraph_func):
 declarative = wrap_decorator(_declarative_)
 
 
+class SaveLoadConfig(object):
+    """
+    The additional configuration options may be used in function 
+    :ref:`api_imperative_jit_save` that save :ref:`api_imperative_TranslatedLayer` 
+    or used in function :ref:`api_imperative_jit_load` that 
+    load :ref:`api_imperative_TranslatedLayer` .
+    
+    Examples:
+        1. Using ``SaveLoadConfig`` when saving model
+
+        .. code-block:: python
+
+            import numpy as np
+            import paddle.fluid as fluid
+            from paddle.fluid.dygraph import Linear
+            from paddle.fluid.dygraph import declarative
+
+            class SimpleNet(fluid.dygraph.Layer):
+                def __init__(self, in_size, out_size):
+                    super(SimpleNet, self).__init__()
+                    self._linear = Linear(in_size, out_size)
+
+                @declarative
+                def forward(self, x):
+                    y = self._linear(x)
+                    z = self._linear(y)
+                    return z
+
+            # enable dygraph mode
+            fluid.enable_dygraph() 
+
+            # train model
+            net = SimpleNet(8, 8)
+            adam = fluid.optimizer.AdamOptimizer(learning_rate=0.1, parameter_list=net.parameters())
+            x = fluid.dygraph.to_variable(np.random.random((4, 8)).astype('float32'))
+            for i in range(10):
+                out = net(x)
+                loss = fluid.layers.mean(out)
+                loss.backward()
+                adam.minimize(loss)
+                net.clear_gradients()
+
+            # use SaveLoadconfig when saving model
+            model_path = "simplenet.example.model"
+            configs = fluid.dygraph.jit.SaveLoadConfig()
+            configs.model_filename = "__simplenet__"
+            fluid.dygraph.jit.save(
+                layer=net,
+                model_path=model_path,
+                input_spec=[x],
+                configs=configs)
+
+        2. Using ``SaveLoadConfig`` when loading model
+
+        .. code-block:: python
+
+            import numpy as np
+            import paddle.fluid as fluid
+
+            # enable dygraph mode
+            fluid.enable_dygraph() 
+
+            # use SaveLoadconfig when loading model
+            model_path = "simplenet.example.model"
+            configs = fluid.dygraph.jit.SaveLoadConfig()
+            configs.model_filename = "__simplenet__"
+            infer_net = fluid.dygraph.jit.load(model_path, configs=configs)
+            # inference
+            x = fluid.dygraph.to_variable(np.random.random((4, 8)).astype('float32'))
+            pred = infer_net(x)
+    """
+
+    def __init__(self):
+        self._output_spec = None
+        self._model_filename = None
+        self._params_filename = None
+        self._separate_params = False
+
+        # NOTE: Users rarely use following configs, so these configs are not open to users,
+        # reducing user learning costs, but we retain the configuration capabilities
+
+        # If True, programs are modified to only support direct inference deployment. 
+        # Otherwise,more information will be stored for flexible optimization and re-training. 
+        # Currently, only True is supported
+        self._export_for_deployment = True
+
+        # If True, It will save inference program only, and do not save params of Program
+        self._program_only = False
+
+    @property
+    def output_spec(self):
+        """
+        Selects the output targets of the saved model ( :ref:`api_imperative_TranslatedLayer` ).
+        By default, all return variables of original Layer's forward function
+        are kept as the output of the saved TranslatedLayer.
+
+        The ``output_spec`` type should be list[Variable]. If the provided ``output_spec``
+        list is not all output variables, the saved model will be pruned according to the
+        given ``output_spec`` list.
+
+        .. note::
+            The ``output_spec`` is only used when saving model.
+
+        Examples:
+            .. code-block:: python
+
+                import numpy as np
+                import paddle.fluid as fluid
+                from paddle.fluid.dygraph import Linear
+                from paddle.fluid.dygraph import declarative
+
+                class SimpleNet(fluid.dygraph.Layer):
+                    def __init__(self, in_size, out_size):
+                        super(SimpleNet, self).__init__()
+                        self._linear = Linear(in_size, out_size)
+
+                    @declarative
+                    def forward(self, x):
+                        y = self._linear(x)
+                        z = self._linear(y)
+                        loss = fluid.layers.mean(z)
+                        return z, loss
+
+                # enable dygraph mode
+                fluid.enable_dygraph() 
+
+                # train model
+                net = SimpleNet(8, 8)
+                adam = fluid.optimizer.AdamOptimizer(learning_rate=0.1, parameter_list=net.parameters())
+                x = fluid.dygraph.to_variable(np.random.random((4, 8)).astype('float32'))
+                for i in range(10):
+                    out, loss = net(x)
+                    loss.backward()
+                    adam.minimize(loss)
+                    net.clear_gradients()
+
+                # use SaveLoadconfig.output_spec
+                model_path = "simplenet.example.model.output_spec"
+                configs = fluid.dygraph.jit.SaveLoadConfig()
+                # only keep the predicted output in saved model, diccard loss
+                configs.output_spec = [out]
+
+                fluid.dygraph.jit.save(
+                    layer=net,
+                    model_path=model_path,
+                    input_spec=[x],
+                    configs=configs)
+
+                infer_net = fluid.dygraph.jit.load(model_path, configs=configs)
+                x = fluid.dygraph.to_variable(np.random.random((4, 8)).astype('float32'))
+                # only have the predicted output
+                pred = infer_net(x)
+        """
+        return self._output_spec
+
+    @output_spec.setter
+    def output_spec(self, spec):
+        if not isinstance(spec, list):
+            raise TypeError(
+                "The SaveLoadConfig.output_spec should be 'list', but received input type is %s."
+                % type(input))
+            for var in spec:
+                if not isinstance(var, core.VarBase):
+                    raise TypeError(
+                        "The element in SaveLoadConfig.output_spec list should be 'Variable', but received element's type is %s."
+                        % type(var))
+        self._output_spec = spec
+
+    @property
+    def model_filename(self):
+        """
+        The name of file to save the translated program of target Layer.
+        Default filename is :code:`__model__` .
+
+        Exampels:
+            .. code-block:: python
+
+                import numpy as np
+                import paddle.fluid as fluid
+                from paddle.fluid.dygraph import Linear
+                from paddle.fluid.dygraph import declarative
+
+                class SimpleNet(fluid.dygraph.Layer):
+                    def __init__(self, in_size, out_size):
+                        super(SimpleNet, self).__init__()
+                        self._linear = Linear(in_size, out_size)
+
+                    @declarative
+                    def forward(self, x):
+                        y = self._linear(x)
+                        z = self._linear(y)
+                        return z
+
+                # enable dygraph mode
+                fluid.enable_dygraph() 
+
+                # train model
+                net = SimpleNet(8, 8)
+                adam = fluid.optimizer.AdamOptimizer(learning_rate=0.1, parameter_list=net.parameters())
+                x = fluid.dygraph.to_variable(np.random.random((4, 8)).astype('float32'))
+                for i in range(10):
+                    out = net(x)
+                    loss = fluid.layers.mean(out)
+                    loss.backward()
+                    adam.minimize(loss)
+                    net.clear_gradients()
+
+                model_path = "simplenet.example.model.model_filename"
+                configs = fluid.dygraph.jit.SaveLoadConfig()
+                configs.model_filename = "__simplenet__"
+
+                # saving with configs.model_filename
+                fluid.dygraph.jit.save(
+                    layer=net,
+                    model_path=model_path,
+                    input_spec=[x],
+                    configs=configs)
+                # [result] the saved model directory contains:
+                # __simplenet__  __variables__  __variables.info__
+
+                # loading with configs.model_filename
+                infer_net = fluid.dygraph.jit.load(model_path, configs=configs)
+                x = fluid.dygraph.to_variable(np.random.random((4, 8)).astype('float32'))
+                pred = infer_net(x)
+        """
+        return self._model_filename
+
+    @model_filename.setter
+    def model_filename(self, filename):
+        if not isinstance(filename, six.string_types):
+            raise TypeError(
+                "The SaveLoadConfig.model_filename should be str, but received input's type is %s."
+                % type(filename))
+        if len(filename) == 0:
+            raise ValueError(
+                "The SaveLoadConfig.model_filename is empty string.")
+        self._model_filename = filename
+
+    @property
+    def params_filename(self):
+        """
+        The name of file to save all persistable variables in target Layer. 
+        Default file name is :code:`__variables__` .
+        
+        Exampels:
+            .. code-block:: python
+
+                import numpy as np
+                import paddle.fluid as fluid
+                from paddle.fluid.dygraph import Linear
+                from paddle.fluid.dygraph import declarative
+
+                class SimpleNet(fluid.dygraph.Layer):
+                    def __init__(self, in_size, out_size):
+                        super(SimpleNet, self).__init__()
+                        self._linear = Linear(in_size, out_size)
+
+                    @declarative
+                    def forward(self, x):
+                        y = self._linear(x)
+                        z = self._linear(y)
+                        return z
+
+                # enable dygraph mode
+                fluid.enable_dygraph() 
+
+                # train model
+                net = SimpleNet(8, 8)
+                adam = fluid.optimizer.AdamOptimizer(learning_rate=0.1, parameter_list=net.parameters())
+                x = fluid.dygraph.to_variable(np.random.random((4, 8)).astype('float32'))
+                for i in range(10):
+                    out = net(x)
+                    loss = fluid.layers.mean(out)
+                    loss.backward()
+                    adam.minimize(loss)
+                    net.clear_gradients()
+
+                model_path = "simplenet.example.model.params_filename"
+                configs = fluid.dygraph.jit.SaveLoadConfig()
+                configs.params_filename = "__params__"
+
+                # saving with configs.params_filename
+                fluid.dygraph.jit.save(
+                    layer=net,
+                    model_path=model_path,
+                    input_spec=[x],
+                    configs=configs)
+                # [result] the saved model directory contains:
+                # __model__  __params__  __variables.info__
+
+                # loading with configs.params_filename
+                infer_net = fluid.dygraph.jit.load(model_path, configs=configs)
+                x = fluid.dygraph.to_variable(np.random.random((4, 8)).astype('float32'))
+                pred = infer_net(x)
+        """
+        return self._params_filename
+
+    @params_filename.setter
+    def params_filename(self, filename):
+        if not isinstance(filename, six.string_types):
+            raise TypeError(
+                "The SaveLoadConfig.params_filename should be str, but received input's type is %s."
+                % type(filename))
+        if len(filename) == 0:
+            raise ValueError(
+                "The SaveLoadConfig.params_filename is empty string.")
+        self._params_filename = filename
+
+    # NOTE: [why not use params_filename=None control params saved separately]
+    # The new save interface does not recommend parameters to be saved separately. 
+    # Here, the concept should be separated as clearly as possible. 
+    # Setting params_filename=None only means that the saved file name is set 
+    # and without any other meaning. New separate_params control for file saved
+    # separately can makes the concept clearer.
+    @property
+    def separate_params(self):
+        """
+        Configure whether to save the Layer parameters as separete files.
+        (In order to be compatible with the behavior of :ref:`api_fluid_io_save_inference_model` )
+
+        If True, each parameter will be saved to a file separately, the file name is the parameter name,
+        and the SaveLoadConfig.params_filename configuration will not take effect. Default False.
+
+        Examples:
+            .. code-block:: python
+
+                import numpy as np
+                import paddle.fluid as fluid
+                from paddle.fluid.dygraph import Linear
+                from paddle.fluid.dygraph import declarative
+
+                class SimpleNet(fluid.dygraph.Layer):
+                    def __init__(self, in_size, out_size):
+                        super(SimpleNet, self).__init__()
+                        self._linear = Linear(in_size, out_size)
+
+                    @declarative
+                    def forward(self, x):
+                        y = self._linear(x)
+                        z = self._linear(y)
+                        return z
+
+                # enable dygraph mode
+                fluid.enable_dygraph() 
+
+                # train model
+                net = SimpleNet(8, 8)
+                adam = fluid.optimizer.AdamOptimizer(learning_rate=0.1, parameter_list=net.parameters())
+                x = fluid.dygraph.to_variable(np.random.random((4, 8)).astype('float32'))
+                for i in range(10):
+                    out = net(x)
+                    loss = fluid.layers.mean(out)
+                    loss.backward()
+                    adam.minimize(loss)
+                    net.clear_gradients()
+
+                model_path = "simplenet.example.model.separate_params"
+                configs = fluid.dygraph.jit.SaveLoadConfig()
+                configs.separate_params = True
+
+                # saving with configs.separate_params
+                fluid.dygraph.jit.save(
+                    layer=net,
+                    model_path=model_path,
+                    input_spec=[x],
+                    configs=configs)
+                # [result] the saved model directory contains:
+                # linear_0.b_0  linear_0.w_0  __model__  __variables.info__
+
+                # loading with configs.params_filename
+                infer_net = fluid.dygraph.jit.load(model_path, configs=configs)
+                x = fluid.dygraph.to_variable(np.random.random((4, 8)).astype('float32'))
+                pred = infer_net(x)
+        """
+        return self._separate_params
+
+    @separate_params.setter
+    def separate_params(self, value):
+        if not isinstance(value, bool):
+            raise TypeError(
+                "The SaveLoadConfig.separate_params should be bool value, but received input's type is %s."
+                % type(value))
+        self._separate_params = value
+
+
+@switch_to_static_graph
+def save(layer, model_path, input_spec=None, configs=None):
+    """
+    Saves input declarative Layer as :ref:`api_imperative_TranslatedLayer` 
+    format model, which can be used for inference or fine-tuning after loading.
+
+    It will save the translated program and all related persistable 
+    variables of input declarative Layer to given ``model_path``.
+    
+    The default saved translated program file name is ``__model__``,
+    and the default saved persistable variables file name is ``__variables__``,
+    and it also saved some additional variable description information to file 
+    ``__varibales.info__``, these additional information is used in fine-tuning.
+
+    The saved model can be loaded by follow APIs:
+      - :ref:`api_imperative_jit_load`
+      - :ref:`api_fluid_io_load_inference_model` (need pass ``params_filename='__variables__'``)
+      - Other C++ inference APIs
+
+    Args:
+        layer (Layer): the Layer to be saved. The Layer should be decorated by `@declarative`.
+        model_path (str): the directory to save the model.
+        input_spec (list[Varibale], optional): Describes the input of the saved model. 
+            It is the example inputs that will be passed to saved TranslatedLayer's forward
+            function. If None, all input variables of the original Layer's forward function
+            would be the inputs of the saved model. Default None.
+        configs (SaveLoadConfig, optional): :ref:`api_imperative_jit_saveLoadConfig` object
+            that specifies additional configuration options. Default None.
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: python
+
+            import numpy as np
+            import paddle.fluid as fluid
+            from paddle.fluid.dygraph import Linear
+            from paddle.fluid.dygraph import declarative
+
+            BATCH_SIZE = 32
+            BATCH_NUM = 20
+
+            def random_batch_reader():
+                def _get_random_images_and_labels(image_shape, label_shape):
+                    image = np.random.random(size=image_shape).astype('float32')
+                    label = np.random.random(size=label_shape).astype('int64')
+                    return image, label
+
+                def __reader__():
+                    for _ in range(BATCH_NUM):
+                        batch_image, batch_label = _get_random_images_and_labels(
+                            [BATCH_SIZE, 784], [BATCH_SIZE, 1])
+                        yield batch_image, batch_label
+
+                return __reader__
+
+            class LinearNet(fluid.dygraph.Layer):
+                def __init__(self, in_size, out_size):
+                    super(LinearNet, self).__init__()
+                    self._linear = Linear(in_size, out_size)
+
+                @declarative
+                def forward(self, x):
+                    return self._linear(x)
+
+            # enable dygraph mode
+            fluid.enable_dygraph() 
+
+            # create network
+            net = LinearNet(784, 1)
+            adam = fluid.optimizer.AdamOptimizer(learning_rate=0.1, parameter_list=net.parameters())
+            # create data loader
+            train_loader = fluid.io.DataLoader.from_generator(capacity=5)
+            train_loader.set_batch_generator(random_batch_reader())
+            # train
+            for data in train_loader():
+                img, label = data
+                label.stop_gradient = True
+
+                cost = net(img)
+
+                loss = fluid.layers.cross_entropy(cost, label)
+                avg_loss = fluid.layers.mean(loss)
+
+                avg_loss.backward()
+                adam.minimize(avg_loss)
+                net.clear_gradients()
+
+            # save model
+            model_path = "linear.example.model"
+            fluid.dygraph.jit.save(
+                layer=net,
+                model_path=model_path,
+                input_spec=[img])
+    """
+
+    def get_inout_spec(all_vars, target_vars, return_name=False):
+        valid_vars = [var for var in all_vars if isinstance(var, Variable)]
+        valid_var_dict = {}
+        for var in valid_vars:
+            valid_var_dict[var.name] = var
+        if target_vars:
+            for i, var in enumerate(target_vars):
+                # check target var whether exists
+                if var.name not in valid_var_dict:
+                    raise RuntimeError(
+                        "The variable to feed/fetch are not exist.")
+                target_vars[i] = valid_var_dict[var.name]
+        else:
+            target_vars = valid_vars
+        if return_name:
+            target_vars = [var.name for var in target_vars]
+
+        return target_vars
+
+    # 1. input check
+    prog_translator = ProgramTranslator()
+    if not prog_translator.enable:
+        raise RuntimeError(
+            "The paddle.imperative.jit.save doesn't work when setting ProgramTranslator.enable=False."
+        )
+    if not isinstance(layer, Layer):
+        raise TypeError(
+            "The input layer of paddle.imperative.jit.save should be 'Layer', but received layer type is %s."
+            % type(layer))
+
+    if configs is None:
+        configs = SaveLoadConfig()
+
+    if input_spec is not None:
+        if not isinstance(input_spec, list):
+            raise TypeError(
+                "The input input_spec should be 'list', but received input_spec's type is %s."
+                % type(input_spec))
+        for var in input_spec:
+            if not isinstance(var, core.VarBase):
+                raise TypeError(
+                    "The element in input_spec list should be 'Variable', but received element's type is %s."
+                    % type(var))
+
+    # 2. get program of declarative Layer.forward
+    prog_cache = prog_translator.get_program_cache()
+    # make dummy args & kwargs, to get excepted FunctionSpec
+    layer_func = FunctionSpec(type(layer).forward, [layer], {})
+    concrete_program, _ = prog_cache.get_program(layer_func)
+
+    # 3. share parameters from Layer to scope & record var info
+    scope = core.Scope()
+    state_dict = layer.state_dict()
+    extra_var_info = dict()
+    for structured_name, param_or_buffer in state_dict.items():
+        # share to scope
+        param_or_buffer_tensor = scope.var(param_or_buffer.name).get_tensor()
+        src_tensor = param_or_buffer.value().get_tensor()
+        param_or_buffer_tensor._share_data_with(src_tensor)
+        # record var info
+        extra_info_dict = dict()
+        extra_info_dict['structured_name'] = structured_name
+        extra_info_dict['stop_gradient'] = param_or_buffer.stop_gradient
+        if isinstance(param_or_buffer, ParamBase):
+            extra_info_dict['trainable'] = param_or_buffer.trainable
+        extra_var_info[param_or_buffer.name] = extra_info_dict
+
+    # 4. build input & output spec
+    input_var_names = get_inout_spec(concrete_program.inputs, input_spec, True)
+    output_vars = get_inout_spec(concrete_program.outputs, configs.output_spec)
+
+    # 5. save inference model
+    from paddle.fluid.io import save_inference_model
+
+    # VARIABLE_FILENAME keep nameing style consistent with '__model__'
+    if configs.params_filename is None:
+        configs.params_filename = VARIABLE_FILENAME
+
+    with scope_guard(scope):
+        save_inference_model(
+            dirname=model_path,
+            feeded_var_names=input_var_names,
+            target_vars=output_vars,
+            executor=Executor(_current_expected_place()),
+            main_program=concrete_program.main_program.clone(),
+            model_filename=configs.model_filename,
+            params_filename=None
+            if configs.separate_params else configs.params_filename,
+            export_for_deployment=configs._export_for_deployment,
+            program_only=configs._program_only)
+
+        # NOTE: [ Save extra variable info ]
+        # save_inference_model will lose some important variable information, including:
+        #   - Variable name and correspondence (when saved variables as one file)
+        #   - Variable.stop_gradient information
+        #   - Which persistent variable are parameter and which are not
+        #   - Parameter.trainable information
+        #
+        # The lost information cannot be recovered when it is loaded again, 
+        # so if we want to perform fine-tune after loading, we may need to 
+        # configure redundant information to proceed.
+        #
+        # Due to compatibility issues, we cannot change the original storage structure, 
+        # but we can save these information in `jit.save` without changing the original 
+        # storage to improve user experience. So we save extra information into
+        # file `__variables.info__`
+        extra_var_info_path = os.path.join(model_path, EXTRA_VAR_INFO_FILENAME)
+        with open(extra_var_info_path, 'wb') as f:
+            pickle.dump(extra_var_info, f, protocol=2)
+
+
+@dygraph_only
+def load(model_path, configs=None):
+    """
+    :api_attr: imperative
+
+    Load model saved by :ref:`api_imperative_jit_save` or :ref:`api_fluid_io_save_inference_model`
+    as :ref:`api_imperative_TranslatedLayer`, then performing inference or fine-tune training.
+
+    .. note::
+        For some historical reasons, if you load model saved by :ref:`api_fluid_io_save_inference_model`,
+        there will be the following limitations when using it in fine-tuning:
+        1. Imperative mode do not support LoDTensor. All original model's feed targets or parametars that depend on LoD are temporarily unavailable.
+        2. All saved model's feed targets need to be passed into TranslatedLayer's forwrad function.
+        3. The variable's ``stop_gradient`` information is lost and can not be recovered.
+        4. The parameter's ``trainable`` information is lost and can not be recovered.
+
+    Args:
+        model_path (str): The directory path where the model is saved.
+        configs (SaveLoadConfig, optional): :ref:`api_imperative_jit_saveLoadConfig` object that specifies 
+            additional configuration options. Default None.
+
+    Returns:
+        TranslatedLayer: A Layer object can run saved translated model.
+
+    Examples:
+        1. Load model saved by :ref:`api_imperative_jit_save` then performing inference and fine-tune training.
+
+        .. code-block:: python
+
+            import numpy as np
+            import paddle.fluid as fluid
+            from paddle.fluid.dygraph import Linear
+            from paddle.fluid.dygraph import declarative
+
+            BATCH_SIZE = 32
+            BATCH_NUM = 20
+
+            def random_batch_reader():
+                def _get_random_images_and_labels(image_shape, label_shape):
+                    image = np.random.random(size=image_shape).astype('float32')
+                    label = np.random.random(size=label_shape).astype('int64')
+                    return image, label
+
+                def __reader__():
+                    for _ in range(BATCH_NUM):
+                        batch_image, batch_label = _get_random_images_and_labels(
+                            [BATCH_SIZE, 784], [BATCH_SIZE, 1])
+                        yield batch_image, batch_label
+
+                return __reader__
+
+            class LinearNet(fluid.dygraph.Layer):
+                def __init__(self, in_size, out_size):
+                    super(LinearNet, self).__init__()
+                    self._linear = Linear(in_size, out_size)
+
+                @declarative
+                def forward(self, x):
+                    return self._linear(x)
+
+            # enable dygraph mode
+            fluid.enable_dygraph() 
+
+            # 1. train & save model.
+            # create network
+            net = LinearNet(784, 1)
+            adam = fluid.optimizer.AdamOptimizer(learning_rate=0.1, parameter_list=net.parameters())
+            # create data loader
+            train_loader = fluid.io.DataLoader.from_generator(capacity=5)
+            train_loader.set_batch_generator(random_batch_reader())
+            # train
+            for data in train_loader():
+                img, label = data
+                label.stop_gradient = True
+
+                cost = net(img)
+
+                loss = fluid.layers.cross_entropy(cost, label)
+                avg_loss = fluid.layers.mean(loss)
+
+                avg_loss.backward()
+                adam.minimize(avg_loss)
+                net.clear_gradients()
+
+            model_path = "linear.example.model"
+            fluid.dygraph.jit.save(
+                layer=net,
+                model_path=model_path,
+                input_spec=[img])
+
+            # 2. load model & inference
+            # load model
+            infer_net = fluid.dygraph.jit.load(model_path)
+            # inference
+            x = fluid.dygraph.to_variable(np.random.random((1, 784)).astype('float32'))
+            pred = infer_net(x)
+
+            # 3. load model & fine-tune
+            # load model
+            train_net = fluid.dygraph.jit.load(model_path)
+            train_net.train()
+            adam = fluid.optimizer.AdamOptimizer(learning_rate=0.1, parameter_list=train_net.parameters())
+            # create data loader
+            train_loader = fluid.io.DataLoader.from_generator(capacity=5)
+            train_loader.set_batch_generator(random_batch_reader())
+            # fine-tune
+            for data in train_loader():
+                img, label = data
+                label.stop_gradient = True
+
+                cost = train_net(img)
+
+                loss = fluid.layers.cross_entropy(cost, label)
+                avg_loss = fluid.layers.mean(loss)
+
+                avg_loss.backward()
+                adam.minimize(avg_loss)
+                train_net.clear_gradients()
+
+        2. Load model saved by :ref:`api_fluid_io_save_inference_model` then performing and fine-tune training.
+
+        .. code-block:: python
+
+            import numpy as np
+            import paddle.fluid as fluid
+
+            BATCH_SIZE = 32
+            BATCH_NUM = 20
+
+            def random_batch_reader():
+                def _get_random_images_and_labels(image_shape, label_shape):
+                    image = np.random.random(size=image_shape).astype('float32')
+                    label = np.random.random(size=label_shape).astype('int64')
+                    return image, label
+
+                def __reader__():
+                    for _ in range(BATCH_NUM):
+                        batch_image, batch_label = _get_random_images_and_labels(
+                            [BATCH_SIZE, 784], [BATCH_SIZE, 1])
+                        yield batch_image, batch_label
+
+                return __reader__
+
+            img = fluid.data(name='img', shape=[None, 784], dtype='float32')
+            label = fluid.data(name='label', shape=[None, 1], dtype='int64')
+            pred = fluid.layers.fc(input=img, size=10, act='softmax')
+            loss = fluid.layers.cross_entropy(input=pred, label=label)
+            avg_loss = fluid.layers.mean(loss)
+
+            optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+            optimizer.minimize(avg_loss)
+
+            place = fluid.CPUPlace()
+            exe = fluid.Executor(place)
+            exe.run(fluid.default_startup_program())
+
+            loader = fluid.io.DataLoader.from_generator(
+                feed_list=[img, label], capacity=5, iterable=True)
+            loader.set_batch_generator(random_batch_reader(), places=place)
+
+            # 1. train and save inference model
+            for data in loader():
+                exe.run(
+                    fluid.default_main_program(),
+                    feed=data, 
+                    fetch_list=[avg_loss])
+
+            model_path = "fc.example.model"
+            fluid.io.save_inference_model(
+                model_path, ["img"], [pred], exe)
+
+            # enable dygraph mode
+            fluid.enable_dygraph() 
+
+            # 2. load model & inference
+            fc = fluid.dygraph.jit.load(model_path)
+            x = fluid.dygraph.to_variable(np.random.random((1, 784)).astype('float32'))
+            pred = fc(x)
+
+            # 3. load model & fine-tune
+            fc = fluid.dygraph.jit.load(model_path)
+            fc.train()
+            sgd = fluid.optimizer.SGD(learning_rate=0.001,
+                                        parameter_list=fc.parameters())
+
+            train_loader = fluid.io.DataLoader.from_generator(capacity=5)
+            train_loader.set_batch_generator(
+                random_batch_reader(), places=place)
+
+            for data in train_loader():
+                img, label = data
+                label.stop_gradient = True
+
+                cost = fc(img)
+
+                loss = fluid.layers.cross_entropy(cost, label)
+                avg_loss = fluid.layers.mean(loss)
+
+                avg_loss.backward()
+                sgd.minimize(avg_loss)
+    """
+    return TranslatedLayer._construct(model_path, configs)
+
+
 @dygraph_only
 def _trace(layer,
            inputs,
diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py
index d80170800deab146f7062da1e8f884c8afe12e24..bba4eb071a4db36b2c4b772843f545937c64e916 100644
--- a/python/paddle/fluid/dygraph/layers.py
+++ b/python/paddle/fluid/dygraph/layers.py
@@ -90,6 +90,9 @@ class Layer(core.Layer):
         self._dtype = dtype
 
         self._parameters = collections.OrderedDict()
+        # Buffers the variable (not parameter) created in layer
+        self._buffers = collections.OrderedDict()
+        self._non_persistable_buffer_names_set = set()
         self._sub_layers = collections.OrderedDict()
         self._loaddict_holder = collections.OrderedDict()
 
@@ -414,6 +417,137 @@ class Layer(core.Layer):
                         layers_set=layers_set):
                     yield p, l
 
+    def register_buffer(self, name, variable, persistable=True):
+        """
+        Registers a variable as buffer into the layer.
+
+        `buffer` is a non-parameteric variable and will not be updated by optimizer,
+        but is necessary for evaluation and inference. For example, the mean and variance in BatchNorm layers.
+        The registered buffer is persistable by default, and will be saved into
+        `state_dict` alongside parameters. If set persistable=False, it registers
+        a non-persistable buffer, so that it will not be a part of `state_dict` .
+
+        Buffers can be accessed as attributes using given names.
+
+        Parameters:
+            name (string): name of the buffer. The buffer can be accessed
+                from this layer using the given name
+            variable (Variable): the variable to be registered as buffer.
+            persistable (bool): whether the buffer is part of this layer's
+                state_dict.
+
+        Returns:
+            None
+        
+        Examples:
+            .. code-block:: python
+
+                import numpy as np
+                import paddle.fluid as fluid
+
+                with fluid.dygraph.guard():
+                    linear = fluid.Linear(10, 3)
+                    value = np.array([0]).astype("float32")
+                    buffer = fluid.dygraph.to_variable(value)
+                    linear.register_buffer("buf_name", buffer, persistable=True)
+                    
+                    # get the buffer by attribute.
+                    print(linear.buf_name)
+
+        """
+
+        if '_buffers' not in self.__dict__:
+            raise ValueError(
+                "super(YourLayer, self).__init__() should be called first")
+        elif not isinstance(name, six.string_types):
+            raise TypeError(
+                "The name of buffer should be a string, but received {}.".
+                format(type(name).__name__))
+        elif '.' in name:
+            raise KeyError("The name of buffer can not contain \".\"")
+        elif name == '':
+            raise KeyError("The name of buffer can not be empty.")
+        elif hasattr(self, name) and name not in self._buffers:
+            raise KeyError("attribute '{}' already exists.".format(name))
+        elif variable is not None and not type(variable) == core.VarBase:
+            raise TypeError(
+                "The registered buffer should be a core.VarBase, but received {}.".
+                format(type(variable).__name__))
+        else:
+            self._buffers[name] = variable
+            if persistable:
+                self._non_persistable_buffer_names_set.discard(name)
+            else:
+                self._non_persistable_buffer_names_set.add(name)
+
+    def buffers(self, include_sublayers=True):
+        """
+        Returns a list of all buffers from current layer and its sub-layers.
+
+        Parameters:
+            include_sublayers(bool, optional): Whether include the buffers of sublayers. If True, also include the buffers from sublayers. Default: True
+
+        Returns:
+            list of :ref:`api_guide_Variable_en` : a list of buffers.
+        """
+        ret = [
+            buffer
+            for _, buffer in self.named_buffers(
+                include_sublayers=include_sublayers)
+        ]
+        return ret
+
+    def named_buffers(self, prefix='', include_sublayers=True):
+        """
+        Returns an iterator over all buffers in the Layer, yielding tuple of name and Variable.
+
+        Parameters:
+            prefix(str, optional): Prefix to prepend to all buffer names. Default: ''.
+            include_sublayers(bool, optional): Whether include the buffers of sublayers.
+                If True, also include the named buffers from sublayers. Default: True.
+
+        Yields:
+            (string, Variable): Tuple of name and Variable
+
+        Examples:
+            .. code-block:: python
+
+                import numpy as np
+                import paddle.fluid as fluid
+
+                with fluid.dygraph.guard():
+                    fc1 = fluid.Linear(10, 3)
+                    buffer1 = fluid.dygraph.to_variable(np.array([0]).astype("float32"))
+                    # register a variable as buffer by specific `persistable`
+                    fc1.register_buffer("buf_name_1", buffer1, persistable=True)
+
+                    fc2 = fluid.Linear(3, 10)
+                    buffer2 = fluid.dygraph.to_variable(np.array([1]).astype("float32"))
+                    # register a buffer by assigning an attribute with Variable.
+                    # The `persistable` can only be False by this way.
+                    fc2.buf_name_2 = buffer2
+
+                    model = fluid.dygraph.Sequential(fc1, fc2)
+
+                    # get all named buffers
+                    for name, buffer in model.named_buffers():
+                        print(name, buffer)
+
+        """
+        buffers_set = set()
+        named_sublayers = self.named_sublayers(
+            prefix=prefix,
+            include_sublayers=include_sublayers,
+            include_self=True)
+        for layer_prefix, sublayer in named_sublayers:
+            buffers = sublayer._buffers.items()
+            for key, buffer in buffers:
+                if buffer is None or buffer in buffers_set:
+                    continue
+                buffers_set.add(buffer)
+                name = layer_prefix + ('.' if layer_prefix else '') + key
+                yield name, buffer
+
     def clear_gradients(self):
         """
         Clear the gradients of all parameters for this layer.
@@ -462,7 +596,7 @@ class Layer(core.Layer):
                         self._parameters.values())
             self._built = True
 
-        with param_guard(self._parameters):
+        with param_guard(self._parameters), param_guard(self._buffers):
             outputs = self.forward(*inputs, **kwargs)
 
         for forward_post_hook in self._forward_post_hooks.values():
@@ -534,6 +668,8 @@ class Layer(core.Layer):
             return self._parameters[name]
         elif name in self._sub_layers:
             return self._sub_layers[name]
+        elif name in self._buffers:
+            return self._buffers[name]
         else:
             return object.__getattribute__(self, name)
 
@@ -556,7 +692,7 @@ class Layer(core.Layer):
 
                 value.set_value(self._loaddict_holder[value.name])
 
-            _remove_if_exist(self.__dict__, self._sub_layers)
+            _remove_if_exist(self.__dict__, self._buffers, self._sub_layers)
             params[name] = value
         elif params is not None and name in params:
             if value is not None:
@@ -572,7 +708,7 @@ class Layer(core.Layer):
                         "super(YourLayer, self).__init__() should be called first"
                     )
 
-                _remove_if_exist(self.__dict__, self._parameters)
+                _remove_if_exist(self.__dict__, self._parameters, self._buffers)
                 layers[name] = value
             elif layers is not None and name in layers:
                 if value is not None:
@@ -581,29 +717,89 @@ class Layer(core.Layer):
                         .format(name, type(value).__name__))
                 layers[name] = None
             else:
-                object.__setattr__(self, name, value)
+                _buffers = self.__dict__.get('_buffers', None)
+                if type(value) == core.VarBase:
+                    if _buffers is None:
+                        raise ValueError(
+                            "super(YourLayer, self).__init__() should be called first"
+                        )
+                    _remove_if_exist(self.__dict__, self._parameters,
+                                     self._sub_layers)
+                    # Set persistable=False by default. Only `register_buffer` can
+                    # add a persistable buffer.
+                    if name not in self._buffers:
+                        self._non_persistable_buffer_names_set.add(name)
+                    _buffers[name] = value
+                elif _buffers is not None and name in _buffers:
+                    if value is not None:
+                        raise TypeError(
+                            "assignment to buffers '{}' should be of type core.VarBase or None, but got '{}'"
+                            .format(name, type(value).__name__))
+                    # Assigning None will remove the buffer, but if re-assign a new varBase to it,
+                    # it will be remarked as a buffer with same `persistable` attribute.
+                    _buffers[name] = None
+                else:
+                    object.__setattr__(self, name, value)
 
     def __delattr__(self, name):
         if name in self._parameters:
             del self._parameters[name]
         elif name in self._sub_layers:
             del self._sub_layers[name]
+        elif name in self._buffers:
+            del self._buffers[name]
+            self._non_persistable_buffer_names_set.discard(name)
         else:
             object.__delattr__(self, name)
 
+    def __dir__(self):
+        """
+        Return a list. Get all parameters, buffers(non-parameter variables), sublayers, method and attr of Layer.
+
+        Examples:
+            import paddle.fluid as fluid
+            import numpy as np
+
+            fluid.dygraph.enable_dygraph()
+
+            class Mylayer(fluid.dygraph.Layer):
+                def __init__(self):
+                    super(Mylayer, self).__init__()
+                    self.linear1 = fluid.dygraph.Linear(10, 10)
+                    self.linear2 = fluid.dygraph.Linear(5, 5)
+                    self.conv2d = fluid.dygraph.Conv2D(3, 2, 3)
+                    self.embedding = fluid.dygraph.Embedding(size=[128, 16])
+                    self.h_0 = fluid.dygraph.to_variable(np.zeros([10, 10]).astype('float32'))
+
+            mylayer = Mylayer()
+            print(dir(mylayer))
+            # only parts are shown, because of list have too much content
+            # ['__call__', '__class__',  ... , 'conv2d', 'embedding', 'h_0', 'linear1', 'linear2', ... , 'sublayers', 'train']
+
+        """
+        method = dir(self.__class__)
+        attrs = list(self.__dict__.keys())
+        parameters = list(self._parameters.keys())
+        sublayers = list(self._sub_layers.keys())
+        buffers = list(self._buffers.keys())
+
+        keys = method + attrs + parameters + sublayers + buffers
+
+        return keys
+
     def state_dict(self,
                    destination=None,
                    include_sublayers=True,
                    structured_name_prefix=""):
         '''
-        Get all parameters of current layer and its sub-layers. And set all the parameters into a dict
+        Get all parameters and persistable buffers of current layer and its sub-layers. And set them into a dict
 
         Parameters:
-            destination(dict, optional) : If provide, all the parameters will set to this dict . Default: None
-            include_sublayers(bool, optional) : If true, also include the parameters from sublayers. Default: True
+            destination(dict, optional) : If provide, all the parameters and persistable buffers will be set to this dict . Default: None
+            include_sublayers(bool, optional) : If true, also include the parameters and persistable buffers from sublayers. Default: True
 
         Retruns:
-            dict: a dict contains all the parameters
+            dict: a dict contains all the parameters and persistable buffers.
 
         Examples:
             .. code-block:: python
@@ -622,6 +818,9 @@ class Layer(core.Layer):
         for name, data in self._parameters.items():
             if data is not None:
                 destination[structured_name_prefix + name] = data
+        for name, buffer in self._buffers.items():
+            if buffer is not None and name not in self._non_persistable_buffer_names_set:
+                destination[structured_name_prefix + name] = buffer
 
         if include_sublayers:
             for layer_name, layer_item in self._sub_layers.items():
@@ -639,12 +838,12 @@ class Layer(core.Layer):
                  include_sublayers=True,
                  use_structured_name=True):
         '''
-        Set parameters from stat_dict. All the parameters will be reset by the tensor in the stat_dict
+        Set parameters and persistable buffers from stat_dict. All the parameters and buffers will be reset by the tensor in the stat_dict
 
         Parameters:
-            state_dict(dict) : Dict contains all the parameters
-            include_sublayers(bool, optional) : If true, also include the parameters from sublayers. Default: True
-            use_structured_name(bool, optional) : If true, use structured name as key, otherwise, use parameter name as key. 
+            state_dict(dict) : Dict contains all the parameters and persistable buffers.
+            include_sublayers(bool, optional) : If true, also include the parameters and peresistable buffers from sublayers. Default: True
+            use_structured_name(bool, optional) : If true, use structured name as key, otherwise, use parameter or buffer name as key. 
                                                   Default: True
         Returns:
             None
@@ -674,14 +873,14 @@ class Layer(core.Layer):
                   include_sublayers=True,
                   use_structured_name=True):
         '''
-        Set parameters from stat_dict. All the parameters will be reset by the tensor in the stat_dict
+        Set parameters and persistable buffers from stat_dict. All the parameters and persistabl buffers will be reset by the tensor in the stat_dict
 
         This api will be Deprecated. Please use set_dict
 
         Parameters:
-            state_dict(dict) : Dict contains all the parameters
-            include_sublayers(bool, optional) : If true, also include the parameters from sublayers. Default: True
-            use_structured_name(bool, optional) : If true, use structured name as key, otherwise, use parameter name as key.
+            state_dict(dict) : Dict contains all the parameters and persistable buffers.
+            include_sublayers(bool, optional) : If true, also include the parameters and persistable buffers from sublayers. Default: True
+            use_structured_name(bool, optional) : If true, use structured name as key, otherwise, use parameter or buffer name as key.
                                                   Default: True
         Returns:
             None
@@ -704,13 +903,13 @@ class Layer(core.Layer):
 
         inner_state_dict = self.state_dict()
 
-        for name, para in inner_state_dict.items():
-            key_name = name if use_structured_name else para.name
+        for name, param_or_buffer in inner_state_dict.items():
+            key_name = name if use_structured_name else param_or_buffer.name
             if key_name in stat_dict:
-                para.set_value(stat_dict[key_name])
+                param_or_buffer.set_value(stat_dict[key_name])
             else:
                 raise RuntimeError(
-                    "Parameter not found, Can't not find [ {} ] in stat_dict"
+                    "Parameter or persistable buffer not found, Can't find [ {} ] in stat_dict"
                     "use_structured_name is set to [{}]".format(
                         key_name, use_structured_name))
         unused_para_list = []
@@ -719,5 +918,5 @@ class Layer(core.Layer):
                 unused_para_list.append(k)
         if len(unused_para_list) > 0:
             warnings.warn(
-                "Varibale [ {} ] are not used, because not included in layers state_dict".
+                "Variables [ {} ] are not used, because not included in layers state_dict".
                 format(" ".join(unused_para_list)))
diff --git a/python/paddle/fluid/dygraph/learning_rate_scheduler.py b/python/paddle/fluid/dygraph/learning_rate_scheduler.py
index f7ca63d22036a7f350f99a4e966de203c40b813c..2fcd0fe1e5a6d0d51bcc31b4f18f778c4a50e249 100644
--- a/python/paddle/fluid/dygraph/learning_rate_scheduler.py
+++ b/python/paddle/fluid/dygraph/learning_rate_scheduler.py
@@ -15,6 +15,7 @@
 from __future__ import print_function
 
 import math
+import warnings
 
 from .. import unique_name
 from ..framework import Variable
@@ -23,7 +24,7 @@ from ..data_feeder import check_type
 __all__ = [
     'NoamDecay', 'PiecewiseDecay', 'NaturalExpDecay', 'ExponentialDecay',
     'InverseTimeDecay', 'PolynomialDecay', 'CosineDecay', 'LinearLrWarmup',
-    'ReduceLROnPlateau'
+    'ReduceLROnPlateau', 'StepDecay', 'MultiStepDecay', 'LambdaDecay'
 ]
 
 
@@ -66,6 +67,51 @@ class LearningRateDecay(object):
             persistable=False)
         return lr
 
+    def state_dict(self):
+        """
+        Returns the state of the scheduler as a :class:`dict`.
+
+        It is a subset of self.__dict__ .
+        """
+        self._state_keys()
+        state_dict = {}
+        for key in self.keys:
+            if key not in self.__dict__:
+                continue
+            value = self.__dict__[key]
+            if isinstance(value, Variable):
+                assert value.shape == [
+                    1
+                ], "shape of Variable in state_dict must be [1] {}".format(
+                    value.shape)
+                value = value.numpy()[0]
+            state_dict[key] = value
+
+        return state_dict
+
+    def _state_keys(self):
+        """
+        set the keys in self.__dict__ that are needed to be saved.
+        """
+        self.keys = ['step_num']
+
+    def set_dict(self, state_dict):
+        """
+        Loads the schedulers state.
+        """
+        self._state_keys()
+        for key in self.keys:
+            if key in state_dict:
+                self.__dict__[key] = state_dict[key]
+            else:
+                raise RuntimeError(
+                    "Please check whether state_dict is correct for optimizer. Can't find [ {} ] in state_dict".
+                    format(key))
+        if len(state_dict) > len(self.keys):
+            warnings.warn(
+                "There are some unused values in state_dict. Maybe the optimizer have different 'LearningRateDecay' when invoking state_dict and set_dict"
+            )
+
     def step(self):
         raise NotImplementedError()
 
@@ -402,7 +448,7 @@ class PolynomialDecay(LearningRateDecay):
         learning_rate(Variable|float): The initial learning rate. If the type 
             is Variable, it's a tensor with shape [1], the data type can be  
             float32 or float64. It also can be set to python int number.
-        decay_steps(int32): The decay step size. It determines the decay cycle.
+        decay_steps(int): The decay step size. It determines the decay cycle.
         end_learning_rate(float, optional): The minimum final learning rate. The default value is 0.0001.
         power(float, optional): Power of polynomial. The default value is 1.0.
         cycle(bool, optional): If set true, decay the learning rate every decay_steps. The default value is False.
@@ -595,6 +641,8 @@ class NoamDecay(LearningRateDecay):
 
 class LinearLrWarmup(LearningRateDecay):
     """
+    :api_attr: imperative
+
     This operator use the linear learning rate warm up strategy to adjust the learning rate preliminarily before the normal learning rate scheduling.
     For more information, please refer to `Bag of Tricks for Image Classification with Convolutional Neural Networks <https://arxiv.org/abs/1812.01187>`_
     
@@ -664,6 +712,7 @@ class LinearLrWarmup(LearningRateDecay):
                 format(learning_rate))
         self.learning_rate = learning_rate
         self.warmup_steps = warmup_steps
+        self.start_lr = start_lr
         assert end_lr > start_lr, "end_lr {} must be greater than start_lr {}".format(
             end_lr, start_lr)
         self.lr_ratio_before_warmup = (
@@ -676,13 +725,15 @@ class LinearLrWarmup(LearningRateDecay):
 
         from .. import layers
         if self.step_num < self.warmup_steps:
-            return self.lr_ratio_before_warmup * self.step_num
+            return self.lr_ratio_before_warmup * self.step_num + self.start_lr
         else:
             return base_lr
 
 
 class ReduceLROnPlateau(LearningRateDecay):
     """
+    :api_attr: imperative
+
     Reduce learning rate when ``loss`` has stopped descending. Models often benefit from reducing the learning rate 
     by 2 to 10 times once model performance has no longer improvement.
 
@@ -780,18 +831,19 @@ class ReduceLROnPlateau(LearningRateDecay):
             raise ValueError(
                 'new_lr = origin_lr * decay_rate and decay_rate should be < 1.0.'
             )
-        self.decay_rate = decay_rate
+        self.decay_rate = self.create_lr_var(decay_rate)
 
         threshold_mode = threshold_mode.lower()
         if threshold_mode not in ['rel', 'abs']:
             raise ValueError('threshold mode ' + threshold_mode +
                              ' is unknown!')
         self.threshold_mode = threshold_mode
-
         check_type(learning_rate, 'learning_rate', (float, int, Variable),
                    'ReduceLROnPlateau')
-        if isinstance(learning_rate, (float, int)):
-            learning_rate = self.create_lr_var(learning_rate)
+        if not isinstance(learning_rate, (float, int, Variable)):
+            raise TypeError(
+                "The type of 'learning_rate' in 'ReduceLROnPlateau' must be 'float, int, Variable', but received %s."
+                % type(learning_rate))
 
         self.learning_rate = learning_rate
         self.verbose = verbose
@@ -805,9 +857,17 @@ class ReduceLROnPlateau(LearningRateDecay):
         self.cooldown_counter = 0
         self.best_loss = None
         self.num_bad_epochs = 0
-        self.epoch = 0
+        self.epoch_num = 0
+
+    def _state_keys(self):
+        self.keys = [
+            'cooldown_counter', 'best_loss', 'num_bad_epochs', 'epoch_num',
+            'learning_rate'
+        ]
 
     def __call__(self):
+        if not isinstance(self.learning_rate, Variable):
+            self.learning_rate = self.create_lr_var(self.learning_rate)
         return self.learning_rate
 
     def step(self, loss):
@@ -833,7 +893,7 @@ class ReduceLROnPlateau(LearningRateDecay):
             "should be (1L,), but the current loss.shape is {}. Maybe that "  \
             "you should call fluid.layers.mean to process it first.".format(loss.shape)
 
-        self.epoch += 1
+        self.epoch_num += 1
         if self.cooldown_counter > 0:
             self.cooldown_counter -= 1
         else:
@@ -851,10 +911,11 @@ class ReduceLROnPlateau(LearningRateDecay):
                                                 self.decay_rate, self.min_lr)
                 if self.learning_rate - new_lr > self.eps:
                     if self.verbose:
+                        old_lr = self.learning_rate.numpy()[0] if isinstance(
+                            self.learning_rate,
+                            Variable) else self.learning_rate
                         print('Epoch {}: reducing learning rate from {} to {}.'.
-                              format(self.epoch,
-                                     self.learning_rate.numpy()[0],
-                                     new_lr.numpy()[0]))
+                              format(self.epoch_num, old_lr, new_lr.numpy()[0]))
                     self.learning_rate = new_lr
 
     def _is_better(self, current, best):
@@ -869,3 +930,288 @@ class ReduceLROnPlateau(LearningRateDecay):
 
         else:
             return current > best + self.threshold
+
+
+class _LearningRateEpochDecay(LearningRateDecay):
+    """
+    :api_attr: imperative
+
+    Base class of learning rate decay, which is updated each epoch.
+    
+    Define the common interface of an _LearningRateEpochDecay.
+    User should not use this class directly,
+    but need to use one of it's implementation. And invoke method: `epoch()` each epoch.
+    """
+
+    def __init__(self, learning_rate, dtype=None):
+        if not isinstance(learning_rate, (float, int)):
+            raise TypeError(
+                "The type of 'learning_rate' must be 'float, int', but received %s."
+                % type(learning_rate))
+        if learning_rate < 0:
+            raise ValueError("Invalid learning rate: {}".format(learning_rate))
+
+        self.base_lr = float(learning_rate)
+
+        self.epoch_num = -1
+        self.dtype = dtype
+        if dtype is None:
+            self.dtype = "float32"
+        self.learning_rate = self.create_lr_var(self.base_lr)
+
+        self.epoch()
+
+    def _state_keys(self):
+        self.keys = ['epoch_num', 'learning_rate']
+
+    def __call__(self):
+        """ 
+        Return last computed learning rate on current epoch.
+        """
+        if not isinstance(self.learning_rate, Variable):
+            self.learning_rate = self.create_lr_var(self.learning_rate)
+        return self.learning_rate
+
+    def epoch(self, epoch=None):
+        """
+        compueted learning_rate and update it when invoked.
+        """
+        if epoch is None:
+            self.epoch_num += 1
+        else:
+            self.epoch_num = epoch
+
+        self.learning_rate = self.get_lr()
+
+    def get_lr(self):
+        raise NotImplementedError
+
+
+class StepDecay(_LearningRateEpochDecay):
+    """
+    :api_attr: imperative
+
+    Decays the learning rate of ``optimizer`` by ``decay_rate`` every ``step_size`` number of epoch.
+
+    The algorithm can be described as the code below. 
+
+    .. code-block:: text
+
+        learning_rate = 0.5
+        step_size = 30
+        decay_rate = 0.1
+
+        learning_rate = 0.5     if epoch < 30
+        learning_rate = 0.05    if 30 <= epoch < 60
+        learning_rate = 0.005   if 60 <= epoch < 90
+        ...
+
+    Parameters:
+        learning_rate (float|int): The initial learning rate. It can be set to python float or int number.
+        step_size (int): Period of learning rate decay.
+        decay_rate (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * decay_rate`` . 
+            It should be less than 1.0. Default: 0.1.
+
+    Returns:
+        None.
+
+    Examples:
+        .. code-block:: python
+            
+            import paddle.fluid as fluid
+            import numpy as np
+            with fluid.dygraph.guard():
+                x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
+                linear = fluid.dygraph.Linear(10, 10)
+                input = fluid.dygraph.to_variable(x)
+                scheduler = fluid.dygraph.StepDecay(0.5, step_size=3)
+                adam = fluid.optimizer.Adam(learning_rate = scheduler, parameter_list = linear.parameters())
+
+                for epoch in range(9):
+                    for batch_id in range(5):
+                        out = linear(input)
+                        loss = fluid.layers.reduce_mean(out)
+                        adam.minimize(loss)  
+                    scheduler.epoch()
+
+                    print("epoch:{}, current lr is {}" .format(epoch, adam.current_step_lr()))
+                    # epoch:0, current lr is 0.5
+                    # epoch:1, current lr is 0.5
+                    # epoch:2, current lr is 0.5
+                    # epoch:3, current lr is 0.05
+                    # epoch:4, current lr is 0.05
+                    # epoch:5, current lr is 0.05
+                    # epoch:6, current lr is 0.005
+                    # epoch:7, current lr is 0.005
+                    # epoch:8, current lr is 0.005
+
+    """
+
+    def __init__(self, learning_rate, step_size, decay_rate=0.1):
+        if not isinstance(step_size, int):
+            raise TypeError(
+                "The type of 'step_size' must be 'int', but received %s." %
+                type(step_size))
+        if decay_rate >= 1.0:
+            raise ValueError('decay_rate should be < 1.0.')
+
+        self.step_size = step_size
+        self.decay_rate = decay_rate
+        super(StepDecay, self).__init__(learning_rate)
+
+    def get_lr(self):
+        decay_rate = self.create_lr_var(self.decay_rate)
+        i = self.epoch_num // self.step_size
+        return self.base_lr * (decay_rate**i)
+
+
+class MultiStepDecay(_LearningRateEpochDecay):
+    """
+    :api_attr: imperative
+
+    Decays the learning rate of ``optimizer`` by ``decay_rate`` once ``epoch`` reaches one of the milestones.
+
+    The algorithm can be described as the code below. 
+
+    .. code-block:: text
+
+        learning_rate = 0.5
+        milestones = [30, 50]
+        decay_rate = 0.1
+        if epoch < 30:
+            learning_rate = 0.5
+        elif epoch < 50:
+            learning_rate = 0.05
+        else:
+            learning_rate = 0.005
+
+    Parameters:
+        learning_rate (float|int): The initial learning rate. It can be set to python float or int number.
+        milestones (tuple|list): List or tuple of each boundaries. Must be increasing.
+        decay_rate (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * decay_rate`` . 
+            It should be less than 1.0. Default: 0.1.
+
+    Returns:
+        None.
+
+    Examples:
+        .. code-block:: python
+            
+            import paddle.fluid as fluid
+            import numpy as np
+            with fluid.dygraph.guard():
+                x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
+                linear = fluid.dygraph.Linear(10, 10)
+                input = fluid.dygraph.to_variable(x)
+                scheduler = fluid.dygraph.MultiStepDecay(0.5, milestones=[3, 5])
+                adam = fluid.optimizer.Adam(learning_rate = scheduler, parameter_list = linear.parameters())
+
+                for epoch in range(6):
+                    for batch_id in range(5):
+                        out = linear(input)
+                        loss = fluid.layers.reduce_mean(out)
+                        adam.minimize(loss)
+                    scheduler.epoch()
+
+                    print("epoch:{}, current lr is {}" .format(epoch, adam.current_step_lr()))
+                    # epoch:0, current lr is 0.5
+                    # epoch:1, current lr is 0.5
+                    # epoch:2, current lr is 0.5
+                    # epoch:3, current lr is 0.05
+                    # epoch:4, current lr is 0.05
+                    # epoch:5, current lr is 0.005
+
+    """
+
+    def __init__(self, learning_rate, milestones, decay_rate=0.1):
+        if not isinstance(milestones, (tuple, list)):
+            raise TypeError(
+                "The type of 'milestones' in 'MultiStepDecay' must be 'tuple, list', but received %s."
+                % type(milestones))
+
+        if not all([
+                milestones[i] < milestones[i + 1]
+                for i in range(len(milestones) - 1)
+        ]):
+            raise ValueError('The elements of milestones must be incremented')
+        if decay_rate >= 1.0:
+            raise ValueError('decay_rate should be < 1.0.')
+
+        self.milestones = milestones
+        self.decay_rate = decay_rate
+        super(MultiStepDecay, self).__init__(learning_rate)
+
+    def get_lr(self):
+        decay_rate = self.create_lr_var(self.decay_rate)
+        for i in range(len(self.milestones)):
+            if self.epoch_num < self.milestones[i]:
+                return self.base_lr * (decay_rate**i)
+
+        return self.base_lr * (decay_rate**len(self.milestones))
+
+
+class LambdaDecay(_LearningRateEpochDecay):
+    """
+    :api_attr: imperative
+
+    Sets the learning rate of ``optimizer`` to the initial lr times a multiplicative factor, and this multiplicative
+    factor is computed by function ``lr_lambda`` . ``lr_lambda`` is funciton which receives ``epoch`` .
+
+    The algorithm can be described as the code below. 
+
+    .. code-block:: text
+
+        learning_rate = 0.5        # init learning_rate
+        lr_lambda = lambda epoch: 0.95 ** epoch
+
+        learning_rate = 0.5        # epoch 0
+        learning_rate = 0.475      # epoch 1
+        learning_rate = 0.45125    # epoch 2
+
+    Parameters:
+        learning_rate (float|int): The initial learning rate. It can be set to python float or int number.
+        lr_lambda (function): A function which computes a multiplicative factor given an integer parameter ``epoch`` , and 
+            then multiply the initial learning rate by this multiplicative factor.
+    
+    Returns:
+        None.
+
+    Examples:
+        .. code-block:: python
+            
+            import paddle.fluid as fluid
+            import numpy as np
+            with fluid.dygraph.guard():
+                x = np.random.uniform(-1, 1, [10, 10]).astype("float32")
+                linear = fluid.dygraph.Linear(10, 10)
+                input = fluid.dygraph.to_variable(x)
+                scheduler = fluid.dygraph.LambdaDecay(0.5, lr_lambda=lambda x: 0.95**x)
+                adam = fluid.optimizer.Adam(learning_rate = scheduler, parameter_list = linear.parameters())
+
+                for epoch in range(6):
+                    for batch_id in range(5):
+                        out = linear(input)
+                        loss = fluid.layers.reduce_mean(out)
+                        adam.minimize(loss)
+                    scheduler.epoch()
+
+                    print("epoch:%d, current lr is %f" .format(epoch, adam.current_step_lr()))
+                    # epoch:0, current lr is 0.5
+                    # epoch:1, current lr is 0.475
+                    # epoch:2, current lr is 0.45125
+
+    """
+
+    def __init__(self, learning_rate, lr_lambda):
+        if not callable(lr_lambda):
+            raise TypeError(
+                "The type of 'lr_lambda' in 'LambdaDecay' must be 'function', but received %s."
+                % type(lr_lambda))
+
+        self.lr_lambda = lr_lambda
+        super(LambdaDecay, self).__init__(learning_rate)
+
+    def get_lr(self):
+        base_lr = self.create_lr_var(self.base_lr)
+
+        return self.base_lr * self.lr_lambda(self.epoch_num)
diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py
index c4ea1fabfcba96fd41a38abe290aa8af6379f677..cc2b746b0c1e9a00c21ebe6762ba4da38d20c511 100644
--- a/python/paddle/fluid/dygraph/nn.py
+++ b/python/paddle/fluid/dygraph/nn.py
@@ -1028,16 +1028,16 @@ class InstanceNorm(layers.Layer):
         num_channels(int): Indicate the number of channels of the input ``Tensor``.
         epsilon(float, optional): A value added to the denominator for
             numerical stability. Default is 1e-5.
-        param_attr(ParamAttr, optional): The parameter attribute for Parameter `scale`
+        param_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale`
              of instance_norm. If it is set to None or one attribute of ParamAttr, instance_norm
 	     will create ParamAttr as param_attr, the name of scale can be set in ParamAttr.
 	     If the Initializer of the param_attr is not set, the parameter is initialized 
-	     one. Default: None.
-        bias_attr(ParamAttr, optional): The parameter attribute for the bias of instance_norm.
+	     one. If it is set to False, will not create param_attr. Default: None.
+        bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of instance_norm.
              If it is set to None or one attribute of ParamAttr, instance_norm
 	     will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr. 
 	     If the Initializer of the bias_attr is not set, the bias is initialized zero. 
-	     Default: None.
+             If it is set to False, will not create bias_attr. Default: None.
         dtype(str, optional): Indicate the data type of the input ``Tensor``,
              which can be float32 or float64. Default: float32.
 
@@ -1071,25 +1071,30 @@ class InstanceNorm(layers.Layer):
                  bias_attr=None,
                  dtype='float32'):
         super(InstanceNorm, self).__init__()
-        assert bias_attr is not False, "bias_attr should not be False in InstanceNorm."
 
+        if param_attr == False or bias_attr == False:
+            assert bias_attr == param_attr, "param_attr and bias_attr must be set to Fasle at the same time in InstanceNorm"
         self._epsilon = epsilon
         self._param_attr = param_attr
         self._bias_attr = bias_attr
         self._dtype = dtype
 
-        self.scale = self.create_parameter(
-            attr=self._param_attr,
-            shape=[num_channels],
-            dtype=self._dtype,
-            default_initializer=Constant(1.0),
-            is_bias=False)
-        self.bias = self.create_parameter(
-            attr=self._bias_attr,
-            shape=[num_channels],
-            dtype=self._dtype,
-            default_initializer=Constant(0.0),
-            is_bias=True)
+        if param_attr != False and bias_attr != False:
+            self.scale = self.create_parameter(
+                attr=self._param_attr,
+                shape=[num_channels],
+                dtype=self._dtype,
+                default_initializer=Constant(1.0),
+                is_bias=False)
+            self.bias = self.create_parameter(
+                attr=self._bias_attr,
+                shape=[num_channels],
+                dtype=self._dtype,
+                default_initializer=Constant(0.0),
+                is_bias=True)
+        else:
+            self.scale = None
+            self.bias = None
 
     def forward(self, input):
         if in_dygraph_mode():
@@ -1102,7 +1107,10 @@ class InstanceNorm(layers.Layer):
 
         attrs = {"epsilon": self._epsilon}
 
-        inputs = {"X": [input], "Scale": [self.scale], "Bias": [self.bias]}
+        if self.scale and self.bias:
+            inputs = {"X": [input], "Scale": [self.scale], "Bias": [self.bias]}
+        else:
+            inputs = {"X": [input]}
 
         saved_mean = self._helper.create_variable_for_type_inference(
             dtype=self._dtype, stop_gradient=True)
diff --git a/python/paddle/fluid/dygraph/static_runner.py b/python/paddle/fluid/dygraph/static_runner.py
index 98960d3707fc0b00d47f3dcfda0231ac56c68706..d482077cd4f2aa5bf1cc30e4c71eac6e9bb7752f 100644
--- a/python/paddle/fluid/dygraph/static_runner.py
+++ b/python/paddle/fluid/dygraph/static_runner.py
@@ -14,550 +14,26 @@
 
 from __future__ import print_function
 
-import logging
-import numpy as np
-import os
-import six
+from paddle.fluid.dygraph.jit import SaveLoadConfig
+from paddle.fluid.dygraph.io import TranslatedLayer
 
-from . import layers
-from .. import core
-from .. import framework
-from .. import backward
 
-from ..layers import nn
-from .base import switch_to_static_graph
-from ... import compat as cpt
-
-# DESIGN IDEA: Add an special operator, execute static program inside operator.
-#
-# Op's Inputs:
-#   - the input variable of the user feed
-#   - the necessary parameters of the network
-# Op's Outputs:
-#   - the output variable of fetch
-# 
-# This op receives a complete program desc, internally creates scope
-# and executor, executes this program. Key points:
-#
-# 1. Data Sharing: 
-#   The varBase of the dynamic graph is not in the scope, so before the op
-#   executes the program internally, create persistent variables with the
-#   same name as feed, parameters, and fetch in the scope, and share the
-#   LoDTensor of the op input.
-# 
-# 2. Forward and Backward Separation:
-#   Because the dynamic graph op performs the forward and backward separately,
-#   the forward program is used as the execution object of the forward op,
-#   and the reverse program is used as the execution object of the grad op.
-
-
-class StaticModelRunner(layers.Layer):
+# NOTE: This class will be deprecated later.
+# It is kept here because PaddleHub is already using this API.
+class StaticModelRunner(object):
     """
     A Dynamic graph Layer for loading inference program and related parameters,
     and then performing fine-tune training or inference.
 
-    The loaded program and parameters are saved by `fluid.io.save_inference_model`.
-
     .. note::
-        **1. Dynamic graph mode do not support LoDTensor. 
-             All original static graph model's feed targets or parametars 
-             that depend on LoD are temporarily unavailable.**
-        **2. All saved inference model's feed targets need be given.**
-        **3. The ``stop_gradient`` information is lost and can not be recovered.**
-        **4. The parameter's ``trainable`` information is lost and can not be recovered.**
-        **5. Double gradient model is not supported now.**
-        **6. Now only supports loading models saved by `fluid.io.save_inference_model`.**
-
-    Args:
-        model_dir(str): The directory path where the model is saved.
-        model_filename(str, optional): The file name of saved inference program. 
-                                       If set to None, a default filename is
-                                       :code:`__model__`.
-                                       The default value is None.
-        params_filename(str, optional): The file name of saved all related parameters.
-                                        If set to None, parameters are saved
-                                        in separate files. 
-                                        The default value is None.
-
-    Returns:
-        Layer: A Layer object can run loaded program.
-
-    Examples:
-      .. code-block:: python
-
-        import numpy as np
-        import paddle.fluid as fluid
-
-        BATCH_SIZE = 32
-        BATCH_NUM = 20
-        SAVE_DIRNAME = "fc.inference.model"
-
-        def random_batch_reader():
-            def _get_random_images_and_labels(image_shape, label_shape):
-                image = np.random.random(size=image_shape).astype('float32')
-                label = np.random.random(size=label_shape).astype('int64')
-                return image, label
-
-            def __reader__():
-                for _ in range(BATCH_NUM):
-                    batch_image, batch_label = _get_random_images_and_labels(
-                        [BATCH_SIZE, 784], [BATCH_SIZE, 1])
-                    yield batch_image, batch_label
-
-            return __reader__
-
-        def train_and_save_static_model(place):
-            img = fluid.data(name='img', shape=[None, 784], dtype='float32')
-            label = fluid.data(name='label', shape=[None, 1], dtype='int64')
-
-            pred = fluid.layers.fc(input=img, size=10, act='softmax')
-
-            loss = fluid.layers.cross_entropy(input=pred, label=label)
-            avg_loss = fluid.layers.mean(loss)
-
-            optimizer = fluid.optimizer.SGD(learning_rate=0.001)
-            optimizer.minimize(avg_loss)
-
-            exe = fluid.Executor(place)
-            exe.run(fluid.default_startup_program())
-
-            loader = fluid.io.DataLoader.from_generator(
-                feed_list=[img, label], capacity=5, iterable=True)
-            loader.set_batch_generator(random_batch_reader(), places=place)
-
-            for data in loader():
-                exe.run(
-                    fluid.default_main_program(),
-                    feed=data, 
-                    fetch_list=[avg_loss])
-
-            # save model by fluid.io.save_inference_model
-            fluid.io.save_inference_model(
-                SAVE_DIRNAME, ["img"], [pred], exe)
-
-
-        # Step 1. train and save inference model in static graph mode
-        place = fluid.CPUPlace()
-        train_and_save_static_model(place)
-
-        # Step 2. load inference model in dygraph and fine-tune
-        with fluid.dygraph.guard(place):
-            fc = fluid.dygraph.static_runner.StaticModelRunner(SAVE_DIRNAME)
-
-            sgd = fluid.optimizer.SGD(learning_rate=0.001,
-                                    parameter_list=fc.parameters())
-
-            train_loader = fluid.io.DataLoader.from_generator(capacity=5)
-            train_loader.set_batch_generator(
-                random_batch_reader(), places=place)
-
-            for data in train_loader():
-                img = data[0]
-                label = data[1]
-                label.stop_gradient = True
-
-                cost = fc(inputs=img)
-
-                loss = fluid.layers.cross_entropy(cost, label)
-                avg_loss = fluid.layers.mean(loss)
-
-                avg_loss.backward()
-                sgd.minimize(avg_loss)
+        This is a temporary API, which will be deprecated later, please use 
+        `fluid.dygraph.jit.load` to achieve the same function.
     """
 
-    def __init__(self, model_dir, model_filename=None, params_filename=None):
-        super(StaticModelRunner, self).__init__()
-
-        # Step 0. key variable definitions
-        # loaded inference program desc
-        self._infer_program_desc = None
-        # recovered train program desc
-        self._train_program_desc = None
-        # StaticModelRunner executed program desc,
-        # switch infer or train by train() and eval()
-        self._trace_program_desc = None
-        self._inner_scope = core.Scope()
-        # the layer outputs var desc
-        self._output_descs = []
-        # input, output, params name list
-        self._input_names = []
-        self._output_names = []
-        self._param_names = []
-        # train or eval flag
-        self._is_test = False
-
-        # Step 1. load program desc from disk
-        # the saved model hold feed, fetch & scale op, no need, can be remove
-        self._infer_program_desc = self._load_static_model(model_dir,
-                                                           model_filename)
-
-        # Step 2. load all parameters
-        self._load_persisitable_dict(model_dir, params_filename)
-
-        # Step 3. generate backwar program desc
-        self._train_program_desc = self._append_backward_desc()
-
-        # Step 4. recheck parameters stop gradients
-        self._recheck_stop_gradients()
-
-        # Step 5. set default mode to train
-        self.train()
-
-    def train(self):
-        self._is_test = False
-        self._trace_program_desc = self._train_program_desc
-
-    def eval(self):
-        self._is_test = True
-        self._trace_program_desc = self._infer_program_desc
-
-    def forward(self, *args):
-        """
-        Executed forward part of StaticModelRunner Layer.
-        Generally execute directly using the Layer object.
-
-        Args:
-            args(tuple(np.ndarray|Variable)): the inputs of StaticModelRunner.
-                The order of input variables needs to be the same as the order 
-                of feed variables when using `save_inference_model` to save model.
-        
-        Returns:
-            Variable|list[Variable]: The forward outputs of StaticModelRunner Layer.
-                If there is only one output, return Variable;
-                if there are multiple outputs, return list[Variable].
-        """
-        # Step 1. prepare inputs, outputs, attrs
-        input_vars = []
-        for i, value in enumerate(args):
-            if not isinstance(value, (np.ndarray, core.VarBase)):
-                raise TypeError(
-                    "The type of inputs.value in StaticModelRunner.forward must be numpy array or Variable(VarBase), but received %s."
-                    % type(value))
-            # NOTE: In order to unify the API, firstly convert the input to VarBase
-            if isinstance(value, np.ndarray):
-                var = core.VarBase(
-                    value=value,
-                    name=self._input_names[i],
-                    persistable=False,
-                    place=framework._current_expected_place(),
-                    zero_copy=True)
-            else:
-                var = value
-                # TODO: here may have important name set by user
-                var.name = self._input_names[i]
-            input_vars.append(var)
-
-        params = []
-        for param in self._parameters.values():
-            params.append(param)
-
-        output_vars = []
-        for var_desc in self._output_descs:
-            var = core.VarBase(var_desc.dtype(),
-                               var_desc.shape(),
-                               var_desc.name(), var_desc.type(), False)
-            output_vars.append(var)
-
-        # hold forward variables
-        tmp_scope_vec = core.VarBase(core.VarDesc.VarType.FP32, [],
-                                     "program_out_scope",
-                                     core.VarDesc.VarType.STEP_SCOPES, True)
-        tmp_scope_vec.value().set_scope(self._inner_scope)
-
-        # Step 2. run prorgam by op
-        framework._dygraph_tracer().trace_op(
-            type='run_program',
-            inputs={'X': input_vars,
-                    'Params': params},
-            outputs={'Out': output_vars,
-                     'OutScope': tmp_scope_vec},
-            attrs={
-                'global_block': self._trace_program_desc.block(0),
-                'start_op_index': 0,
-                'end_op_index': self._infer_program_desc.block(0).op_size(),
-                'is_test': self._is_test
-            })
-
-        # NOTE: [ why need set param's gradient type here ]
-        # if user set sparse gradient mode, the param's gradient
-        # will be SelectedRows, not LoDTensor. But tracer will just
-        # set param grad VarBase by forward VarBase(LoDTensor)
-        # If we don't change grad_var type here, RunProgramOp need
-        # transform SelectedRows to LoDTensor forcely, it may not
-        # be user wanted result.
-        for param in params:
-            grad_name = param.name + core.grad_var_suffix()
-            grad_var = self._trace_program_desc.block(0).find_var(
-                cpt.to_bytes(grad_name))
-            # NOTE: cannot find var desc maybe no problem, such as in batch_norm
-            if grad_var is None:
-                continue
-            param._set_grad_type(grad_var.type())
-
-        # Step 3. prepare output, keep same form with inputs
-        outs = output_vars
-        if len(output_vars) == 1:
-            outs = output_vars[0]
-        return outs
-
-    def _load_static_model(self, model_dir, model_filename=None):
-        # Step 1. dir and filename check
-        load_dirname = os.path.normpath(model_dir)
-        if not os.path.isdir(load_dirname):
-            raise ValueError("There is no directory named '%s'" % load_dirname)
-
+    def __new__(cls, model_dir, model_filename=None, params_filename=None):
+        configs = SaveLoadConfig()
         if model_filename is not None:
-            model_filename = os.path.basename(model_filename)
-        else:
-            model_filename = "__model__"
-        model_filename = os.path.join(load_dirname, model_filename)
-
-        # Step 2. parse program desc
-        with open(model_filename, "rb") as f:
-            program_desc_str = f.read()
-
-        program_desc = core.ProgramDesc(program_desc_str)
-        if not core._is_program_version_supported(program_desc._version()):
-            raise ValueError("Unsupported program version: %d\n" %
-                             program_desc._version())
-
-        # Step 3. 
-        # - remove feed, fetch and useless scale-1 op
-        # - remove op_callstack attr
-        ops_to_remove = []
-        root_block = program_desc.block(0)
-        for i in six.moves.range(root_block.op_size()):
-            op = root_block.op(i)
-            if op.type() == 'feed':
-                ops_to_remove.append(i)
-                feed_var_name = cpt.to_bytes(op.input('X')[0])
-                root_block._remove_var(feed_var_name)
-                self._input_names.append(cpt.to_bytes(op.output('Out')[0]))
-            elif op.type() == 'scale' and op.output('Out')[0].startswith(
-                    'save_infer_model/scale_'):
-                ops_to_remove.append(i)
-                out_var_name = cpt.to_bytes(op.output('Out')[0])
-                root_block._remove_var(out_var_name)
-                self._output_names.append(cpt.to_bytes(op.input('X')[0]))
-                self._output_descs.append(
-                    root_block.find_var(cpt.to_bytes(op.input('X')[0])))
-            elif op.type() == 'fetch':
-                ops_to_remove.append(i)
-                fetch_var_name = cpt.to_bytes(op.output('Out')[0])
-                root_block._remove_var(fetch_var_name)
-                # NOTE: some old pre-train models have no extra scale_op
-                if not op.input('X')[0].startswith('save_infer_model/scale_'):
-                    self._output_names.append(cpt.to_bytes(op.input('X')[0]))
-                    self._output_descs.append(
-                        root_block.find_var(cpt.to_bytes(op.input('X')[0])))
-            else:
-                if op.has_attr("op_callstack"):
-                    op.remove_attr("op_callstack")
-
-        for op_idx in reversed(ops_to_remove):
-            root_block._remove_op(op_idx, op_idx + 1)
-
-        # NOTE: reverse feed vars
-        self._input_names.reverse()
-
-        # Step 4. add scale for outputs
-        tmp_program = self._build_program_by_desc(program_desc)
-        self._append_scale_to_output(tmp_program)
-
-        return program_desc
-
-    @switch_to_static_graph
-    def _append_scale_to_output(self, program):
-        # 1. append scale & save var
-        scale_output_vars = []
-        with framework.program_guard(program):
-            for i, out in enumerate(self._output_descs):
-                var = program.global_block().var(out.name())
-                var = nn.scale(
-                    var, 1., name="static_model_runner/scale_{}".format(i))
-                scale_output_vars.append(var)
-        # 2. update output names & descs
-        for i, var in enumerate(scale_output_vars):
-            self._output_names[i] = var.name
-            self._output_descs[i] = var.desc
-
-    @switch_to_static_graph
-    def _append_backward_desc(self):
-        assert self._infer_program_desc is not None, "The StaticModelRunner not initialized properly."
-        program_desc_copy = core.ProgramDesc(self._infer_program_desc)
-
-        # Step 1. set all `is_test` attributes to False
-        self._change_is_test_status(program_desc_copy, False)
-
-        # Step 2. prepare program and related var
-        # NOTE: To reuse backward interfaces, build Program firstly.
-        # Originally, there is no need to build a program, but need to almost
-        # rewrite a series of methods for append_backward for program_desc. 
-        # Therefore, in order to reuse the method of backward.py, build the program here.
-        fwd_op_num = program_desc_copy.block(0).op_size()
-        program = self._build_program_by_desc(program_desc_copy)
-
-        # TODO: could the targets be in sub block?
-        targets = []
-        for out in self._output_descs:
-            targets.append(program.global_block().var(out.name()))
-
-        # Step 3. append backward
-        backward.gradients(targets=targets, inputs=[])
-        return program.desc
-
-    def _load_persisitable_dict(self, model_dir, params_filename=None):
-        load_dirname = os.path.normpath(model_dir)
-        assert self._infer_program_desc is not None, "The StaticModelRunner not initialized properly."
-
-        persis_vars = self._get_persis_vars(self._infer_program_desc)
-        load_var_map = {}
-        for each_var in persis_vars:
-            orig_each_name = each_var.name()
-            # append suffix
-            self._append_loaded_suffix_to_param(each_var)
-            # create output varbase
-            new_var = framework.ParamBase(
-                shape=each_var.shape(),
-                dtype=each_var.dtype(),
-                name=each_var.name(),
-                type=each_var.type(),
-                persistable=True)
-            if params_filename is None:
-                if not self._is_parameter(each_var):
-                    continue
-                framework._dygraph_tracer().trace_op(
-                    type='load',
-                    inputs={},
-                    outputs={'Out': new_var},
-                    attrs={
-                        'file_path': os.path.join(load_dirname, orig_each_name)
-                    })
-                new_var.stop_gradient = False
-                self.add_parameter(name=new_var.name, parameter=new_var)
-                self._param_names.append(new_var.name)
-            else:
-                load_var_map[each_var.name()] = new_var
-
+            configs.model_filename = model_filename
         if params_filename is not None:
-            load_var_list = []
-            for name in sorted(load_var_map.keys()):
-                load_var_list.append(load_var_map[name])
-
-            framework._dygraph_tracer().trace_op(
-                type='load_combine',
-                inputs={},
-                outputs={'Out': load_var_list},
-                attrs={
-                    'file_path': os.path.join(load_dirname, params_filename)
-                })
-
-            for each_var in persis_vars:
-                if not self._is_parameter(each_var):
-                    continue
-                param = load_var_map[each_var.name()]
-                param.stop_gradient = False
-                self.add_parameter(name=param.name, parameter=param)
-                self._param_names.append(param.name)
-
-    def _recheck_stop_gradients(self):
-        assert self._train_program_desc is not None, "The StaticModelRunner not initialized properly."
-        # NOTE: After loading the model, the stop_gradient information 
-        # of the original variable is lost, but if a parameter does not
-        # have a corresponding @GRAD variable in the backward program,
-        # it can be said that it is also stop_gradient
-        all_var_names = self._get_all_var_names(self._train_program_desc)
-        for param_name in self._parameters:
-            param_grad_name = param_name + core.grad_var_suffix()
-            if param_grad_name not in all_var_names:
-                self._parameters[param_name].stop_gradient = True
-
-    def _get_all_var_names(self, program_desc):
-        all_var_names = set()
-        for i in six.moves.range(program_desc.num_blocks()):
-            block = program_desc.block(i)
-            for var in block.all_vars():
-                all_var_names.add(var.name())
-        return all_var_names
-
-    def _get_persis_vars(self, program_desc):
-        persis_vars = []
-        for i in six.moves.range(program_desc.num_blocks()):
-            block = program_desc.block(i)
-            persis_vars.extend(
-                list(filter(self._is_persistable, block.all_vars())))
-        return persis_vars
-
-    @switch_to_static_graph
-    def _build_program_by_desc(self, program_desc):
-        prog = framework.Program()
-        prog.desc = program_desc
-        prog.blocks = [
-            framework.Block(prog, i)
-            for i in six.moves.range(prog.desc.num_blocks())
-        ]
-        prog._sync_with_cpp()
-        return prog
-
-    def _is_persistable(self, var_desc):
-        if var_desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
-                var_desc.type() == core.VarDesc.VarType.FETCH_LIST or \
-                var_desc.type() == core.VarDesc.VarType.READER or \
-                var_desc.type() == core.VarDesc.VarType.RAW:
-            return False
-        return var_desc.persistable()
-
-    def _is_parameter(self, persis_var_desc):
-        assert self._infer_program_desc is not None, "The StaticModelRunner not initialized properly."
-        # 1. firstly, param should be input of op
-        input_ops = []  # op can be repeated
-        for block_idx in six.moves.range(self._infer_program_desc.num_blocks()):
-            block = self._infer_program_desc.block(block_idx)
-            for op_idx in six.moves.range(block.op_size()):
-                op = block.op(op_idx)
-                # NOTE: parameter is the input of a certain op
-                if persis_var_desc.name() in op.input_arg_names():
-                    input_ops.append(op)
-        # 2. secondly, param should not be output of op or be same op's output
-        for block_idx in six.moves.range(self._infer_program_desc.num_blocks()):
-            block = self._infer_program_desc.block(block_idx)
-            for op_idx in six.moves.range(block.op_size()):
-                op = block.op(op_idx)
-                if persis_var_desc.name() in op.output_arg_names():
-                    # such as batch_norm_op
-                    if op in input_ops:
-                        continue
-                    else:
-                        return False
-        return True
-
-    def _change_is_test_status(self, program_desc, is_test):
-        # change all `is_test` attributes
-        for i in six.moves.range(program_desc.num_blocks()):
-            block = program_desc.block(i)
-            for j in six.moves.range(block.op_size()):
-                op = block.op(j)
-                if op.has_attr('is_test'):
-                    op._set_attr('is_test', is_test)
-
-    def _append_loaded_suffix(self, name):
-        """
-        Append grad suffix to the given variable name
-        e.g. x ==> x@LOADED
-        """
-        suffix = core.loaded_var_suffix()
-        name = cpt.to_text(name)
-        if suffix not in name:
-            name = name + suffix
-        return name
-
-    def _append_loaded_suffix_to_param(self, param_desc):
-        old_name = param_desc.name()
-        new_name = self._append_loaded_suffix(param_desc.name())
-        param_desc.set_name(new_name)
-        for block_idx in six.moves.range(self._infer_program_desc.num_blocks()):
-            block = self._infer_program_desc.block(block_idx)
-            for op_idx in six.moves.range(block.op_size()):
-                op = block.op(op_idx)
-                op._rename_input(old_name, new_name)
-                op._rename_output(old_name, new_name)
+            configs.params_filename = params_filename
+        return TranslatedLayer._construct(model_dir, configs)
diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py
index 6b528479ff579c3ae01f39a8f6f8ef3de11e4259..2e41a8ff417b3083d96d0a9bd1fa453c8fddc014 100644
--- a/python/paddle/fluid/dygraph/varbase_patch_methods.py
+++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py
@@ -12,16 +12,67 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import inspect
 from .. import framework
 from .. import core
 from . import BackwardStrategy
-from ..framework import Variable, _getitem_impl_
-from .. import unique_name
+from ..framework import Variable, Parameter, ParamBase
+from .base import switch_to_static_graph
 import numpy as np
 from .math_op_patch import monkey_patch_math_varbase
 
 
 def monkey_patch_varbase():
+    @switch_to_static_graph
+    def _to_static_var(self, to_parameter=False, **kwargs):
+        """
+        **Notes**:
+            **This API is ONLY available in Dygraph mode**
+
+        Transform a VarBase into static Variable with same attributes. It's a low level interface used
+        in dy2static and shall not be called directly.
+
+        Args:
+            to_parameter (bool): It takes effect only if the input a VarBase. If set True,
+                                 the VarBase will be converted into framework.Parameters. Otherwise, it will
+                                 be converted into framework.Variable. Default False.
+
+        Examples:
+            .. code-block:: python
+
+                import paddle.fluid as fluid
+                from paddle.fluid.dygraph.base import to_variable
+                import numpy as np
+
+                data = np.ones([3, 1024], dtype='float32')
+                with fluid.dygraph.guard():
+                    var_base = to_variable(data)
+                    static_var = var_base._to_static_var()
+
+        """
+        if isinstance(self, ParamBase):
+            attr_kwargs = self.__dict__.copy()
+        else:
+            attr_names = [
+                name for name in dir(self)
+                if not (inspect.ismethod(getattr(self, name)) or
+                        name.startswith('_'))
+            ]
+            attr_kwargs = {name: getattr(self, name) for name in attr_names}
+
+        attr_keys = ['block', 'shape', 'dtype', 'type', 'name', 'persistable']
+        for attr in attr_keys:
+            attr_kwargs[attr] = getattr(self, attr, None)
+
+        attr_kwargs.update(kwargs)
+
+        if to_parameter or isinstance(self, ParamBase):
+            del attr_kwargs['persistable']
+            static_var = Parameter(**attr_kwargs)
+        else:
+            static_var = Variable(**attr_kwargs)
+        return static_var
+
     # TODO(jiabin): move this to cplusplus end if we find some performance issue on it
     @framework.dygraph_only
     def set_value(self, value):
@@ -161,46 +212,35 @@ def monkey_patch_varbase():
             return np.array(new_ivar.value().get_tensor())
 
     def __str__(self):
-        return self.to_string(True)
-
-    @property
-    def block(self):
-        return framework.default_main_program().global_block()
-
-    def to_string(self, throw_on_error, with_details=False):
         """
-        Get debug string.
-
-        Args:
-
-            throw_on_error (bool): True if raise an exception when self is not initialized.
-
-            with_details (bool): more details about variables and parameters (e.g. trainable, optimize_attr, ...) will be printed when with_details is True. Default value is False;
+        Convert a VarBase object to a readable string.
 
-        Returns:
-            str: The debug string.
+        Returns(str): A readable string.
 
         Examples:
             .. code-block:: python
 
-                import paddle.fluid as fluid
-
-                cur_program = fluid.Program()
-                cur_block = cur_program.current_block()
-                new_variable = cur_block.create_var(name="X",
-                                                    shape=[-1, 23, 48],
-                                                    dtype='float32')
-                print(new_variable.to_string(True))
-                print("=============with detail===============")
-                print(new_variable.to_string(True, True))
+                import paddle
+                paddle.enable_imperative()
+                x = paddle.rand([1, 5])
+                print(x)
+                # Variable: eager_tmp_0
+                #   - place: CUDAPlace(0)
+                #   - shape: [1, 5]
+                #   - layout: NCHW
+                #   - dtype: float
+                #   - data: [0.645307 0.597973 0.732793 0.646921 0.540328]
+                paddle.disable_imperative()
         """
-        if framework.in_dygraph_mode():
-            # TODO(panyx0718): add more dygraph debug info.
-            tensor = self.value().get_tensor()
-            if tensor._is_initialized():
-                return 'Variable: %s\n%s' % (self.name, str(tensor))
-            else:
-                return 'Variable: %s, not initialized' % (self.name)
+        tensor = self.value().get_tensor()
+        if tensor._is_initialized():
+            return 'Variable: %s\n%s' % (self.name, str(tensor))
+        else:
+            return 'Variable: %s, not initialized' % (self.name)
+
+    @property
+    def block(self):
+        return framework.default_main_program().global_block()
 
     def __nonzero__(self):
         numel = np.prod(self.shape)
@@ -214,8 +254,9 @@ def monkey_patch_varbase():
 
     for method_name, method in (
         ("__bool__", __bool__), ("__nonzero__", __nonzero__),
-        ("set_value", set_value), ("block", block), ("backward", backward),
-        ("gradient", gradient), ("__str__", __str__), ("to_string", to_string)):
+        ("_to_static_var", _to_static_var), ("set_value", set_value),
+        ("block", block), ("backward", backward), ("gradient", gradient),
+        ("__str__", __str__)):
         setattr(core.VarBase, method_name, method)
 
     # patch math methods for varbase
diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py
index 0d9d617c4d220e0b7fb36fad1be1dcd5fa1625b5..f6cca91374e58d39cda125fda79861810726e135 100644
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -404,29 +404,35 @@ def _as_lodtensor(data, place, dtype=None):
             >>>     ...
 
         Args:
-            data(numpy.ndarray): a instance of array
+            data(numpy.ndarray|list|tuple|scalar): a instance of array, scalar, list or tuple
             data(core.Place): the place of created tensor
-            dtype(core.VarDesc.VarType): the expected data type of created tensor
+            dtype(core.VarDesc.VarType|str): the expected data type of created tensor
 
         Returns:
             LoDTensor
         """
-    if isinstance(data, list):
-        raise RuntimeError("Some of your feed data hold LoD information. \
-                They can not be completely cast from a list of Python \
-                ndarray to LoDTensor. Please convert data to LoDTensor \
-                directly before feeding the data.\
-                ")
-
-    #NOTE(zhiqiu): convert python builtin ,like float and int, to numpy array
+    #NOTE(zhiqiu): convert python builtin, like float, int, and list, to numpy ndarray
     if not isinstance(data, np.ndarray):
+        assert dtype is not None, 'The dtype should be given when feed data is not np.ndarray'
+        dtype = convert_dtype(dtype) if isinstance(
+            dtype, core.VarDesc.VarType) else dtype
         if np.isscalar(data):
-            assert dtype is not None, 'dtype should be given when casting python scalar to tensor'
-            dtype = convert_dtype(dtype) if isinstance(
-                dtype, core.VarDesc.VarType) else dtype
             data = np.array([data]).astype(dtype)
+        elif isinstance(data, (list, tuple)):
+            data = np.array(data)
+            if data.dtype == np.object:
+                raise TypeError(
+                    "\n\tFaild to convert input data to a regular ndarray :\n\t* Usually "
+                    "this means the input data contains nested lists with different lengths. "
+                    "Please consider using 'fluid.create_lod_tensor' to convert it to a LoD-Tensor."
+                )
+            data = data.astype(dtype)
+        else:
+            raise TypeError(
+                "Convert data of type {} to Tensor is not supported".format(
+                    type(data)))
 
-    # single tensor case
+    # convert numpy.ndarray to tensor
     tensor = core.LoDTensor()
     tensor.set(data, place)
     return tensor
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 7454c70d55b8a3b478eb708a873d34590d8050c6..cf4f47d13fc9f1ccd70fae4b6582b7c974c0f926 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -5111,7 +5111,8 @@ class ParamBase(core.VarBase):
                                         list(shape) if shape else [], name,
                                         core.VarDesc.VarType.LOD_TENSOR, True)
 
-        self.trainable = kwargs.get('trainable', True)
+        trainable = kwargs.get('trainable', True)
+        self.stop_gradient = not trainable
 
         self.optimize_attr = kwargs.get('optimize_attr', {'learning_rate': 1.0})
 
@@ -5120,36 +5121,42 @@ class ParamBase(core.VarBase):
         self.do_model_average = kwargs.get('do_model_average', None)
 
         self.is_distributed = False
-
         # self.block = default_main_program().global_block()
 
-    def __str__(self):
-        return self.to_string(True)
+    @property
+    def trainable(self):
+        return not self.stop_gradient
 
-    def to_string(self, throw_on_error, with_details=False):
-        """
-        To debug string.
+    @trainable.setter
+    def trainable(self, trainable):
+        if isinstance(trainable, bool):
+            self.stop_gradient = not trainable
+        else:
+            raise ValueError(
+                "The type of trainable MUST be bool, but the type is ",
+                type(trainable))
 
-        Args:
-            throw_on_error(bool): raise exception when self is not initialized
-                when throw_on_error is True
-            with_details(bool): more details about variables and parameters
-                (e.g. trainable, optimize_attr, ...) will be printed when with_details is True
+    def __str__(self):
+        """
+        Convert a ParamBase object to a readable string.
 
-        Returns(str): The debug string.
+        Returns(str): A readable string.
 
         Examples:
             .. code-block:: python
 
-                import paddle.fluid as fluid
-
-                prog = fluid.default_main_program()
-                rlt = fluid.layers.data("fake_data", shape=[1,1], dtype='float32')
-                debug_str = prog.to_string(throw_on_error=True, with_details=False)
-                print(debug_str)
+                import paddle
+                paddle.enable_imperative()
+                conv = paddle.nn.Conv2D(3, 3, 5)
+                print(conv.weight)
+                # Parameter: conv2d_0.w_0
+                #   - place: CUDAPlace(0)
+                #   - shape: [3, 3, 5, 5]
+                #   - layout: NCHW
+                #   - dtype: float
+                #   - data: [...] 
+                paddle.disable_imperative()
         """
-        assert isinstance(throw_on_error, bool) and isinstance(with_details,
-                                                               bool)
         tensor = self.value().get_tensor()
         if tensor._is_initialized():
             return 'Parameter: %s\n%s' % (self.name, str(tensor))
diff --git a/python/paddle/fluid/incubate/fleet/collective/__init__.py b/python/paddle/fluid/incubate/fleet/collective/__init__.py
index e6304473b6507de22805f0a9bd5088dbe5a29ecd..4bcd5196a3b3cc5a11a93d27d4dc3dffe61e8644 100644
--- a/python/paddle/fluid/incubate/fleet/collective/__init__.py
+++ b/python/paddle/fluid/incubate/fleet/collective/__init__.py
@@ -26,7 +26,7 @@ from paddle.fluid.incubate.fleet.base.fleet_base import Mode
 from paddle.fluid.incubate.fleet.base.fleet_base import DistributedOptimizer
 
 from paddle.fluid import compiler
-from paddle.distributed.fs_wrapper import LocalFS, BDFS
+from paddle.fluid.incubate.fleet.utils.fs import LocalFS
 
 import os
 import sys
@@ -70,7 +70,7 @@ class Collective(Fleet):
         self._origin_program = None
         self._transpiled_program = None
         self.main_program = None
-        self._checkoint_prefix = "__paddle_fleet_checkpoint__"
+        self._checkpoint_prefix = "__paddle_fleet_checkpoint__"
         self._param_file_name = "_paddle_fleet_param__"
 
     def init_worker(self):
@@ -186,8 +186,8 @@ class Collective(Fleet):
         max_no = -1
         d = {}
         dirs = fs.list_dirs(root_path)
-        for dir in dirs:
-            g = dir.split(".")
+        for d in dirs:
+            g = d.split(".")
             if len(g) != 2:
                 continue
 
@@ -203,10 +203,10 @@ class Collective(Fleet):
 
         return max_no
 
-    def clean_redundant_check_points(self,
-                                     root_path,
-                                     fs=LocalFS(),
-                                     checkpoint_num=1):
+    def clean_redundant_checkpoints(self,
+                                    root_path,
+                                    fs=LocalFS(),
+                                    checkpoint_num=1):
         max_no = self._get_last_checkpoint_no(root_path, fs)
         if max_no < 0:
             return
@@ -215,32 +215,32 @@ class Collective(Fleet):
             checkpoint_num = 1
 
         dirs = fs.list_dirs(root_path)
-        for dir in dirs:
-            g = dir.split(".")
+        for d in dirs:
+            g = d.split(".")
             if len(g) != 2:
                 continue
 
-            if g[0] != self._checkoint_prefix:
+            if g[0] != self._checkpoint_prefix:
                 continue
 
             try:
                 n = int(g[1])
                 if n <= max_no - checkpoint_num:
-                    path = "{}/{}.{}".format(root_path, self._checkoint_prefix,
+                    path = "{}/{}.{}".format(root_path, self._checkpoint_prefix,
                                              n)
-                    fs.rmr(path)
+                    fs.delete(path)
             except Exception as e:
                 print(e)
                 continue
 
-    def save_check_point(self,
-                         executor,
-                         path,
-                         train_status,
-                         main_program=None,
-                         fs=LocalFS(),
-                         local_cache_path=".cache",
-                         remain_all_checkpoint=True):
+    def save_checkpoint(self,
+                        executor,
+                        path,
+                        train_status,
+                        main_program=None,
+                        fs=LocalFS(),
+                        local_cache_path=".cache",
+                        remain_all_checkpoint=True):
         """
         This function save persistables and current epoch num to path.
         """
@@ -248,14 +248,16 @@ class Collective(Fleet):
         if main_program == None:
             main_program = self._transpiled_program
 
-        if not fs.stat(path):
-            fs.mkdir(path)
+        if not fs.is_exist(path):
+            fs.mkdirs(path)
+        else:
+            assert fs.is_dir(path), "path:%s must be a directory".format(path)
 
         max_no = self._get_last_checkpoint_no(path, fs=fs)
         if max_no < 0:
             max_no = -1
 
-        real_path = "{}/{}.{}".format(path, self._checkoint_prefix, max_no + 1)
+        real_path = "{}/{}.{}".format(path, self._checkpoint_prefix, max_no + 1)
         tmp_path = "{}.tmp".format(real_path)
         saved_path = tmp_path
 
@@ -264,9 +266,14 @@ class Collective(Fleet):
         cache_path = None
         if fs.need_upload_download():
             cache_path = "{}/{}.{}.saved_cache".format(
-                local_cache_path, self._checkoint_prefix, max_no + 1)
-            if not local_fs.stat(cache_path):
-                local_fs.mkdir(cache_path)
+                local_cache_path, self._checkpoint_prefix, max_no + 1)
+            if not local_fs.is_exist(cache_path):
+                local_fs.mkdirs(cache_path)
+            else:
+                assert fs.is_dir(
+                    path), "cache path:{} must be a directory".format(
+                        cache_path)
+
             saved_path = cache_path
 
         self.save_persistables(
@@ -282,16 +289,16 @@ class Collective(Fleet):
         fs.mv(tmp_path, real_path)
 
         if not remain_all_checkpoint:
-            self.clean_redundant_check_points(path)
-
-    def load_check_point(self,
-                         executor,
-                         path,
-                         trainer_id,
-                         main_program=None,
-                         fs=LocalFS(),
-                         local_cache_path=".cache",
-                         ignore_empty=True):
+            self.clean_redundant_checkpoints(path)
+
+    def load_checkpoint(self,
+                        executor,
+                        path,
+                        trainer_id,
+                        main_program=None,
+                        fs=LocalFS(),
+                        local_cache_path=".cache",
+                        ignore_empty=True):
         """
         This function load persistables and current epoch num from path.
         """
@@ -306,11 +313,13 @@ class Collective(Fleet):
         local_fs = LocalFS()
         if fs.need_upload_download():
             cache_path = "{}/{}.{}.load_cache.{}".format(
-                local_cache_path, self._checkoint_prefix, max_no, trainer_id)
-            if local_fs.stat(cache_path):
+                local_cache_path, self._checkpoint_prefix, max_no, trainer_id)
+            if not local_fs.is_exist(local_cache_path):
+                local_fs.mkdirs(local_cache_path)
+            if local_fs.is_exist(cache_path):
                 local_fs.delete(cache_path)
 
-        real_path = "{}/{}.{}".format(path, self._checkoint_prefix, max_no)
+        real_path = "{}/{}.{}".format(path, self._checkpoint_prefix, max_no)
         load_path = real_path
         if fs.need_upload_download():
             fs.download(real_path, cache_path)
diff --git a/python/paddle/fluid/incubate/fleet/utils/fs.py b/python/paddle/fluid/incubate/fleet/utils/fs.py
new file mode 100644
index 0000000000000000000000000000000000000000..4782c2f8d90c84027f9fd1d11022425a9ed6c84b
--- /dev/null
+++ b/python/paddle/fluid/incubate/fleet/utils/fs.py
@@ -0,0 +1,164 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import sys
+import subprocess
+import multiprocessing
+from datetime import datetime
+
+import re
+import copy
+import errno
+import time
+import logging
+import abc
+from pathlib import PurePosixPath, Path
+import shutil
+
+__all__ = ['FS', 'LocalFS']
+
+
+class ExecuteError(Exception):
+    pass
+
+
+class FSFileExistsError(Exception):
+    pass
+
+
+class FSFileNotExistsError(Exception):
+    pass
+
+
+class FSTimeOut(Exception):
+    pass
+
+
+class FS(object):
+    @abc.abstractmethod
+    def ls_dir(self, fs_path):
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    def is_file(self, fs_path):
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    def is_dir(self, fs_path):
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    def is_exist(self, fs_path):
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    def upload(self, local_path, fs_path):
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    def download(self, fs_path, local_path):
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    def mkdirs(self, fs_path):
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    def delete(self, fs_path):
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    def need_upload_download(self):
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    def rename(self, fs_src_path, fs_dst_path):
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    def mv(self, fs_src_path, fs_dst_path):
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    def upload_dir(self, local_dir, dest_dir):
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    def list_dirs(self, fs_path):
+        raise NotImplementedError
+
+
+class LocalFS(FS):
+    def ls_dir(self, fs_path):
+        return [f for f in os.listdir(fs_path)]
+
+    def mkdirs(self, fs_path):
+        assert not os.path.isfile(fs_path), "{} is already a file".format(
+            fs_path)
+        os.system("mkdir -p {}".format(fs_path))
+
+    def rename(self, fs_src_path, fs_dst_path):
+        os.rename(fs_src_path, fs_dst_path)
+
+    def _rmr(self, fs_path):
+        shutil.rmtree(fs_path)
+
+    def _rm(self, fs_path):
+        os.remove(fs_path)
+
+    def delete(self, fs_path):
+        if not self.is_exist(fs_path):
+            return
+
+        if os.path.isfile(fs_path):
+            return self._rm(fs_path)
+
+        return self._rmr(fs_path)
+
+    def need_upload_download(self):
+        return False
+
+    def is_file(self, fs_path):
+        return os.path.isfile(fs_path)
+
+    def is_dir(self, fs_path):
+        return os.path.isdir(fs_path)
+
+    def is_exist(self, fs_path):
+        return os.path.exists(fs_path)
+
+    def touch(self, fs_path):
+        return Path(fs_path).touch()
+
+    def mv(self, src_path, dst_path):
+        if not self.is_exist(src_path):
+            raise FSFileNotExistsError
+
+        if self.is_exist(dst_path):
+            raise FSFileExistsError
+
+        return self.rename(src_path, dst_path)
+
+    def list_dirs(self, fs_path):
+        """	
+        list directory under fs_path, and only give the pure name, not include the fs_path	
+        """
+        if not self.is_exist(fs_path):
+            return []
+
+        dirs = [
+            f for f in os.listdir(fs_path) if os.path.isdir(fs_path + "/" + f)
+        ]
+
+        return dirs
diff --git a/python/paddle/fluid/incubate/fleet/utils/hdfs.py b/python/paddle/fluid/incubate/fleet/utils/hdfs.py
index c16d7e3cc458f3f2052507497a3ba7ebaec3d042..27f68076f27feb32f823977274d2b33bccf3eb1c 100644
--- a/python/paddle/fluid/incubate/fleet/utils/hdfs.py
+++ b/python/paddle/fluid/incubate/fleet/utils/hdfs.py
@@ -24,596 +24,221 @@ import copy
 import errno
 import time
 import logging
+import six
+from . import fs
+from .fs import FS, LocalFS, FSFileExistsError, FSFileNotExistsError, ExecuteError, FSTimeOut
+import paddle.fluid as fluid
+import functools
 
-__all__ = ["HDFSClient"]
-
-
-def get_logger(name, level, fmt):
-    logger = logging.getLogger(name)
-    logger.setLevel(level)
-    handler = logging.FileHandler('hdfs.log', mode='w')
-    formatter = logging.Formatter(fmt=fmt)
-    handler.setFormatter(formatter)
-    logger.addHandler(handler)
-    return logger
-
-
-_logger = get_logger(
-    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
-
+from pathlib import PurePosixPath, Path
+import shutil
 
-class HDFSClient(object):
-    """
-    A tool of HDFS
-
-    Args:
-        hadoop_home (string): hadoop_home
-        configs (dict): hadoop config, it is a dict, please contain \
-            key "fs.default.name" and "hadoop.job.ugi"
-        Can be a float value
-    Examples:
-        hadoop_home = "/home/client/hadoop-client/hadoop/"
-
-        configs = {
-            "fs.default.name": "hdfs://xxx.hadoop.com:54310",
-            "hadoop.job.ugi": "hello,hello123"
-        }
+__all__ = ["HDFSClient"]
 
-        client = HDFSClient(hadoop_home, configs)
 
-        client.ls("/user/com/train-25")
-        files = client.lsr("/user/com/train-25/models")
-    """
+def _handle_errors(f):
+    def handler(*args, **kwargs):
+        start = time.time()
+        while True:
+            try:
+                return f(*args, **kwargs)
+            except ExecuteError as e:
+                o = args[0]
+                time_out = float(o._time_out) / 1000.0
+                inter = float(o._sleep_inter) / 1000.0
+                if time.time() - start >= time_out:
+                    raise FSTimeOut
+                time.sleep(inter)
+
+    return functools.wraps(f)(handler)
+
+
+class HDFSClient(FS):
+    def __init__(
+            self,
+            hadoop_home,
+            configs,
+            time_out=5 * 60 * 1000,  #ms
+            sleep_inter=1000):  #ms
+        # Raise exception if JAVA_HOME not exists.
+        java_home = os.environ["JAVA_HOME"]
 
-    def __init__(self, hadoop_home, configs):
         self.pre_commands = []
         hadoop_bin = '%s/bin/hadoop' % hadoop_home
         self.pre_commands.append(hadoop_bin)
         dfs = 'fs'
         self.pre_commands.append(dfs)
 
-        for k, v in configs.iteritems():
-            config_command = '-D%s=%s' % (k, v)
-            self.pre_commands.append(config_command)
-
-    def __run_hdfs_cmd(self, commands, retry_times=5):
-        whole_commands = copy.deepcopy(self.pre_commands)
-        whole_commands.extend(commands)
-
-        ret_code = 0
-        ret_out = None
-        ret_err = None
-        retry_sleep_second = 3
-        whole_commands = " ".join(whole_commands)
-        for x in range(retry_times + 1):
-            proc = subprocess.Popen(
-                whole_commands,
-                stdout=subprocess.PIPE,
-                stderr=subprocess.PIPE,
-                shell=True)
-            (output, errors) = proc.communicate()
-            ret_code, ret_out, ret_err = proc.returncode, output, errors
-
-            _logger.info(
-                'Times: %d, Running command: %s. Return code: %d, Msg: %s' %
-                (x, whole_commands, proc.returncode, errors))
-
-            if ret_code == 0:
-                break
-            time.sleep(retry_sleep_second)
-
-        return ret_code, ret_out, ret_err
-
-    def cat(self, hdfs_path=None):
-        """
-        cat hdfs file
-        Args:
-            hdfs_path(str): the hdfs file path
-        Returns:
-            file content
-        """
-        if self.is_file(hdfs_path):
-            exist_cmd = ['-cat', hdfs_path]
-            returncode, output, errors = self.__run_hdfs_cmd(
-                exist_cmd, retry_times=1)
-            if returncode != 0:
-                _logger.error("HDFS cat HDFS path: {} failed".format(hdfs_path))
-                return ""
-            else:
-                _logger.info("HDFS cat HDFS path: {} succeed".format(hdfs_path))
-                return output.strip()
+        if configs:
+            for k, v in six.iteritems(configs):
+                config_command = '-D%s=%s' % (k, v)
 
-        else:
-            return ""
+        self._time_out = time_out
+        self._sleep_inter = sleep_inter
+        self._base_cmd = " ".join(self.pre_commands)
+        self._bd_err_re = re.compile(
+            r'\s?responseErrorMsg\s?\:.*, errorCode\:\s?[0-9]+, path\:')
 
-    def is_exist(self, hdfs_path=None):
-        """
-        whether the remote HDFS path exists
-
-        Args:
-            hdfs_path(str): the hdfs file path
-
-        Returns:
-            True or False
-        """
-        exist_cmd = ['-test', '-e', hdfs_path]
-        returncode, output, errors = self.__run_hdfs_cmd(
-            exist_cmd, retry_times=1)
-
-        if returncode:
-            _logger.error("HDFS is_exist HDFS path: {} failed".format(
-                hdfs_path))
-            return False
-        else:
-            _logger.info("HDFS is_exist HDFS path: {} successfully".format(
-                hdfs_path))
-            return True
+    def _run_cmd(self, cmd, redirect_stderr=False):
+        ret, output = fluid.core.shell_execute_cmd(cmd, 0, 0, redirect_stderr)
+        return int(ret), output.splitlines()
 
-    def is_dir(self, hdfs_path=None):
-        """
-        whether the remote HDFS path is directory
+    def list_dirs(self, fs_path):
+        if not self.is_exist(fs_path):
+            return []
 
-        Args:
-            hdfs_path(str): the hdfs file path
+        dirs, _ = self.ls_dir(fs_path)
+        return dirs
 
-        Returns:
-            True or False
+    @_handle_errors
+    def ls_dir(self, fs_path):
+        """	
+        list directory under fs_path, and only give the pure name, not include the fs_path	
         """
+        if not self.is_exist(fs_path):
+            return [], []
 
-        if not self.is_exist(hdfs_path):
-            return False
+        cmd = "{} -ls {}".format(self._base_cmd, fs_path)
+        ret, lines = self._run_cmd(cmd)
 
-        dir_cmd = ['-test', '-d', hdfs_path]
-        returncode, output, errors = self.__run_hdfs_cmd(dir_cmd, retry_times=1)
+        if ret != 0:
+            raise ExecuteError
 
-        if returncode:
-            _logger.error("HDFS path: {} failed is not a directory".format(
-                hdfs_path))
-            return False
-        else:
-            _logger.info("HDFS path: {} successfully is a directory".format(
-                hdfs_path))
-            return True
+        dirs = []
+        files = []
+        for line in lines:
+            arr = line.split()
+            if len(arr) != 8:
+                continue
 
-    def is_file(self, hdfs_path=None):
-        """
-        whether the remote HDFS path is file
+            if fs_path not in arr[7]:
+                continue
 
-        Args:
-            hdfs_path(str): the hdfs file path
+            p = PurePosixPath(arr[7])
+            if arr[0][0] == 'd':
+                dirs.append(p.name)
+            else:
+                files.append(p.name)
 
-        Returns:
-            True or False
-        """
+        return dirs, files
 
-        if not self.is_exist(hdfs_path):
-            return False
+    def _test_match(self, lines):
+        for l in lines:
+            m = self._bd_err_re.match(l)
+            if m != None:
+                return m
 
-        dir_cmd = ['-test', '-d', hdfs_path]
-        returncode, output, errors = self.__run_hdfs_cmd(dir_cmd, retry_times=1)
+        return None
 
-        if returncode == 0:
-            _logger.error("HDFS path: {} failed is not a file".format(
-                hdfs_path))
+    @_handle_errors
+    def is_dir(self, fs_path):
+        if not self.is_exist(fs_path):
             return False
-        else:
-            _logger.info("HDFS path: {} successfully is a file".format(
-                hdfs_path))
-            return True
-
-    def delete(self, hdfs_path):
-        """
-        Remove a file or directory from HDFS.
-
-        whether the remote HDFS path exists
-
-        Args:
-            hdfs_path(str): HDFS path.
-
-        Returns:
-            True or False
-            This function returns `True` if the deletion was successful and `False` if
-            no file or directory previously existed at `hdfs_path`.
-        """
-        _logger.info('Deleting %r.', hdfs_path)
-
-        if not self.is_exist(hdfs_path):
-            _logger.warn("HDFS path: {} do not exist".format(hdfs_path))
-            return True
-
-        if self.is_dir(hdfs_path):
-            del_cmd = ['-rmr', hdfs_path]
-        else:
-            del_cmd = ['-rm', hdfs_path]
 
-        returncode, output, errors = self.__run_hdfs_cmd(del_cmd, retry_times=0)
+        cmd = "{} -test -d {}".format(
+            self._base_cmd, fs_path, redirect_stderr=True)
+        ret, lines = self._run_cmd(cmd)
+        if ret:
+            # other error
+            if self._test_match(lines) != None:
+                raise ExecuteError
 
-        if returncode:
-            _logger.error("HDFS path: {} delete files failure".format(
-                hdfs_path))
             return False
-        else:
-            _logger.info("HDFS path: {} delete files successfully".format(
-                hdfs_path))
-            return True
-
-    def rename(self, hdfs_src_path, hdfs_dst_path, overwrite=False):
-        """
-        Move a file or folder on HDFS.
-
-        Args:
-            hdfs_src_path(str): HDFS path
-            hdfs_dst_path(str): HDFS path
-            overwrite(bool|False): If the path already exists and overwrite is
-                                   False, will return False.
-        Returns:
-            True or False
-        """
-        assert hdfs_src_path is not None
-        assert hdfs_dst_path is not None
-
-        if not self.is_exist(hdfs_src_path):
-            _logger.info("HDFS path do not exist: {}".format(hdfs_src_path))
-        if self.is_exist(hdfs_dst_path) and not overwrite:
-            _logger.error("HDFS path is exist: {} and overwrite=False".format(
-                hdfs_dst_path))
-
-        rename_command = ['-mv', hdfs_src_path, hdfs_dst_path]
-        returncode, output, errors = self.__run_hdfs_cmd(
-            rename_command, retry_times=1)
-
-        if returncode:
-            _logger.error("HDFS rename path: {} to {} failed".format(
-                hdfs_src_path, hdfs_dst_path))
-            return False
-        else:
-            _logger.info("HDFS rename path: {} to {} successfully".format(
-                hdfs_src_path, hdfs_dst_path))
-            return True
-
-    @staticmethod
-    def make_local_dirs(local_path):
-        """
-        create a directory local, is same to mkdir
-
-        Args:
-            local_path(str): local path that wants to create a directory.
-        """
-        try:
-            os.makedirs(local_path)
-        except OSError as e:
-            if e.errno != errno.EEXIST:
-                raise
-
-    def makedirs(self, hdfs_path):
-        """
-        Create a remote directory, recursively if necessary.
-
-        Args:
-            hdfs_path(str): Remote path. Intermediate directories will be
-                            created appropriately.
 
-        Returns:
-            True or False
-        """
-        _logger.info('Creating directories to %r.', hdfs_path)
-        assert hdfs_path is not None
-
-        if self.is_exist(hdfs_path):
-            _logger.error("HDFS path is exist: {}".format(hdfs_path))
-            return
-
-        mkdirs_commands = ['-mkdir', hdfs_path]
-        returncode, output, errors = self.__run_hdfs_cmd(
-            mkdirs_commands, retry_times=1)
+        return True
 
-        if returncode:
-            _logger.error("HDFS mkdir path: {} failed".format(hdfs_path))
+    def is_file(self, fs_path):
+        if not self.is_exist(fs_path):
             return False
-        else:
-            _logger.info("HDFS mkdir path: {} successfully".format(hdfs_path))
-            return True
-
-    def ls(self, hdfs_path):
-        """
-        ls directory contents about HDFS hdfs_path
-
-        Args:
-            hdfs_path(str): Remote HDFS path will be ls.
-
-        Returns:
-            List: a contents list about hdfs_path.
-        """
-        assert hdfs_path is not None
-
-        if not self.is_exist(hdfs_path):
-            return []
 
-        ls_commands = ['-ls', hdfs_path]
-        returncode, output, errors = self.__run_hdfs_cmd(
-            ls_commands, retry_times=10)
+        return not self.is_dir(fs_path)
 
-        if returncode:
-            _logger.error("HDFS list path: {} failed".format(hdfs_path))
-            return []
-        else:
-            _logger.info("HDFS list path: {} successfully".format(hdfs_path))
-
-            ret_lines = []
-            regex = re.compile('\s+')
-            out_lines = output.strip().split("\n")
-            for line in out_lines:
-                re_line = regex.split(line)
-                if len(re_line) == 8:
-                    ret_lines.append(re_line[7])
-            return ret_lines
-
-    def lsr(self, hdfs_path, excludes=[]):
-        """
-        list directory contents about HDFS hdfs_path recursively
-
-        Args:
-            hdfs_path(str): Remote HDFS path.
-            excludes(list): excludes
-
-        Returns:
-            List: a contents list about hdfs_path.
-        """
-
-        assert hdfs_path is not None
-
-        if not self.is_exist(hdfs_path):
-            return []
-
-        ls_commands = ['-lsr', hdfs_path]
-        returncode, output, errors = self.__run_hdfs_cmd(
-            ls_commands, retry_times=1)
-
-        if returncode:
-            _logger.error("HDFS list all files: {} failed".format(hdfs_path))
-            return []
-        else:
-            _logger.info("HDFS list all files: {} successfully".format(
-                hdfs_path))
-            lines = []
-            regex = re.compile('\s+')
-            out_lines = output.strip().split("\n")
-            for line_id, line in enumerate(out_lines):
-                re_line = regex.split(line)
-                if len(re_line) == 8:
-                    if re_line[0][0] == "d":
-                        continue
-                    if re_line[7] in excludes:
-                        continue
-                    else:
-                        lines.append((re_line[7], re_line[5] + " " + re_line[6],
-                                      line_id))
-            lines = sorted(lines, key=lambda line: line[2])
-            ret_lines = [ret[0] for ret in lines]
-            return ret_lines
-
-    @staticmethod
-    def split_files(files, trainer_id, trainers):
-        """
-        split file list
-
-        Args:
-            files(list): file list
-            trainer_id(int): trainer mpi rank id
-            trainers(int): all trainers num
-
-        Returns:
-            fileist(list): file list of current trainer
-        """
-        remainder = len(files) % trainers
-        blocksize = len(files) / trainers
-
-        blocks = [blocksize] * trainers
-        for i in range(remainder):
-            blocks[i] += 1
-
-        trainer_files = [[]] * trainers
-        begin = 0
-        for i in range(trainers):
-            trainer_files[i] = files[begin:begin + blocks[i]]
-            begin += blocks[i]
-
-        return trainer_files[trainer_id]
-
-    def download(self,
-                 hdfs_path,
-                 local_path,
-                 multi_processes=5,
-                 overwrite=False,
-                 retry_times=5):
-        """
-        Download files from HDFS using multi process.
-
-        Args:
-            hdfs_path(str): path on hdfs
-            local_path(str): path on local
-            multi_processes(int|5): the download data process at the same time, default=5
-            overwrite(bool): is overwrite
-            retry_times(int): retry times
-
-        Returns:
-            List:
-            Download files in local folder.
-        """
-
-        def __subprocess_download(local_path, datas):
-            """
-            download file from HDFS
-
-            Args:
-                hdfs_path(str): the hdfs file path
-                local_path(str): the local file path
-                overwrite(bool|None): will overwrite the file on HDFS or not
-                retry_times(int|5): retry times
-
-            Returns:
-                True or False
-            """
-            for data in datas:
-                download_commands = ["-get", data, local_path]
-
-                returncode, output, errors = self.__run_hdfs_cmd(
-                    download_commands, retry_times=retry_times)
-
-                if returncode:
-                    _logger.error(
-                        "Get local path: {} from HDFS path: {} failed".format(
-                            local_path, hdfs_path))
+    @_handle_errors
+    def is_exist(self, fs_path):
+        cmd = "{} -ls {} ".format(self._base_cmd, fs_path)
+        ret, out = self._run_cmd(cmd, redirect_stderr=True)
+        if ret != 0:
+            for l in out:
+                if "No such file or directory" in l:
                     return False
-            return True
-
-        self.make_local_dirs(local_path)
-
-        all_files = self.ls(hdfs_path)
-
-        procs = []
-        for i in range(multi_processes):
-            process_datas = HDFSClient.split_files(all_files, i,
-                                                   multi_processes)
-            p = multiprocessing.Process(
-                target=__subprocess_download,
-                args=(
-                    local_path,
-                    process_datas, ))
-            procs.append(p)
-            p.start()
-
-        # complete the processes
-        for proc in procs:
-            proc.join()
-
-        _logger.info("Finish {} multi process to download datas".format(
-            multi_processes))
-
-        local_downloads = []
-        for dirname, folder, files in os.walk(local_path):
-            for i in files:
-                t = os.path.join(dirname, i)
-                local_downloads.append(t)
-        return local_downloads
-
-    def upload(self,
-               hdfs_path,
-               local_path,
-               multi_processes=5,
-               overwrite=False,
-               retry_times=5):
-        """
-        Upload files to HDFS using multi process.
-
-        Args:
-            hdfs_path(str): path on hdfs
-            local_path(str): path on local
-            multi_processes(int|5): the upload data process at the same time, default=5
-            overwrite(bool|False): will overwrite file on HDFS or not
-            retry_times(int): upload file max retry time.
-
-        Returns:
-            None
-        """
+            raise ExecuteError
 
-        def __subprocess_upload(hdfs_path_single, datas):
-            for data in datas:
-                put_commands = ["-put", data, hdfs_path_single]
-                returncode, output, errors = self.__run_hdfs_cmd(put_commands,
-                                                                 retry_times)
+        return True
 
-                if returncode:
-                    _logger.error("Put local path: {} to HDFS path: {} failed".
-                                  format(data, hdfs_path_single))
-                    return False
-            return True
+    @_handle_errors
+    def upload(self, local_path, fs_path):
+        if self.is_exist(fs_path):
+            raise FSFileExistsError
 
-        def get_local_files(path):
-            """
-            get local files
+        local = LocalFS()
+        if not local.is_exist(local_path):
+            raise FSFileNotExistsError
 
-            Args:
-                path(str): local path
+        cmd = "{} -put {} {}".format(self._base_cmd, local_path, fs_path)
+        ret, lines = self._run_cmd(cmd)
+        if ret != 0:
+            raise ExecuteError
 
-            Returns:
-                list of local files
-            """
-            rlist = []
+    @_handle_errors
+    def download(self, fs_path, local_path):
+        if self.is_exist(local_path):
+            raise FSFileExistsError
 
-            if not os.path.exists(path):
-                return rlist
+        if not self.is_exist(fs_path):
+            raise FSFileNotExistsError
 
-            if os.path.isdir(path):
-                for file in os.listdir(path):
-                    t = os.path.join(path, file)
-                    rlist.append(t)
-            else:
-                rlist.append(path)
-            return rlist
+        cmd = "{} -get {} {}".format(self._base_cmd, fs_path, local_path)
+        ret, lines = self._run_cmd(cmd)
+        if ret != 0:
+            raise ExecuteError
 
-        all_files = get_local_files(local_path)
-        if not all_files:
-            _logger.info("there are nothing need to upload, exit")
+    @_handle_errors
+    def mkdirs(self, fs_path):
+        if self.is_exist(fs_path):
             return
 
-        if self.is_exist(hdfs_path) and overwrite:
-            self.delete(hdfs_path)
-            self.makedirs(hdfs_path)
-
-        procs = []
-        for i in range(multi_processes):
-            process_datas = HDFSClient.split_files(all_files, i,
-                                                   multi_processes)
-            p = multiprocessing.Process(
-                target=__subprocess_upload, args=(
-                    hdfs_path,
-                    process_datas, ))
-            procs.append(p)
-            p.start()
-
-        # complete the processes
-        for proc in procs:
-            proc.join()
-
-        _logger.info("Finish upload datas from {} to {}".format(local_path,
-                                                                hdfs_path))
-
-    def upload_dir(self, dest_dir, local_dir, overwrite=False):
-        """
-        upload dir to hdfs
-        Args:
-            dest_dir(str): hdfs dest dir
-            local_dir(str): hdfs local dir
-            overwrite(bool): is overwrite
-        Returns:
-            return code
-        """
-        local_dir = local_dir.rstrip("/")
-        dest_dir = dest_dir.rstrip("/")
-        local_basename = os.path.basename(local_dir)
-        if self.is_exist(dest_dir + "/" + local_basename) and overwrite:
-            self.delete(dest_dir + "/" + local_basename)
-        if not self.is_exist(dest_dir):
-            self.makedirs(dest_dir)
-        put_command = ["-put", local_dir, dest_dir]
-        returncode, output, errors = self.__run_hdfs_cmd(put_command)
-        if returncode != 0:
-            _logger.error("Put local dir: {} to HDFS dir: {} failed".format(
-                local_dir, dest_dir))
-            return False
-        return True
-
-
-if __name__ == "__main__":
-    hadoop_home = "/home/client/hadoop-client/hadoop/"
+        cmd = "{} -mkdir {}".format(self._base_cmd, fs_path)
+        ret, lines = self._run_cmd(cmd)
+        if ret != 0:
+            raise ExecuteError
+
+    @_handle_errors
+    def mv(self, fs_src_path, fs_dst_path, test_exists=True):
+        if test_exists:
+            if not self.is_exist(fs_src_path):
+                raise FSFileNotExistsError
+
+            if self.is_exist(fs_dst_path):
+                raise FSFileExistsError
+
+        cmd = "{} -mv {} {}".format(self._base_cmd, fs_src_path, fs_dst_path)
+        ret, _ = self._run_cmd(cmd)
+        if ret != 0:
+            raise ExecuteError
+
+    @_handle_errors
+    def _rmr(self, fs_path):
+        cmd = "{} -rmr {}".format(self._base_cmd, fs_path)
+        ret, _ = self._run_cmd(cmd)
+        if ret != 0:
+            raise ExecuteError
+
+    @_handle_errors
+    def _rm(self, fs_path):
+        cmd = "{} -rm {}".format(self._base_cmd, fs_path)
+        ret, _ = self._run_cmd(cmd)
+        if ret != 0:
+            raise ExecuteError
+
+    def delete(self, fs_path):
+        if not self.is_exist(fs_path):
+            return
 
-    configs = {
-        "fs.default.name": "hdfs://xxx.hadoop.com:54310",
-        "hadoop.job.ugi": "hello,hello123"
-    }
+        is_dir = self.is_dir(fs_path)
+        if is_dir:
+            return self._rmr(fs_path)
 
-    client = HDFSClient(hadoop_home, configs)
+        return self._rm(fs_path)
 
-    client.ls("/user/com/train-25")
-    files = client.lsr("/user/com/train-25/models")
+    def need_upload_download(self):
+        return True
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index 94433cb2d90abfb56e8f6440a42a26e4cf36e26f..7906f563c0009ac37695f50c9dc2b035b8f004aa 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -57,6 +57,7 @@ __all__ = [
     'box_clip',
     'multiclass_nms',
     'locality_aware_nms',
+    'matrix_nms',
     'retinanet_detection_output',
     'distribute_fpn_proposals',
     'box_decoder_and_assign',
@@ -3523,6 +3524,133 @@ def locality_aware_nms(bboxes,
     return output
 
 
+def matrix_nms(bboxes,
+               scores,
+               score_threshold,
+               post_threshold,
+               nms_top_k,
+               keep_top_k,
+               use_gaussian=False,
+               gaussian_sigma=2.,
+               background_label=0,
+               normalized=True,
+               return_index=False,
+               name=None):
+    """
+    **Matrix NMS**
+
+    This operator does matrix non maximum suppression (NMS).
+
+    First selects a subset of candidate bounding boxes that have higher scores
+    than score_threshold (if provided), then the top k candidate is selected if
+    nms_top_k is larger than -1. Score of the remaining candidate are then
+    decayed according to the Matrix NMS scheme.
+    Aftern NMS step, at most keep_top_k number of total bboxes are to be kept
+    per image if keep_top_k is larger than -1.
+
+    Args:
+        bboxes (Variable): A 3-D Tensor with shape [N, M, 4] represents the
+                           predicted locations of M bounding bboxes,
+                           N is the batch size. Each bounding box has four
+                           coordinate values and the layout is
+                           [xmin, ymin, xmax, ymax], when box size equals to 4.
+                           The data type is float32 or float64.
+        scores (Variable): A 3-D Tensor with shape [N, C, M]
+                           represents the predicted confidence predictions.
+                           N is the batch size, C is the class number, M is
+                           number of bounding boxes. For each category there
+                           are total M scores which corresponding M bounding
+                           boxes. Please note, M is equal to the 2nd dimension
+                           of BBoxes. The data type is float32 or float64.
+        score_threshold (float): Threshold to filter out bounding boxes with
+                                 low confidence score.
+        post_threshold (float): Threshold to filter out bounding boxes with
+                                low confidence score AFTER decaying.
+        nms_top_k (int): Maximum number of detections to be kept according to
+                         the confidences after the filtering detections based
+                         on score_threshold.
+        keep_top_k (int): Number of total bboxes to be kept per image after NMS
+                          step. -1 means keeping all bboxes after NMS step.
+        use_gaussian (bool): Use Gaussian as the decay function. Default: False
+        gaussian_sigma (float): Sigma for Gaussian decay function. Default: 2.0
+        background_label (int): The index of background label, the background
+                                label will be ignored. If set to -1, then all
+                                categories will be considered. Default: 0
+        normalized (bool): Whether detections are normalized. Default: True
+        return_index(bool): Whether return selected index. Default: False
+        name(str): Name of the matrix nms op. Default: None.
+
+    Returns:
+        A tuple with two Variables: (Out, Index) if return_index is True,
+        otherwise, one Variable(Out) is returned.
+
+        Out (Variable): A 2-D LoDTensor with shape [No, 6] containing the
+             detection results.
+             Each row has 6 values: [label, confidence, xmin, ymin, xmax, ymax]
+             (After version 1.3, when no boxes detected, the lod is changed
+             from {0} to {1})
+
+        Index (Variable): A 2-D LoDTensor with shape [No, 1] containing the
+            selected indices, which are absolute values cross batches.
+
+    Examples:
+        .. code-block:: python
+
+
+            import paddle.fluid as fluid
+            boxes = fluid.data(name='bboxes', shape=[None,81, 4],
+                                      dtype='float32', lod_level=1)
+            scores = fluid.data(name='scores', shape=[None,81],
+                                      dtype='float32', lod_level=1)
+            out = fluid.layers.matrix_nms(bboxes=boxes,
+                                          scores=scores,
+                                          background_label=0,
+                                          score_threshold=0.5,
+                                          post_threshold=0.1,
+                                          nms_top_k=400,
+                                          keep_top_k=200,
+                                          normalized=False)
+    """
+    check_variable_and_dtype(bboxes, 'BBoxes', ['float32', 'float64'],
+                             'matrix_nms')
+    check_variable_and_dtype(scores, 'Scores', ['float32', 'float64'],
+                             'matrix_nms')
+    check_type(score_threshold, 'score_threshold', float, 'matrix_nms')
+    check_type(post_threshold, 'post_threshold', float, 'matrix_nms')
+    check_type(nms_top_k, 'nums_top_k', int, 'matrix_nms')
+    check_type(keep_top_k, 'keep_top_k', int, 'matrix_nms')
+    check_type(normalized, 'normalized', bool, 'matrix_nms')
+    check_type(use_gaussian, 'use_gaussian', bool, 'matrix_nms')
+    check_type(gaussian_sigma, 'gaussian_sigma', float, 'matrix_nms')
+    check_type(background_label, 'background_label', int, 'matrix_nms')
+
+    helper = LayerHelper('matrix_nms', **locals())
+    output = helper.create_variable_for_type_inference(dtype=bboxes.dtype)
+    index = helper.create_variable_for_type_inference(dtype='int')
+    helper.append_op(
+        type="matrix_nms",
+        inputs={'BBoxes': bboxes,
+                'Scores': scores},
+        attrs={
+            'background_label': background_label,
+            'score_threshold': score_threshold,
+            'post_threshold': post_threshold,
+            'nms_top_k': nms_top_k,
+            'gaussian_sigma': gaussian_sigma,
+            'use_gaussian': use_gaussian,
+            'keep_top_k': keep_top_k,
+            'normalized': normalized
+        },
+        outputs={'Out': output,
+                 'Index': index})
+    output.stop_gradient = True
+
+    if return_index:
+        return output, index
+    else:
+        return output
+
+
 def distribute_fpn_proposals(fpn_rois,
                              min_level,
                              max_level,
diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py
index 1ee8ba12d83e976833cded176b0e3fa8376b7b84..5c14d26f3fe242c87676c2d2c27aa12f555e5638 100755
--- a/python/paddle/fluid/layers/layer_function_generator.py
+++ b/python/paddle/fluid/layers/layer_function_generator.py
@@ -281,22 +281,18 @@ def generate_activation_fn(op_type):
 
 Return type
   Variable
+
 Examples:
     .. code-block:: python
 
-        import paddle.fluid as fluid
+        import paddle
         import numpy as np
 
-        inputs = fluid.data(name="x", shape = [None, 4], dtype='float32')
-        output = fluid.layers.%s(inputs)
-
-        exe = fluid.Executor(fluid.CPUPlace())
-        exe.run(fluid.default_startup_program())
-
-        #input.shape=1X4, batch_size=1
-        img = np.array([[1.0, 2.0, 3.0, 4.0]]).astype(np.float32)
-        res = exe.run(fluid.default_main_program(), feed={'x':img}, fetch_list=[output])
-        print(res)
+        paddle.enable_imperative()
+        x_data = np.array([1, 2, 3, 4]).astype(np.float32)
+        x = paddle.imperative.to_variable(x_data)
+        res = paddle.%s(x)
+        print(res.numpy())
 """ % op_type
     return func
 
diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py
index 5188051a9a6bbb40f538af5e0b8c8a4796de6e66..47e62016a20d78ef1209da92d42ceed726e482d6 100644
--- a/python/paddle/fluid/layers/learning_rate_scheduler.py
+++ b/python/paddle/fluid/layers/learning_rate_scheduler.py
@@ -52,9 +52,9 @@ def _decay_step_counter(begin=0):
 
 def noam_decay(d_model, warmup_steps, learning_rate=1.0):
     """
-	:alias_main: paddle.nn.functional.noam_decay
-	:alias: paddle.nn.functional.noam_decay,paddle.nn.functional.learning_rate.noam_decay
-	:old_api: paddle.fluid.layers.noam_decay
+	:alias_main: paddle.nn.functional.noam_decay
+	:alias: paddle.nn.functional.noam_decay,paddle.nn.functional.learning_rate.noam_decay
+	:old_api: paddle.fluid.layers.noam_decay
 
     Noam decay method. The numpy implementation of noam decay as follows.
 
@@ -115,9 +115,9 @@ def noam_decay(d_model, warmup_steps, learning_rate=1.0):
 
 def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False):
     """
-	:alias_main: paddle.nn.functional.exponential_decay
-	:alias: paddle.nn.functional.exponential_decay,paddle.nn.functional.learning_rate.exponential_decay
-	:old_api: paddle.fluid.layers.exponential_decay
+	:alias_main: paddle.nn.functional.exponential_decay
+	:alias: paddle.nn.functional.exponential_decay,paddle.nn.functional.learning_rate.exponential_decay
+	:old_api: paddle.fluid.layers.exponential_decay
 
     Applies exponential decay to the learning rate.
 
@@ -176,9 +176,9 @@ def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False):
 
 def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False):
     """
-	:alias_main: paddle.nn.functional.natural_exp_decay
-	:alias: paddle.nn.functional.natural_exp_decay,paddle.nn.functional.learning_rate.natural_exp_decay
-	:old_api: paddle.fluid.layers.natural_exp_decay
+	:alias_main: paddle.nn.functional.natural_exp_decay
+	:alias: paddle.nn.functional.natural_exp_decay,paddle.nn.functional.learning_rate.natural_exp_decay
+	:old_api: paddle.fluid.layers.natural_exp_decay
 
 Applies natural exponential decay to the initial learning rate.
 
@@ -237,9 +237,9 @@ Applies natural exponential decay to the initial learning rate.
 
 def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False):
     """
-	:alias_main: paddle.nn.functional.inverse_time_decay
-	:alias: paddle.nn.functional.inverse_time_decay,paddle.nn.functional.learning_rate.inverse_time_decay
-	:old_api: paddle.fluid.layers.inverse_time_decay
+	:alias_main: paddle.nn.functional.inverse_time_decay
+	:alias: paddle.nn.functional.inverse_time_decay,paddle.nn.functional.learning_rate.inverse_time_decay
+	:old_api: paddle.fluid.layers.inverse_time_decay
 
     Applies inverse time decay to the initial learning rate.
 
@@ -302,9 +302,9 @@ def polynomial_decay(learning_rate,
                      power=1.0,
                      cycle=False):
     """
-	:alias_main: paddle.nn.functional.polynomial_decay
-	:alias: paddle.nn.functional.polynomial_decay,paddle.nn.functional.learning_rate.polynomial_decay
-	:old_api: paddle.fluid.layers.polynomial_decay
+	:alias_main: paddle.nn.functional.polynomial_decay
+	:alias: paddle.nn.functional.polynomial_decay,paddle.nn.functional.learning_rate.polynomial_decay
+	:old_api: paddle.fluid.layers.polynomial_decay
 2
     Applies polynomial decay to the initial learning rate.
 
@@ -371,9 +371,9 @@ def polynomial_decay(learning_rate,
 
 def piecewise_decay(boundaries, values):
     """
-	:alias_main: paddle.nn.functional.piecewise_decay
-	:alias: paddle.nn.functional.piecewise_decay,paddle.nn.functional.learning_rate.piecewise_decay
-	:old_api: paddle.fluid.layers.piecewise_decay
+	:alias_main: paddle.nn.functional.piecewise_decay
+	:alias: paddle.nn.functional.piecewise_decay,paddle.nn.functional.learning_rate.piecewise_decay
+	:old_api: paddle.fluid.layers.piecewise_decay
 
 Applies piecewise decay to the initial learning rate.
 
@@ -450,9 +450,9 @@ Applies piecewise decay to the initial learning rate.
 
 def cosine_decay(learning_rate, step_each_epoch, epochs):
     """
-	:alias_main: paddle.nn.functional.cosine_decay
-	:alias: paddle.nn.functional.cosine_decay,paddle.nn.functional.learning_rate.cosine_decay
-	:old_api: paddle.fluid.layers.cosine_decay
+	:alias_main: paddle.nn.functional.cosine_decay
+	:alias: paddle.nn.functional.cosine_decay,paddle.nn.functional.learning_rate.cosine_decay
+	:old_api: paddle.fluid.layers.cosine_decay
 
     Applies cosine decay to the learning rate.
 
@@ -499,9 +499,9 @@ def cosine_decay(learning_rate, step_each_epoch, epochs):
 
 def linear_lr_warmup(learning_rate, warmup_steps, start_lr, end_lr):
     """
-	:alias_main: paddle.nn.functional.linear_lr_warmup
-	:alias: paddle.nn.functional.linear_lr_warmup,paddle.nn.functional.learning_rate.linear_lr_warmup
-	:old_api: paddle.fluid.layers.linear_lr_warmup
+	:alias_main: paddle.nn.functional.linear_lr_warmup
+	:alias: paddle.nn.functional.linear_lr_warmup,paddle.nn.functional.learning_rate.linear_lr_warmup
+	:old_api: paddle.fluid.layers.linear_lr_warmup
 
     This operator use the linear learning rate warm up strategy to adjust the learning rate preliminarily before the normal learning rate scheduling.
     For more information, please refer to `Bag of Tricks for Image Classification with Convolutional Neural Networks <https://arxiv.org/abs/1812.01187>`_
diff --git a/python/paddle/fluid/layers/loss.py b/python/paddle/fluid/layers/loss.py
index 805c8f81688ecf7feebe07ef4848a189ecb114d6..4217a98798ebbb46cb5b84e4c15fea4b4f0840ac 100644
--- a/python/paddle/fluid/layers/loss.py
+++ b/python/paddle/fluid/layers/loss.py
@@ -725,7 +725,7 @@ def nce(input,
 
             window_size = 5
             words = []
-            for i in xrange(window_size):
+            for i in range(window_size):
                 words.append(fluid.data(
                     name='word_{0}'.format(i), shape=[-1, 1], dtype='int64'))
 
@@ -733,7 +733,7 @@ def nce(input,
             label_word = int(window_size / 2) + 1
 
             embs = []
-            for i in xrange(window_size):
+            for i in range(window_size):
                 if i == label_word:
                     continue
 
@@ -746,14 +746,14 @@ def nce(input,
                       num_total_classes=dict_size, param_attr='nce.w_0',
                       bias_attr='nce.b_0')
 
-             #or use custom distribution
-             dist = np.array([0.05,0.5,0.1,0.3,0.05])
-             loss = fluid.layers.nce(input=embs, label=words[label_word],
-                       num_total_classes=5, param_attr='nce.w_1',
-                       bias_attr='nce.b_1',
-                       num_neg_samples=3,
-                       sampler="custom_dist",
-                       custom_dist=dist)
+            #or use custom distribution
+            dist = np.array([0.05,0.5,0.1,0.3,0.05])
+            loss = fluid.layers.nce(input=embs, label=words[label_word],
+                    num_total_classes=5, param_attr='nce.w_1',
+                    bias_attr='nce.b_1',
+                    num_neg_samples=3,
+                    sampler="custom_dist",
+                    custom_dist=dist)
     """
     helper = LayerHelper('nce', **locals())
     check_variable_and_dtype(input, 'input', ['float32', 'float64'], 'nce')
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 11a4d933245dde73a092d0d014f2fc63a175f999..083c2ffbe3609220bf8484474d96f82171abf6d7 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -185,6 +185,7 @@ __all__ = [
     'filter_by_instag',
     'shard_index',
     'hard_swish',
+    'mish',
     'gather_tree',
     'uniform_random',
     'unbind',
@@ -3113,15 +3114,17 @@ def instance_norm(input,
             The data type is float32 or float64.
         epsilon(float, Default 1e-05): A value added to the denominator for
             numerical stability. Default is 1e-5.
-        param_attr(ParamAttr|None): The parameter attribute for Parameter `scale`
+        param_attr(ParamAttr|None|bool, optional): The parameter attribute for Parameter `scale`
              of instance_norm. If it is set to None or one attribute of ParamAttr, instance_norm
 	     will create ParamAttr as param_attr, the name of scale can be set in ParamAttr.
 	     If the Initializer of the param_attr is not set, the parameter is initialized
-	     with Xavier. Default: None.
-        bias_attr(ParamAttr|None): The parameter attribute for the bias of instance_norm.
+	     with Xavier. If the param_attr is set to False, instance_norm will not create param_attr.
+             Default: None.
+        bias_attr(ParamAttr|None|bool, optional): The parameter attribute for the bias of instance_norm.
              If it is set to None or one attribute of ParamAttr, instance_norm
 	     will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr.
 	     If the Initializer of the bias_attr is not set, the bias is initialized zero.
+             If the bias_attr is set to False, instance_norm will not create bias_attr.
 	     Default: None.
         name(string, Default None): A name for this layer(optional). If set None, the layer
             will be named automatically.
@@ -3141,7 +3144,9 @@ def instance_norm(input,
     """
     check_variable_and_dtype(input, 'input', ['float32', 'float64'],
                              'instance_norm')
-    assert bias_attr is not False, "bias_attr should not be False in instance_norm."
+    if param_attr is False:
+        assert bias_attr is False, "param_attr and bias_attr must be set to Fasle at the same time in instance_norm"
+
     helper = LayerHelper('instance_norm', **locals())
     dtype = helper.input_dtype()
 
@@ -3154,18 +3159,19 @@ def instance_norm(input,
 
     param_shape = [channel_num]
 
-    # create parameter
-    scale = helper.create_parameter(
-        attr=helper.param_attr,
-        shape=param_shape,
-        dtype=dtype,
-        default_initializer=Constant(1.0))
-    bias = helper.create_parameter(
-        attr=helper.bias_attr,
-        shape=param_shape,
-        dtype=dtype,
-        is_bias=True,
-        default_initializer=Constant(0.0))
+    if param_attr and bias_attr:
+        # create parameter
+        scale = helper.create_parameter(
+            attr=helper.param_attr,
+            shape=param_shape,
+            dtype=dtype,
+            default_initializer=Constant(1.0))
+        bias = helper.create_parameter(
+            attr=helper.bias_attr,
+            shape=param_shape,
+            dtype=dtype,
+            is_bias=True,
+            default_initializer=Constant(0.0))
 
     # create output
     saved_mean = helper.create_variable_for_type_inference(
@@ -3175,13 +3181,14 @@ def instance_norm(input,
 
     instance_norm_out = helper.create_variable_for_type_inference(dtype)
 
+    inputs = {"X": input}
+    if param_attr and bias_attr:
+        inputs["Scale"] = scale
+        inputs["Bias"] = bias
+
     helper.append_op(
         type="instance_norm",
-        inputs={
-            "X": input,
-            "Scale": scale,
-            "Bias": bias,
-        },
+        inputs=inputs,
         outputs={
             "Y": instance_norm_out,
             "SavedMean": saved_mean,
@@ -10409,20 +10416,28 @@ def uniform_random_batch_size_like(input,
 
 
 @templatedoc()
-def gaussian_random(shape, mean=0.0, std=1.0, seed=0, dtype='float32'):
+def gaussian_random(shape,
+                    mean=0.0,
+                    std=1.0,
+                    seed=0,
+                    dtype='float32',
+                    name=None):
     """
     Generate a random tensor whose data is drawn from a Gaussian distribution.
 
     Args:
-        shape (tuple[int] | list[int] | Variable | list[Variable]): Shape of the generated random tensor.
-        
-        mean (float): Mean of the random tensor, defaults to 0.0.
-            
-        std (float): Standard deviation of the random tensor, defaults to 1.0.
-        
-        seed (int): ${seed_comment}
-        
-        dtype(np.dtype | core.VarDesc.VarType | str): Output data type, float32 or float64.
+        shape(list|tuple|Variable): Shape of the Tensor to be created. The data
+            type is ``int32`` or ``int64`` . If ``shape`` is a list or tuple,
+            the elements of it should be integers or Tensors with shape [1]. If
+            ``shape`` is a Variable, it should be an 1-D Tensor .
+        mean(float): Mean of the random tensor, defaults to 0.0.
+        std(float): Standard deviation of the random tensor, defaults to 1.0.
+        seed(int): ${seed_comment}
+        dtype(np.dtype|core.VarDesc.VarType|str, optional): Data type of the output
+            tensor, which can be float32, float64. Default is float32.
+        name(str, optional): Normally there is no need for user to set this property.
+            For more information, please refer to :ref:`api_guide_Name` .
+            Default is None.
 
     Returns:
         Variable: Random tensor whose data is drawn from a Gaussian distribution, dtype: flaot32 or float64 as specified.
@@ -10485,30 +10500,33 @@ def gaussian_random(shape, mean=0.0, std=1.0, seed=0, dtype='float32'):
            # array([[2.3060477 , 2.676496  , 3.9911983 , 0.9990833 ],
            #        [2.8675377 , 2.2279181 , 0.79029655, 2.8447366 ]], dtype=float32)
     """
+    if not isinstance(dtype, core.VarDesc.VarType):
+        dtype = convert_np_dtype_to_dtype_(dtype)
 
-    helper = LayerHelper('gaussian_random', **locals())
-    out = helper.create_variable_for_type_inference(dtype)
-    if not isinstance(shape, (list, tuple, Variable)):
-        raise TypeError(
-            "The type of 'shape' in fill_constant must be Variable, list or tuple, but "
-            "received %s." % (type(shape)))
-    c_dtype = convert_np_dtype_to_dtype_(dtype)
+    if in_dygraph_mode():
+        shape = utils._convert_shape_to_list(shape)
+        return core.ops.gaussian_random('shape', shape, 'mean', mean, 'std',
+                                        std, 'seed', seed, 'dtype', dtype)
+
+    check_type(shape, 'shape', (list, tuple, Variable), 'gaussian_random/randn')
+    check_dtype(dtype, 'dtype', ['float32', 'float64'], 'gaussian_random/randn')
+
+    inputs = {}
     attrs = {
         'mean': mean,
         'std': std,
         'seed': seed,
-        'dtype': c_dtype,
+        'dtype': dtype,
         'use_mkldnn': False
     }
-
-    inputs = {}
     utils._get_shape_tensor_inputs(
         inputs=inputs,
-        helper=helper,
         attrs=attrs,
         shape=shape,
-        op_type='gaussian_random')
+        op_type='gaussian_random/randn')
 
+    helper = LayerHelper('gaussian_random', **locals())
+    out = helper.create_variable_for_type_inference(dtype)
     helper.append_op(
         type='gaussian_random',
         inputs=inputs,
@@ -11988,23 +12006,21 @@ def _logical_op(op_name, x, y, out=None, name=None, binary_op=True):
 def logical_and(x, y, out=None, name=None):
     """
     :alias_main: paddle.logical_and
-	:alias: paddle.logical_and,paddle.tensor.logical_and,paddle.tensor.logic.logical_and
-	:old_api: paddle.fluid.layers.logical_and
+    :alias: paddle.logical_and, paddle.tensor.logical_and, paddle.tensor.logic.logical_and
+    :old_api: paddle.fluid.layers.logical_and
 
-    logical_and Operator
-
-    It operates element-wise on X and Y, and returns the Out. X, Y and Out are N-dim boolean LoDTensor or Tensor.
-    Each element of Out is calculated by
+    ``logical_and`` operator computes element-wise logical AND on ``x`` and ``y``, and returns ``out``. ``x``, ``y`` and ``out`` are N-dim boolean ``Variable``.
+    Each element of ``out`` is calculated by
 
     .. math::
 
-        Out = X \land Y
+        out = x \&\& y
 
     Args:
-        x(${x_type}): ${x_comment}
-        y(${y_type}): ${y_comment}
-        out(LoDTensor or Tensor): The LoDTensor or Tensor that specifies the output of the operator, which can be any Variable that has been created in the program. The default value is None, and a new Variable will be created to save the output.
-        name(str|None): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`
+        x(${x_type}): ${x_comment}.
+        y(${y_type}): ${y_comment}.
+        out(Variable): The ``Variable`` that specifies the output of the operator, which can be any ``Variable`` that has been created in the program. The default value is None, and a new ``Variable`` will be created to save the output.
+        name(str|None): The default value is None. Normally there is no need for users to set this property. For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
         ${out_type}: ${out_comment}
@@ -12012,25 +12028,16 @@ def logical_and(x, y, out=None, name=None):
     Examples:
         .. code-block:: python
 
-            import paddle.fluid as fluid
+            import paddle
             import numpy as np
 
-            # Graph organizing
-            x = fluid.layers.data(name='x', shape=[2], dtype='bool')
-            y = fluid.layers.data(name='y', shape=[2], dtype='bool')
-            res = fluid.layers.logical_and(x=x, y=y)
-            # The comment lists another available method.
-            # res = fluid.layers.fill_constant(shape=[2], dtype='bool', value=0)
-            # fluid.layers.logical_and(x=x, y=y, out=res)
-
-            # Create an executor using CPU as an example
-            exe = fluid.Executor(fluid.CPUPlace())
-
-            # Execute
-            x_i = np.array([[1, 0], [0, 1]]).astype(np.bool)
-            y_i = np.array([[1, 1], [0, 0]]).astype(np.bool)
-            res_val, = exe.run(fluid.default_main_program(), feed={'x':x_i, 'y':y_i}, fetch_list=[res])
-            print(res_val) # [[True, False], [False, False]]
+            paddle.enable_imperative()
+            x_data = np.array([True, True, False, False], dtype=np.bool)
+            y_data = np.array([True, False, True, False], dtype=np.bool)
+            x = paddle.imperative.to_variable(x_data)
+            y = paddle.imperative.to_variable(y_data)
+            res = paddle.logical_and(x, y)
+            print(res.numpy()) # [True False False False]
     """
 
     return _logical_op(
@@ -12041,23 +12048,21 @@ def logical_and(x, y, out=None, name=None):
 def logical_or(x, y, out=None, name=None):
     """
     :alias_main: paddle.logical_or
-	:alias: paddle.logical_or,paddle.tensor.logical_or,paddle.tensor.logic.logical_or
-	:old_api: paddle.fluid.layers.logical_or
+    :alias: paddle.logical_or, paddle.tensor.logical_or, paddle.tensor.logic.logical_or
+    :old_api: paddle.fluid.layers.logical_or
 
-    logical_or Operator
-
-    It operates element-wise on X and Y, and returns the Out. X, Y and Out are N-dim boolean LoDTensor or Tensor.
-    Each element of Out is calculated by
+    ``logical_or`` operator computes element-wise logical OR on ``x`` and ``y``, and returns ``out``. ``x``, ``y`` and ``out`` are N-dim boolean ``Variable``.
+    Each element of ``out`` is calculated by
 
     .. math::
 
-        Out = X \lor Y
+        out = x || y
 
     Args:
-        x(${x_type}): ${x_comment}
-        y(${y_type}): ${y_comment}
-        out(LoDTensor or Tensor): The LoDTensor or Tensor that specifies the output of the operator, which can be any Variable that has been created in the program. The default value is None, and a new Variable will be created to save the output.
-        name(str|None): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`
+        x(${x_type}): ${x_comment}.
+        y(${y_type}): ${y_comment}.
+        out(Variable): The ``Variable`` that specifies the output of the operator, which can be any ``Variable`` that has been created in the program. The default value is None, and a new ``Variable`` will be created to save the output.
+        name(str|None): The default value is None. Normally there is no need for users to set this property. For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
         ${out_type}: ${out_comment}
@@ -12065,25 +12070,16 @@ def logical_or(x, y, out=None, name=None):
     Examples:
         .. code-block:: python
 
-            import paddle.fluid as fluid
+            import paddle
             import numpy as np
 
-            # Graph organizing
-            x = fluid.layers.data(name='x', shape=[2], dtype='bool')
-            y = fluid.layers.data(name='y', shape=[2], dtype='bool')
-            res = fluid.layers.logical_or(x=x, y=y)
-            # The comment lists another available method.
-            # res = fluid.layers.fill_constant(shape=[2], dtype='bool', value=0)
-            # fluid.layers.logical_or(x=x, y=y, out=res)
-
-            # Create an executor using CPU as an example
-            exe = fluid.Executor(fluid.CPUPlace())
-
-            # Execute
-            x_i = np.array([[1, 0], [0, 1]]).astype(np.bool)
-            y_i = np.array([[1, 1], [0, 0]]).astype(np.bool)
-            res_val, = exe.run(fluid.default_main_program(), feed={'x':x_i, 'y':y_i}, fetch_list=[res])
-            print(res_val) # [[True, True], [False, True]]
+            paddle.enable_imperative()
+            x_data = np.array([True, True, False, False], dtype=np.bool)
+            y_data = np.array([True, False, True, False], dtype=np.bool)
+            x = paddle.imperative.to_variable(x_data)
+            y = paddle.imperative.to_variable(y_data)
+            res = paddle.logical_or(x, y)
+            print(res.numpy()) # [True  True  True False]
     """
 
     return _logical_op(
@@ -12094,23 +12090,21 @@ def logical_or(x, y, out=None, name=None):
 def logical_xor(x, y, out=None, name=None):
     """
     :alias_main: paddle.logical_xor
-	:alias: paddle.logical_xor,paddle.tensor.logical_xor,paddle.tensor.logic.logical_xor
-	:old_api: paddle.fluid.layers.logical_xor
-
-    logical_xor Operator
+    :alias: paddle.logical_xor, paddle.tensor.logical_xor, paddle.tensor.logic.logical_xor
+    :old_api: paddle.fluid.layers.logical_xor
 
-    It operates element-wise on X and Y, and returns the Out. X, Y and Out are N-dim boolean LoDTensor or Tensor.
-    Each element of Out is calculated by
+    ``logical_xor`` operator computes element-wise logical XOR on ``x`` and ``y``, and returns ``out``. ``x``, ``y`` and ``out`` are N-dim boolean ``Variable``.
+    Each element of ``out`` is calculated by
 
     .. math::
 
-        Out = (X \lor Y) \land \lnot (X \land Y)
+        out = (x || y) \&\& !(x \&\& y)
 
     Args:
-        x(${x_type}): ${x_comment}
-        y(${y_type}): ${y_comment}
-        out(LoDTensor or Tensor): The LoDTensor or Tensor that specifies the output of the operator, which can be any Variable that has been created in the program. The default value is None, and a new Variable will be created to save the output.
-        name(str|None): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`
+        x(${x_type}): ${x_comment}.
+        y(${y_type}): ${y_comment}.
+        out(Variable): The ``Variable`` that specifies the output of the operator, which can be any ``Variable`` that has been created in the program. The default value is None, and a new ``Variable`` will be created to save the output.
+        name(str|None): The default value is None. Normally there is no need for users to set this property. For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
         ${out_type}: ${out_comment}
@@ -12118,25 +12112,16 @@ def logical_xor(x, y, out=None, name=None):
     Examples:
         .. code-block:: python
 
-            import paddle.fluid as fluid
+            import paddle
             import numpy as np
 
-            # Graph organizing
-            x = fluid.layers.data(name='x', shape=[2], dtype='bool')
-            y = fluid.layers.data(name='y', shape=[2], dtype='bool')
-            res = fluid.layers.logical_xor(x=x, y=y)
-            # The comment lists another available method.
-            # res = fluid.layers.fill_constant(shape=[2], dtype='bool', value=0)
-            # fluid.layers.logical_xor(x=x, y=y, out=res)
-
-            # Create an executor using CPU as an example
-            exe = fluid.Executor(fluid.CPUPlace())
-
-            # Execute
-            x_i = np.array([[1, 0], [0, 1]]).astype(np.bool)
-            y_i = np.array([[1, 1], [0, 0]]).astype(np.bool)
-            res_val, = exe.run(fluid.default_main_program(), feed={'x':x_i, 'y':y_i}, fetch_list=[res])
-            print(res_val) # [[False, True], [False, True]]
+            paddle.enable_imperative()
+            x_data = np.array([True, True, False, False], dtype=np.bool)
+            y_data = np.array([True, False, True, False], dtype=np.bool)
+            x = paddle.imperative.to_variable(x_data)
+            y = paddle.imperative.to_variable(y_data)
+            res = paddle.logical_xor(x, y)
+            print(res.numpy()) # [False  True  True False]
     """
 
     return _logical_op(
@@ -12147,46 +12132,34 @@ def logical_xor(x, y, out=None, name=None):
 def logical_not(x, out=None, name=None):
     """
     :alias_main: paddle.logical_not
-	:alias: paddle.logical_not,paddle.tensor.logical_not,paddle.tensor.logic.logical_not
-	:old_api: paddle.fluid.layers.logical_not
-
-    logical_not Operator
+    :alias: paddle.logical_not, paddle.tensor.logical_not, paddle.tensor.logic.logical_not
+    :old_api: paddle.fluid.layers.logical_not
 
-    It operates element-wise on X, and returns the Out. X and Out are N-dim boolean LoDTensor or Tensor.
-    Each element of Out is calculated by
+    ``logical_not`` operator computes element-wise logical NOT on ``x``, and returns ``out``. ``x`` and ``out`` are N-dim boolean ``Variable``.
+    Each element of ``out`` is calculated by
 
     .. math::
 
-        Out = \lnot X
+        out = !x
 
     Args:
-        x(${x_type}): ${x_comment}
-        out(LoDTensor/Tensor): The LoDTensor/Tensor that specifies the output of the operator, which can be any Variable that has been created in the program. The default value is None, and a new Variable will be created to save the output.
-        name(str|None): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`
+        x(${x_type}): ${x_comment}.
+        out(Variable): The ``Variable`` that specifies the output of the operator, which can be any ``Variable`` that has been created in the program. The default value is None, and a new ``Variable` will be created to save the output.
+        name(str|None): The default value is None. Normally there is no need for users to set this property. For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
         ${out_type}: ${out_comment}
 
     Examples:
         .. code-block:: python
-
-            import paddle.fluid as fluid
+            import paddle
             import numpy as np
 
-            # Graph organizing
-            x = fluid.layers.data(name='x', shape=[2], dtype='bool')
-            res = fluid.layers.logical_not(x)
-            # The comment lists another avaliable method.
-            # res = fluid.layers.fill_constant(shape=[2], dtype='bool', value=0)
-            # fluid.layers.logical_not(x, out=res)
-
-            # Create an executor using CPU as an example
-            exe = fluid.Executor(fluid.CPUPlace())
-
-            # Execute
-            x_i = np.array([[1, 0]]).astype(np.bool)
-            res_val, = exe.run(fluid.default_main_program(), feed={'x':x_i}, fetch_list=[res])
-            print(res_val) # [[False, True]]
+            paddle.enable_imperative()
+            x_data = np.array([True, False, True, False], dtype=np.bool)
+            x = paddle.imperative.to_variable(x_data)
+            res = paddle.logical_not(x)
+            print(res.numpy()) # [False  True False  True]
     """
 
     return _logical_op(
@@ -12796,16 +12769,14 @@ def hash(input, hash_size, num_hash=1, name=None):
 
             place = fluid.core.CPUPlace()
 
-            x = fluid.data(name="x", shape=[1], dtype="int32", lod_level=1)
-            res = fluid.layers.hash(name="res",input=x, hash_size=1000, num_hash=4)
+            x = fluid.data(name="x", shape=[2,2], dtype="int32", lod_level=1)
+            res = fluid.layers.hash(name="res", input=x, hash_size=1000, num_hash=4)
 
             exe = fluid.Executor(place)
             exe.run(fluid.default_startup_program())
             in1 = np.array([[1,2],[3,4]]).astype("int32")
             print(in1)
-            x_i = fluid.core.LoDTensor()
-            x_i.set(in1,place)
-            x_i.set_recursive_sequence_lengths([[0,2]])
+            x_i = fluid.create_lod_tensor(in1, [[0, 2]], place)
             res = exe.run(fluid.default_main_program(), feed={'x':x_i}, fetch_list=[res], return_numpy=False)
             print(np.array(res[0]))
             # [[[722]
@@ -12818,8 +12789,8 @@ def hash(input, hash_size, num_hash=1, name=None):
             #   [901]]]
     """
     check_variable_and_dtype(input, 'input', ['int32', 'int64'], 'hash')
-    check_type(hash_size, 'hash_size', ['int32', 'int64'], 'hash')
-    check_type(num_hash, 'num_hash', ['int32', 'int64'], 'hash')
+    check_type(hash_size, 'hash_size', int, 'hash')
+    check_type(num_hash, 'num_hash', int, 'hash')
     helper = LayerHelper('hash', **locals())
     out = helper.create_variable_for_type_inference(
         helper.input_dtype(), stop_gradient=True)
@@ -14038,7 +14009,7 @@ def unique(x, dtype='int32'):
 
              import numpy as np
              import paddle.fluid as fluid
-             x = fluid.assign(np.array([2, 3, 3, 1, 5, 3], dtype='int32'))
+             x = fluid.layers.assign(np.array([2, 3, 3, 1, 5, 3], dtype='int32'))
              out, index = fluid.layers.unique(x) # out is [2, 3, 1, 5]; index is [0, 1, 1, 2, 3, 1]
     """
 
@@ -14784,6 +14755,81 @@ def hard_swish(x, threshold=6.0, scale=6.0, offset=3.0, name=None):
     return out
 
 
+@templatedoc()
+def mish(x, threshold=20, name=None):
+    """
+    This operator implements the mish activation function.
+    Refer to `Mish: A Self Regularized Non-Monotonic Neural
+    Activation Function <https://arxiv.org/abs/1908.08681>`_
+
+
+    The formula is as follows if :attr:`threshold` is :code:`None` or negative:
+
+    .. math::
+
+        out = x * \\tanh(\\ln(1 + e^{x}))
+
+    The formula is as follows if :attr:`threshold` is set as positive value:
+
+    .. math::
+
+	out = \\begin{cases}
+		x \\ast \\tanh(x), \\text{if } x > \\text{threshold} \\\\
+		x \\ast \\tanh(e^{x}), \\text{if } x < -\\text{threshold} \\\\
+		x \\ast \\tanh(\\ln(1 + e^{x})),  \\text{otherwise}
+	      \\end{cases}
+
+    Args:
+        x (Variable): Input feature, multi-dimensional Tensor. The data type
+                      should be float16, float32 or float64.
+        threshold (float|None): threshold for softplus in Mish operator.
+                Approximate value of softplus will be used if absolute value
+                of input is greater than :attr:threshold and :attr:threshold
+                is set as positive value. For none or negative threshold,
+                approximate value is not used. Default 20.
+        name (str, optional): The default value is None. Normally there is no
+                need for user to set this property. For more information, please
+                refer to :ref:`api_guide_Name`
+
+    Returns:
+        Variable: The output tensor with the same shape and data type as input.
+
+
+    Examples:
+
+    .. code-block:: python
+
+        import paddle.fluid as fluid
+        import numpy as np
+
+        DATATYPE='float32'
+
+        x_data = np.array([i for i in range(1,5)]).reshape([1,1,4]).astype(DATATYPE)
+
+        x = fluid.data(name="x", shape=[None,1,4], dtype=DATATYPE)
+        y = fluid.layers.mish(x)
+
+        place = fluid.CPUPlace()
+        # place = fluid.CUDAPlace(0)
+        exe = fluid.Executor(place)
+        out, = exe.run(feed={'x':x_data}, fetch_list=[y.name])
+        print(out)  # [[0.66666667, 1.66666667, 3., 4.]]
+    """
+    check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'mish')
+    check_type(threshold, 'threshold', (float, int), 'mish')
+    assert threshold > 0, "threshold of mish should be greater than 0, " \
+                          "but got {}".format(threshold)
+
+    helper = LayerHelper('mish', **locals())
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
+    helper.append_op(
+        type='mish',
+        inputs={'X': x},
+        outputs={'Out': out},
+        attrs={'threshold': threshold or -1})
+    return out
+
+
 def gather_tree(ids, parents):
     """
     To be used after beam search. After beam search, we get selected ids at
@@ -14863,7 +14909,8 @@ def gather_tree(ids, parents):
 
 
 @templatedoc()
-def uniform_random(shape, dtype='float32', min=-1.0, max=1.0, seed=0):
+def uniform_random(shape, dtype='float32', min=-1.0, max=1.0, seed=0,
+                   name=None):
     """
     This OP initializes a variable with random values sampled from a
     uniform distribution in the range [min, max).
@@ -14878,18 +14925,24 @@ def uniform_random(shape, dtype='float32', min=-1.0, max=1.0, seed=0):
           result=[[0.8505902, 0.8397286]]
 
     Args:
-        shape (list|tuple|Variable): The shape of the output Tensor,  if the shape is a list or tuple,
-                                     its elements can be an integer
-                                     or a Tensor with the shape [1], and the type of the Tensor must be int32 or int64.
-                                     If the shape is a Variable, it is a 1-D Tensor, and the type of the Tensor must be int32 or int64.
-        dtype(np.dtype|core.VarDesc.VarType|str, optional): The type of the output Tensor. Supported data types: float32, float64.
-                                                  Default: float32.
-        min (float, optional): The lower bound on the range of random values to generate, the min is included in the range. Default -1.0.
-        max (float, optional): The upper bound on the range of random values to generate, the max is excluded in the range. Default 1.0.
-        seed (int, optional): Random seed used for generating samples. 0 means use a
-            seed generated by the system. Note that if seed is not 0, this
-            operator will always generate the same random numbers every time.
-            Default 0.
+        shape (list|tuple|Variable): The shape of the output Tensor,  if the
+            shape is a list or tuple, its elements can be an integer or a
+            Tensor with the shape [1], and the type of the Tensor must be
+            int32 or int64. If the shape is a Variable, it is a 1-D Tensor, and
+            the type of the Tensor must be int32 or int64.
+        dtype(np.dtype|core.VarDesc.VarType|str, optional): The type of the
+            output Tensor. Supported data types: float32, float64. Default: float32.
+        min (float, optional): The lower bound on the range of random values
+            to generate, the min is included in the range. Default -1.0.
+        max (float, optional): The upper bound on the range of random values
+            to generate, the max is excluded in the range. Default 1.0.
+        seed (int, optional): Random seed used for generating samples. 0 means
+            use a seed generated by the system. Note that if seed is not 0,
+            this operator will always generate the same random numbers every
+            time. Default 0.
+        name(str, optional): The default value is None. Normally there is no
+            need for user to set this property. For more information, please
+            refer to :ref:`api_guide_Name`.
 
     Returns:
         Variable: A Tensor of the specified shape filled with uniform_random values.
@@ -14919,62 +14972,30 @@ def uniform_random(shape, dtype='float32', min=-1.0, max=1.0, seed=0):
             var_shape_int32 = fluid.data(name='var_shape_int32', shape=[2], dtype="int32")
             result_4 = fluid.layers.uniform_random(var_shape_int32)
 
-
-
     """
-    check_type(shape, 'shape', (list, tuple, Variable), 'uniform_random')
     if not isinstance(dtype, core.VarDesc.VarType):
         dtype = convert_np_dtype_to_dtype_(dtype)
-    check_dtype(dtype, 'dtype', ('float32', 'float64'), 'uniform_random')
 
-    def get_new_shape_tensor(list_shape):
-        new_shape_tensor = []
-        for dim in list_shape:
-            if isinstance(dim, Variable):
-                dim.stop_gradient = True
-                new_shape_tensor.append(dim)
-            else:
-                assert (isinstance(dim, int))
-                temp_out = helper.create_variable_for_type_inference('int64')
-                fill_constant([1], 'int64', dim, force_cpu=True, out=temp_out)
-                new_shape_tensor.append(temp_out)
-        return new_shape_tensor
+    if in_dygraph_mode():
+        shape = utils._convert_shape_to_list(shape)
+        return core.ops.uniform_random('shape', shape, 'min',
+                                       float(min), 'max',
+                                       float(max), 'seed', seed, 'dtype', dtype)
 
-    def get_attr_shape(list_shape):
-        unk_dim_idx = -1
-        attrs_shape = []
-        for dim_idx, dim_size in enumerate(list_shape):
-            if isinstance(dim_size, Variable):
-                attrs_shape.append(-1)
-            else:
-                attrs_shape.append(dim_size)
-                assert dim_size > 0, (
-                    "Each dimension size given in shape must not be negative "
-                    "except one unknown dimension.")
-        return attrs_shape
+    check_type(shape, 'shape', (list, tuple, Variable), 'uniform_random/rand')
+    check_dtype(dtype, 'dtype', ('float32', 'float64'), 'uniform_random/rand')
 
-    helper = LayerHelper("uniform_random", **locals())
     inputs = dict()
     attrs = {'seed': seed, 'min': min, 'max': max, 'dtype': dtype}
-    if in_dygraph_mode():
-        attrs['shape'] = shape
-    else:
-        if isinstance(shape, Variable):
-            shape.stop_gradient = True
-            inputs["ShapeTensor"] = shape
-        elif isinstance(shape, (list, tuple)):
-            assert len(shape) > 0, (
-                "The size of argument(shape) can't be zero.")
-            attrs["shape"] = get_attr_shape(shape)
-            if utils._contain_var(shape):
-                inputs['ShapeTensorList'] = get_new_shape_tensor(shape)
+    utils._get_shape_tensor_inputs(
+        inputs=inputs, attrs=attrs, shape=shape, op_type='uniform_random/rand')
 
+    helper = LayerHelper("uniform_random", **locals())
     out = helper.create_variable_for_type_inference(dtype)
     helper.append_op(
         type="uniform_random", inputs=inputs, attrs=attrs,
         outputs={"Out": out})
-
-    return helper.append_activation(out)
+    return out
 
 
 def unbind(input, axis=0):
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index a7f10584b73f99e65f3a39a94c77e0b5980e2b0c..eea2d82bf816cf0195509381a44b32f35170ed53 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -620,7 +620,7 @@ def assign(input, output=None):
     return output
 
 
-def fill_constant(shape, dtype, value, force_cpu=False, out=None):
+def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None):
     """
 	:alias_main: paddle.fill_constant
 	:alias: paddle.fill_constant,paddle.tensor.fill_constant,paddle.tensor.creation.fill_constant
@@ -638,12 +638,14 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None):
                 If ``shape`` is an Variable, it should be an 1-D Tensor .
         dtype(np.dtype|core.VarDesc.VarType|str): Data type of the output tensor which can
             be float16, float32, float64, int32, int64.
-        value(float16|float32|float64|int32|int64|Variable): The constant value used to initialize 
+        value(bool|float|int|Variable): The constant value used to initialize 
             the Tensor to be created. If value is an Variable, it should be an 1-D Tensor.
         force_cpu(bool): data should be on CPU if it's true, default value is False.
         out(Variable, optional): Optional output which can be any created 
             Variable that meets the requirements to store the result of operation.
             if out is None, a new Varibale will be create to store the result.
+        name(str, optional): The default value is None.  Normally there is no need for user to set this
+            property.  For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
         Variable: Tensor which is created according to shape and dtype.
@@ -666,31 +668,23 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None):
           data3 = fluid.layers.fill_constant(shape=[1, positive_2], dtype='float32', value=1.5) # data3=[1.5, 1.5]
 
           # attr shape is an Variable Tensor.
-          shape = fluid.layers.fill_constant([1,2], "int32", 2) # shape=[2,2]
+          shape = fluid.layers.fill_constant([2], "int32", 2) # shape=[2,2]
           data4 = fluid.layers.fill_constant(shape=shape, dtype='bool', value=True) # data4=[[True,True],[True,True]]
           
           # attr value is an Variable Tensor.
           val = fluid.layers.fill_constant([1], "float32", 2.0) # val=[2.0]
           data5 = fluid.layers.fill_constant(shape=[2,1], value=val, dtype='float32') #data5=[[2.0],[2.0]]
     """
-    inputs = {}
+
     attrs = {'force_cpu': force_cpu}
-    if isinstance(value, Variable):
-        inputs['ValueTensor'] = value
-    else:
-        attrs['value'] = float(value)
+    if not isinstance(value, Variable):
         if convert_dtype(dtype) in ['int64', 'int32']:
             attrs['str_value'] = str(int(value))
         else:
             attrs['str_value'] = str(float(value))
 
     if in_dygraph_mode():
-        if isinstance(shape, (list, tuple)):
-            shape = list(
-                map(lambda x: x.numpy()[0] if isinstance(x, Variable) else x,
-                    shape))
-        else:
-            shape = list(shape.numpy().astype(int))
+        shape = utils._convert_shape_to_list(shape)
         if out is None:
             out = _varbase_creator(dtype=dtype)
 
@@ -707,24 +701,26 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None):
         out.stop_gradient = True
         return out
 
+    helper = LayerHelper("fill_constant", **locals())
+    inputs = {}
+    if isinstance(value, Variable):
+        inputs['ValueTensor'] = value
+
     check_dtype(dtype, 'dtype',
                 ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'],
                 'fill_constant')
     check_type(shape, 'shape', (Variable, list, tuple), 'fill_constant')
+
     if isinstance(shape, Variable):
-        check_variable_and_dtype(shape, 'shape', ['int32', 'int64'],
-                                 'fill_constant')
+        check_dtype(shape.dtype, 'shape', ['int32', 'int64'], 'fill_constant')
+
     if out is not None:
         check_variable_and_dtype(out, 'out', [convert_dtype(dtype)],
                                  'fill_constant')
 
     helper = LayerHelper("fill_constant", **locals())
-    inputs = utils._get_shape_tensor_inputs(
-        inputs=inputs,
-        helper=helper,
-        attrs=attrs,
-        shape=shape,
-        op_type='fill_constant')
+    utils._get_shape_tensor_inputs(
+        inputs=inputs, attrs=attrs, shape=shape, op_type='fill_constant')
 
     if out is None:
         out = helper.create_variable_for_type_inference(dtype=dtype)
@@ -1057,7 +1053,7 @@ def ones(shape, dtype, force_cpu=False):
     return fill_constant(value=1.0, **locals())
 
 
-def zeros(shape, dtype, force_cpu=False):
+def zeros(shape, dtype, force_cpu=False, name=None):
     """
     The OP creates a tensor of specified :attr:`shape` and :attr:`dtype`, and fills it with 0.
     Its :attr:`stop_gradient` will be set to True to stop gradient computation.
@@ -1069,6 +1065,8 @@ def zeros(shape, dtype, force_cpu=False):
         force_cpu (bool, optional): Whether force to store the output tensor in CPU memory.
             If :attr:`force_cpu` is False, the output tensor will be stored in running device memory.
             Default: False.
+        name(str, optional): The default value is None.  Normally there is no need for user to set this
+            property.  For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
         Variable: A tensor of data type :attr:`dtype` with shape :attr:`shape` and all elements set to 0.
@@ -1079,10 +1077,6 @@ def zeros(shape, dtype, force_cpu=False):
           import paddle.fluid as fluid
           data = fluid.layers.zeros(shape=[3, 2], dtype='float32') # [[0., 0.], [0., 0.], [0., 0.]]
     """
-    check_type(shape, 'shape', (list, tuple), 'zeros')
-    check_dtype(dtype, 'create data type',
-                ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'],
-                'zeros')
     return fill_constant(value=0.0, **locals())
 
 
@@ -1328,25 +1322,35 @@ def isfinite(x):
     return out
 
 
-def range(start, end, step, dtype):
+def range(start, end, step, dtype, name=None):
     """
     Return evenly spaced values within a given interval.
 
-    Values are generated within the half-open interval [start, stop) (in other words,
-    the interval including start but excluding stop).
+    Values are generated within the half-open interval [start, stop) (in other
+    words, the interval including start but excluding stop).
+
+    If dtype is float32 or float64, we advise adding a small epsilon to end to
+    avoid floating point rounding errors when comparing against end.
 
     Parameters:
-        start(float32 | float64 | int32 | int64 | Variable): Start of interval. The interval includes this value.
-            when start is Variable, it is a 1-D Tensor with shape [1].
-        end(float32 | float64 | int32 | int64 | Variable): End of interval. The interval does not include this
-                                 value, except in some cases where step is not an integer
-                                 and floating point round-off affects the length of out. When end is Variable,
-                                 it is a 1-D Tensor with shape [1].
-        step(float32 | float64 | int32 | int64 | Variable): Spacing between values. For any output out, this is the
-                                  distance between two adjacent values, out[i+1] - out[i].
-        dtype(str|core.VarDesc.VarType): the data type of the output tensor, can be float32, float64, int32, int64.
-
-    Returns: a 1-D Tensor which is evenly spaced values within a given interval. Its data type is set by dtype.
+        start(float|int|Variable): Start of interval. The interval includes
+            this value. If start is Variable, it is a 1-D Tensor with shape [1],
+            and it's data type should be one of int32, int64, float32, float64.
+        end(float|int|Variable): End of interval. The interval does not include
+            this value. When end is Variable, it is a 1-D Tensor with shape [1],
+            and it's data type should be int32, int64, float32, float64.
+        step(float|int|Variable): Spacing between values. For any out, this is
+            the istance between two adjacent values, out[i+1] - out[i].
+            When end is Variable, it is a 1-D Tensor with shape [1], and it's
+            data type should be one of int32, int64, float32, float64.
+        dtype(str|np.dtype|core.VarDesc.VarType): The data type of the output
+            tensor, can be float32, float64, int32, int64.
+        name(str, optional): Normally there is no need for user to set this property.
+            For more information, please refer to :ref:`api_guide_Name` .
+            Default is None.
+
+    Returns: a 1-D Tensor which is evenly spaced values within a given interval.
+        Its data type is set by dtype.
     
     Return type: Variable
 
@@ -1354,49 +1358,52 @@ def range(start, end, step, dtype):
 
         .. code-block:: python
 
-             import paddle.fluid as fluid
-             data = fluid.layers.range(0, 10, 2, 'int32')
+            import paddle.fluid as fluid
 
-    """
-    check_type(start, 'start', (float, int, Variable), 'range')
-    check_type(end, 'end', (float, int, Variable), 'range')
-    check_type(step, 'step', (float, int, Variable), 'range')
-    helper = LayerHelper("range", **locals())
+            out1 = fluid.layers.range(0, 10, 2, 'int32')
+            # [0, 2, 4, 6, 8]
 
-    check_dtype(dtype, 'create data type',
-                ['float32', 'float64', 'int32', 'int64'], 'range')
+            start_var = fluid.layers.fill_constant([1], 'int64', 3)
+            out2 = fluid.layers.range(start_var, 7, 1, 'int64')
+            # [3, 4, 5, 6]
+
+    """
+    if not isinstance(dtype, core.VarDesc.VarType):
+        dtype = convert_np_dtype_to_dtype_(dtype)
 
-    dtype = convert_dtype(dtype)
     if not isinstance(start, Variable):
         start = fill_constant([1], dtype, start)
-    elif convert_dtype(start.dtype) != dtype:
-        # make sure that start, end, step has the same dtype as
-        # `dtype`
-        start = cast(x=start, dtype=dtype)
+    elif start.dtype != dtype:
+        start = cast(start, dtype)
 
     if not isinstance(end, Variable):
         end = fill_constant([1], dtype, end)
-    elif convert_dtype(end.dtype) != dtype:
-        end = cast(x=end, dtype=dtype)
+    elif end.dtype != dtype:
+        end = cast(end, dtype)
 
     if not isinstance(step, Variable):
         step = fill_constant([1], dtype, step)
-    elif convert_dtype(step.dtype) != dtype:
-        step = cast(x=step, dtype=dtype)
+    elif step.dtype != dtype:
+        step = cast(step, dtype)
 
-    out = helper.create_variable_for_type_inference(dtype=start.dtype)
+    if in_dygraph_mode():
+        return core.ops.range(start, end, step)
 
+    check_dtype(dtype, 'dtype', ['float32', 'float64', 'int32', 'int64'],
+                'range/arange')
+    helper = LayerHelper('range', **locals())
+    out = helper.create_variable_for_type_inference(dtype)
     helper.append_op(
         type='range',
         inputs={'Start': start,
                 'End': end,
                 'Step': step},
-        outputs={'Out': [out]})
+        outputs={'Out': out})
     out.stop_gradient = True
     return out
 
 
-def linspace(start, stop, num, dtype):
+def linspace(start, stop, num, dtype=None, name=None):
     """
     This OP return fixed number of evenly spaced values within a given interval.
 
@@ -1407,7 +1414,10 @@ def linspace(start, stop, num, dtype):
             or a tensor of shape [1] with input data type float32, float64.
         num(int|Variable): The input :attr:`num` is given num of the sequence. It is an int scalar, \
             or a tensor of shape [1] with type int32.
-        dtype(string): The data type of output tensor, it could be 'float32' and 'float64'.
+        dtype(np.dtype|core.VarDesc.VarType|str): The data type of output tensor, it could be 'float32' and 'float64'.
+            Default: if None, the data type is `float32`.
+        name(str, optional): Normally there is no need for user to set this property. 
+            For more information, please refer to :ref:`api_guide_Name`.Default: None.
 
     Returns:
         Variable, the output data type will be float32, float64.: The 1-D tensor with fixed number of evenly spaced values, \
@@ -1422,27 +1432,23 @@ def linspace(start, stop, num, dtype):
              data = fluid.layers.linspace(0, 10, 1, 'float32') # [0.0]
 
     """
-    helper = LayerHelper("linspace", **locals())
-
-    check_type(start, 'start', (Variable, float, int), linspace)
-    check_type(stop, 'stop', (Variable, float, int), linspace)
-    check_type(num, 'num', (Variable, float, int), linspace)
-
+    if dtype is None:
+        dtype = 'float32'
     if not isinstance(start, Variable):
         start = fill_constant([1], dtype, start)
-    else:
-        check_variable_and_dtype(start, "start", ["float32", "float64"],
-                                 "linspace")
-
     if not isinstance(stop, Variable):
         stop = fill_constant([1], dtype, stop)
-    else:
-        check_variable_and_dtype(stop, "stop", ["float32", "float64"],
-                                 "linspace")
     if not isinstance(num, Variable):
         num = fill_constant([1], 'int32', num)
-    else:
-        check_variable_and_dtype(num, "num", ["int32"], "linspace")
+    if in_dygraph_mode():
+        return core.ops.linspace(start, stop, num)
+
+    helper = LayerHelper("linspace", **locals())
+
+    check_dtype(start.dtype, 'start', ['float32', 'float64'], 'linspace')
+    check_dtype(stop.dtype, 'stop', ['float32', 'float64'], 'linspace')
+    check_dtype(num.dtype, 'num', ['int32', 'int64'], 'linspace')
+    check_dtype(dtype, 'dtype', ['float32', 'float64'], 'linspace')
 
     out = helper.create_variable_for_type_inference(dtype=start.dtype)
 
@@ -1461,14 +1467,17 @@ def zeros_like(x, out=None):
     with `x`.
 
     Args:
-        x(Variable): The input tensor which specifies shape and dtype, the input data dtype could be bool, float32, float64, int32, int64.
-        out(Variable, optional): If is :attr:`None` , the op will create the variable as output, the data type and shape of \
-            this variable will be same as input :attr:`x`. If is a tensor, the data type and shape need to be same as input :attr:`x`. 
-            The default value is :attr:`None` .
+        x(Variable): The input tensor which specifies shape and dtype, the
+            input data dtype could be bool, float32, float64, int32, int64.
+        out(Variable, optional): If is :attr:`None` , the op will create the
+            variable as output, the data type and shape of this variable will
+            be same as input :attr:`x`. If is a tensor, the data type and shape
+            need to be same as input :attr:`x`. The default value is :attr:`None` .
 
     Returns:
-        Variable: The N-D tensor, the element in tensor is related to input data type, if the input data type is bool, \
-            the output value is False, otherwise is zero. The output shape is the same as the input.
+        Variable: The N-D tensor, the element in tensor is related to input
+            data type, if the input data type is bool, the output value is
+            False, otherwise is zero. The output shape is the same as the input.
 
     Examples:
         .. code-block:: python
@@ -1487,7 +1496,7 @@ def zeros_like(x, out=None):
     else:
         check_variable_and_dtype(
             out, "out", ['bool', 'float32', 'float64', 'int32', 'int64'],
-            'ones_like')
+            'zeros_like')
 
     helper.append_op(
         type='fill_zeros_like', inputs={'X': [x]}, outputs={'Out': [out]})
diff --git a/python/paddle/fluid/layers/utils.py b/python/paddle/fluid/layers/utils.py
index df00a0f561ffcf1db214d52eef3ef4509bb6b074..0d6965239e14b92d3d4997a9cf8efbe3fa7048b7 100644
--- a/python/paddle/fluid/layers/utils.py
+++ b/python/paddle/fluid/layers/utils.py
@@ -282,7 +282,7 @@ def _contain_var(list_or_tuple):
     return False
 
 
-def _get_shape_tensor_inputs(inputs, helper, attrs, shape, op_type):
+def _get_shape_tensor_inputs(inputs, attrs, shape, op_type):
     from .tensor import fill_constant, cast
 
     def _get_attr_shape(list_shape):
@@ -295,7 +295,7 @@ def _get_shape_tensor_inputs(inputs, helper, attrs, shape, op_type):
         return attr_shape
 
     def _get_shape_tensor(list_shape):
-        new_shape_tensor = []
+        shape_tensor_list = []
         for idx, dim in enumerate(list_shape):
             if isinstance(dim, Variable):
                 dim.stop_gradient = True
@@ -305,11 +305,11 @@ def _get_shape_tensor_inputs(inputs, helper, attrs, shape, op_type):
                     '(When type of shape in' + op_type + 'is list or tuple.)')
                 if convert_dtype(dim.dtype) == 'int64':
                     dim = cast(x=dim, dtype='int32')
-                new_shape_tensor.append(dim)
+                shape_tensor_list.append(dim)
             else:
                 temp_out = fill_constant([1], 'int32', dim, force_cpu=True)
-                new_shape_tensor.append(temp_out)
-        return new_shape_tensor
+                shape_tensor_list.append(temp_out)
+        return shape_tensor_list
 
     if isinstance(shape, Variable):
         shape.stop_gradient = True
@@ -325,8 +325,8 @@ def _get_shape_tensor_inputs(inputs, helper, attrs, shape, op_type):
         attrs["shape"] = _get_attr_shape(shape)
         if _contain_var(shape):
             inputs['ShapeTensorList'] = _get_shape_tensor(shape)
-
-    return inputs
+    else:
+        raise TypeError("Shape only supports Variable, or list, or tuple.")
 
 
 def _convert_to_tensor_list(old_list, dtype="int32"):
@@ -345,3 +345,16 @@ def _convert_to_tensor_list(old_list, dtype="int32"):
             temp_out = fill_constant([1], dtype, ele, force_cpu=True)
             new_list_tensor.append(temp_out)
     return new_list_tensor
+
+
+def _convert_shape_to_list(shape):
+    """
+    Convert shape(list, tuple, variable) to list in imperative mode
+    """
+    if isinstance(shape, (list, tuple)):
+        shape = list(
+            map(lambda x: x.numpy()[0] if isinstance(x, Variable) else x,
+                shape))
+    else:
+        shape = list(shape.numpy().astype(int))
+    return shape
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 04271d715f7bdc4cd087420756aacb8e0a8ae428..165c44b96407bf8e0d359c055ff6d13bf04665c9 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -33,7 +33,7 @@ from .layers import ops
 from .regularizer import append_regularization_ops
 from .dygraph import base as imperative_base
 from .dygraph import no_grad
-from .dygraph.learning_rate_scheduler import LearningRateDecay
+from .dygraph.learning_rate_scheduler import LearningRateDecay, _LearningRateEpochDecay
 from paddle.fluid import core
 from paddle.fluid.layers import tensor
 from functools import reduce
@@ -67,7 +67,8 @@ class Optimizer(object):
                  regularization=None,
                  grad_clip=None,
                  name=None):
-        self._parameter_list = parameter_list
+        self._parameter_list = list(
+            parameter_list) if parameter_list is not None else None
         self._name = name
         if framework.in_dygraph_mode():
             if not isinstance(learning_rate, float) and \
@@ -148,17 +149,17 @@ class Optimizer(object):
                 state_dict[var_tmp.name] = var_tmp
         # global step if use lr decay
         if isinstance(self._learning_rate, LearningRateDecay):
-            var_tmp = None
-            if framework.in_dygraph_mode():
+            state_dict["LR_Scheduler"] = self._learning_rate.state_dict()
+
+            if not isinstance(self._learning_rate, _LearningRateEpochDecay):
+                var_tmp = None
                 var_temp = framework._varbase_creator(
                     None, name='global_step', dtype='int32')
-            else:
-                var_temp = Variable(None, name='global_step', dtype='int32')
 
-            tensor.fill_constant(
-                [1], "int32", self._learning_rate.step_num, out=var_temp)
+                tensor.fill_constant(
+                    [1], "int32", self._learning_rate.step_num, out=var_temp)
 
-            state_dict['global_step'] = var_temp
+                state_dict['global_step'] = var_temp
         return state_dict
 
     @framework.dygraph_only
@@ -192,30 +193,28 @@ class Optimizer(object):
         '''
 
         if isinstance(self._learning_rate, LearningRateDecay):
-            assert 'global_step' in state_dict, \
-                    'Global step not in state dict, Dygraph use LearningRateDecay, global_step must in state_dict'
-            global_step = state_dict['global_step']
-
-            if isinstance(global_step, core.VarBase):
-                step_np = global_step
-                step_np = np.array(step_np.value().get_tensor())
-                assert step_np.shape == (1,),  \
-                        "global step shape is (1,), the shape is {}".format( step_np.shape )
-
-                self._learning_rate.step_num = int(step_np[0])
-            elif isinstance(global_step, Variable):
-                step_np = global_step.numpy()
-                assert step_np.shape == (1,),  \
-                        "global step shape is (1,), the shape is {}".format( step_np.shape )
-                self._learning_rate.step_num = step_np[0]
-            elif isinstance(global_step, np.ndarray):
-                assert global_step.shape == (1,),  \
-                        "global step shape is (1,), the shape is {}".format( global_step.shape )
-                self._learning_rate.step_num = global_step[0]
-            else:
-                raise RuntimeError(
-                    "Type not supprt, value in state dict must be [VarBase, Variable, numpy], the type is ",
-                    type(global_step))
+            self._learning_rate.set_dict(state_dict["LR_Scheduler"])
+
+            if not isinstance(self._learning_rate, _LearningRateEpochDecay):
+                assert 'global_step' in state_dict, \
+                        'Global step not in state dict, Dygraph use LearningRateDecay, global_step must in state_dict'
+                global_step = state_dict['global_step']
+
+                if isinstance(global_step, Variable):
+                    step_np = global_step
+                    step_np = np.array(step_np.value().get_tensor())
+                    assert step_np.shape == (1,),  \
+                            "global step shape is (1,), the shape is {}".format( step_np.shape )
+
+                    self._learning_rate.step_num = int(step_np[0])
+                elif isinstance(global_step, np.ndarray):
+                    assert global_step.shape == (1,),  \
+                            "global step shape is (1,), the shape is {}".format( global_step.shape )
+                    self._learning_rate.step_num = global_step[0]
+                else:
+                    raise RuntimeError(
+                        "Type not supprt, value in state dict must be [VarBase, Variable, numpy], the type is ",
+                        type(global_step))
 
         self._accumulators_holder = state_dict
         for k, v in self._accumulators.items():
@@ -296,11 +295,87 @@ class Optimizer(object):
                 dtype='float32' if self._dtype is None else self._dtype,
                 persistable=True)
 
+    @framework.dygraph_only
+    def set_lr(self, value):
+        """
+        :api_attr: imperative
+        
+        Set the value of the learning rate manually in the optimizer. If the optimizer use LearningRateDecay,
+        this API cannot be invoked, because it will lead to conflict.
+
+        Args:
+            value (float|Variable): the value of learning rate
+
+        Returns:
+            None
+          
+        Examples:
+            .. code-block:: python
+
+                import paddle.fluid as fluid
+                        
+                with fluid.dygraph.guard():
+                    linear = fluid.dygraph.nn.Linear(10, 10)
+
+                    adam = fluid.optimizer.Adam(0.1, parameter_list=linear.parameters())
+
+                    # set learning rate manually by python float value
+                    lr_list = [0.2, 0.3, 0.4, 0.5, 0.6]
+                    for i in range(5):
+                        adam.set_lr(lr_list[i])
+                        lr = adam.current_step_lr()
+                        print("current lr is {}".format(lr))
+                    # Print:
+                    #    current lr is 0.2
+                    #    current lr is 0.3
+                    #    current lr is 0.4
+                    #    current lr is 0.5
+                    #    current lr is 0.6
+
+
+                    # set learning rate manually by framework Variable
+                    lr_var = fluid.layers.create_global_var(
+                        shape=[1], value=0.7, dtype='float32')
+                    adam.set_lr(lr_var)
+                    lr = adam.current_step_lr()
+                    print("current lr is {}".format(lr))
+                    # Print:
+                    #    current lr is 0.7
+
+
+
+        """
+        if not isinstance(value, (framework.Variable, float)):
+            raise TypeError(
+                "The type of 'value' in optimizer.set_lr must be (float, Variable), but received %s."
+                % (type(value)))
+        if isinstance(self._learning_rate, LearningRateDecay):
+            raise RuntimeError(
+                "optimizer's learning rate can't be LearningRateDecay when invoke this API, because this will lead to conflict."
+            )
+        if isinstance(value, float):
+            self._learning_rate = value
+            current_lr = self._global_learning_rate()
+            if current_lr is not None:
+                global_block = framework.default_main_program().global_block()
+                global_block.append_op(
+                    type='fill_constant',
+                    outputs={'Out': [current_lr]},
+                    attrs={
+                        'dtype': current_lr.dtype,
+                        'shape': list(current_lr.shape),
+                        'value': float(value)
+                    },
+                    stop_gradient=True)
+        else:
+            assert len(value.shape) == 1 and value.shape[
+                0] == 1, "optimizer's learning rate must be 1-D Tensor with shape[1]"
+            self._learning_rate_map[framework.default_main_program()] = value
+
     @framework.dygraph_only
     def current_step_lr(self):
         """
-        .. note::
-          **This API is ONLY available in Dygraph mode**
+        :api_attr: imperative
         
         Get current step learning rate. The return value is all the same When LearningRateDecay is not used,
         otherwise return the step learning rate.
@@ -346,11 +421,14 @@ class Optimizer(object):
 
         """
         current_lr = self._global_learning_rate()
-        if current_lr:
+        if isinstance(current_lr, framework.Variable):
             return self._global_learning_rate().numpy()[0]
 
         if isinstance(self._learning_rate, float):
             return self._learning_rate
+        elif isinstance(self._learning_rate, _LearningRateEpochDecay):
+            step_lr = self._learning_rate()
+            return step_lr.numpy()[0]
         else:
             step_lr = self._learning_rate.step()
             if isinstance(step_lr, (float, int)):
@@ -629,7 +707,7 @@ class Optimizer(object):
             startup_program (Program, optional): :ref:`api_fluid_Program` for
                 initializing parameters in ``parameter_list``. The default value
                 is None, at this time :ref:`api_fluid_default_startup_program` will be used.
-            parameter_list (list, optional): List of ``Variable`` or ``Variable.name`` to update
+            parameter_list (Iterable, optional): Iterable of ``Variable`` or ``Variable.name`` to update
                 to minimize ``loss``. The default value is None, at this time all parameters
                 will be updated.
             no_grad_set (set, optional): Set of ``Variable``  or ``Variable.name`` that don't need
@@ -806,7 +884,7 @@ class Optimizer(object):
             startup_program (Program, optional): :ref:`api_fluid_Program` for
                 initializing parameters in ``parameter_list``. The default value
                 is None, at this time :ref:`api_fluid_default_startup_program` will be used.
-            parameter_list (list, optional): List of ``Variable`` or ``Variable.name`` to update
+            parameter_list (Iterable, optional): Iterable of ``Variable`` or ``Variable.name`` to update
                 to minimize ``loss``. The default value is None, at this time all parameters
                 will be updated.
             no_grad_set (set, optional): Set of ``Variable``  or ``Variable.name`` that don't need
@@ -850,7 +928,7 @@ class SGDOptimizer(Optimizer):
     Parameters:
         learning_rate (float|Variable): The learning rate used to update parameters. \
             Can be a float value or a Variable with one float value as data element.
-        parameter_list (list, optional):  List of ``Variable`` names to update to minimize ``loss``. \
+        parameter_list (Iterable, optional):  Iterable of ``Variable`` names to update to minimize ``loss``. \
             This parameter is required in dygraph mode. \
             The default value is None in static mode, at this time all parameters will be updated.
         regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \
@@ -958,7 +1036,7 @@ class MomentumOptimizer(Optimizer):
         learning_rate (float|Variable): The learning rate used to update parameters. \
             Can be a float value or a Variable with one float value as data element.
         momentum (float): Momentum factor
-        parameter_list (list, optional):  List of ``Variable`` names to update to minimize ``loss``. \
+        parameter_list (Iterable, optional):  Iterable of ``Variable`` names to update to minimize ``loss``. \
             This parameter is required in dygraph mode. \
             The default value is None in static mode, at this time all parameters will be updated.
         use_nesterov (bool, optional): Enables Nesterov momentum, default is false.
@@ -1106,7 +1184,7 @@ class DGCMomentumOptimizer(Optimizer):
         sparsity (list[float]): Get top important element from gradient tensor, the ratio is (1 - current sparsity). \
             Default is [0.999]. For example, if the sparsity is [0.99, 0.999], \
                 the top [1%, 0.1%] important element will be transmitted.
-        parameter_list (list, optional):  List of ``Variable`` names to update to minimize ``loss``. \
+        parameter_list (Iterable, optional):  Iterable of ``Variable`` names to update to minimize ``loss``. \
             This parameter is required in dygraph mode. \
             The default value is None in static mode, at this time all parameters will be updated.
         use_nesterov (bool): Enables Nesterov momentum. True means use Nesterov. Default is False.
@@ -1504,7 +1582,7 @@ class LarsMomentumOptimizer(Optimizer):
             momentum (float): momentum factor
         lars_coeff (float): Defines how much we trust the layer to change its weights.
         lars_weight_decay (float): Weight decay coefficient for decaying using LARS.
-        parameter_list (list, optional):  List of ``Variable`` names to update to minimize ``loss``. \
+        parameter_list (Iterable, optional):  Iterable of ``Variable`` names to update to minimize ``loss``. \
             This parameter is required in dygraph mode. \
             The default value is None in static mode, at this time all parameters will be updated.
         regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \
@@ -1623,7 +1701,7 @@ class AdagradOptimizer(Optimizer):
             It can be a float value or a ``Variable`` with a float type.
         epsilon (float, optional): A small float value for numerical stability.
             The default value is 1e-06.
-        parameter_list (list, optional):  List of ``Variable`` names to update to minimize ``loss``. \
+        parameter_list (Iterable, optional):  Iterable of ``Variable`` names to update to minimize ``loss``. \
             This parameter is required in dygraph mode. \
             The default value is None in static mode, at this time all parameters will be updated.
         regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \
@@ -1748,7 +1826,7 @@ class AdamOptimizer(Optimizer):
             The default value is 0.999.
         epsilon (float, optional): A small float value for numerical stability.
             The default value is 1e-08.
-        parameter_list (list, optional):  List of ``Variable`` names to update to minimize ``loss``. \
+        parameter_list (Iterable, optional):  Iterable of ``Variable`` names to update to minimize ``loss``. \
             This parameter is required in dygraph mode. \
             The default value is None in static mode, at this time all parameters will be updated.
         regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \
@@ -2014,7 +2092,7 @@ class AdamaxOptimizer(Optimizer):
             The default value is 0.999.
         epsilon (float, optional): A small float value for numerical stability.
             The default value is 1e-08.
-        parameter_list (list, optional):  List of ``Variable`` names to update to minimize ``loss``. \
+        parameter_list (Iterable, optional):  Iterable of ``Variable`` names to update to minimize ``loss``. \
             This parameter is required in dygraph mode. \
             The default value is None in static mode, at this time all parameters will be updated.
         regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \
@@ -2189,7 +2267,7 @@ class DpsgdOptimizer(Optimizer):
         clip (float): clipping threshold
         batch_size (float): batch size.
         sigma (float): for gaussian noise.
-        parameter_list (list, optional):  List of ``Variable`` names to update to minimize ``loss``. \
+        parameter_list (Iterable, optional):  Iterable of ``Variable`` names to update to minimize ``loss``. \
             This parameter is required in dygraph mode. \
             The default value is None in static mode, at this time all parameters will be updated.
     Notes:
@@ -2272,7 +2350,7 @@ class DecayedAdagradOptimizer(Optimizer):
         decay (float, optional): The decay rate. The default value is 0.95.
         epsilon (float, optional): A small float value for numerical stability.
             The default value is 1e-06.
-        parameter_list (list, optional):  List of ``Variable`` names to update to minimize ``loss``. \
+        parameter_list (Iterable, optional):  Iterable of ``Variable`` names to update to minimize ``loss``. \
             This parameter is required in dygraph mode. \
             The default value is None in static mode, at this time all parameters will be updated.
         regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \
@@ -2377,7 +2455,7 @@ class AdadeltaOptimizer(Optimizer):
         learning_rate (float|Variable): global learning rate.
         epsilon (float): a small float number for numeric stability. Default 1.0e-6.
         rho (float): a floating point value indicating the decay rate. Default 0.95.
-        parameter_list (list, optional):  List of ``Variable`` names to update to minimize ``loss``. \
+        parameter_list (Iterable, optional):  Iterable of ``Variable`` names to update to minimize ``loss``. \
             This parameter is required in dygraph mode. \
             The default value is None in static mode, at this time all parameters will be updated.
         regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \
@@ -2534,7 +2612,7 @@ class RMSPropOptimizer(Optimizer):
             the gradient; if False, by the uncentered second moment. Setting this to
             True may help with training, but is slightly more expensive in terms of
             computation and memory. Defaults to False.
-        parameter_list (list, optional):  List of ``Variable`` names to update to minimize ``loss``. \
+        parameter_list (Iterable, optional):  Iterable of ``Variable`` names to update to minimize ``loss``. \
             This parameter is required in dygraph mode. \
             The default value is None in static mode, at this time all parameters will be updated.
         regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \
@@ -2708,7 +2786,7 @@ class FtrlOptimizer(Optimizer):
         l1 (float): L1 regularization strength, default is 0.0.
         l2 (float): L2 regularization strength, default is 0.0.
         lr_power (float): Learning Rate Power, default is -0.5.
-        parameter_list (list, optional):  List of ``Variable`` names to update to minimize ``loss``. \
+        parameter_list (Iterable, optional):  Iterable of ``Variable`` names to update to minimize ``loss``. \
             This parameter is required in dygraph mode. \
             The default value is None in static mode, at this time all parameters will be updated.
         regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \
@@ -2815,7 +2893,7 @@ class FtrlOptimizer(Optimizer):
                 "LinearAccumOut": linear_acc
             },
             attrs={"l1": self._l1,
-                   "l2": self._l1,
+                   "l2": self._l2,
                    "lr_power": self._lr_power},
             stop_gradient=True)
 
@@ -2856,7 +2934,7 @@ class LambOptimizer(AdamOptimizer):
         beta2 (float, optional): The exponential decay rate for the 2nd moment estimates.
             Default 0.999.
         epsilon (float, optional): A small float value for numerical stability. Default 1e-6.
-        parameter_list (list, optional):  List of ``Variable`` names to update to minimize ``loss``. \
+        parameter_list (Iterable, optional):  Iterable of ``Variable`` names to update to minimize ``loss``. \
             This parameter is required in dygraph mode. \
             The default value is None in static mode, at this time all parameters will be updated.
         regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \
@@ -3557,302 +3635,851 @@ class PipelineOptimizer(object):
     """
 	:api_attr: Static Graph
 
-    Pipeline Optimizer
-
-    Train with pipeline mode. The program will be split by cut_list. 
-
-    If the len of cut_list is k, then the whole program (including \
-    backward part) will be split to 2*k-1 sections. 
-    
-    So the length of place_list and concurrency_list must be also 2*k-1.
-
-    Note: Though the asynchronous mode is applied in pipeline training to speed up, \
-    the final performance depends on the training progress of each pipeline heavily.
-
-    And we will try the synchronous mode in the future.
+    Pipeline Optimizer: Make a program to run as pipeline, that is splitting a
+    program into multiple sections (sub-programs) and each section run on a
+    device to enable the training of large scale models and the use of
+    heterogeneous devices. Meanwhile, all sections run in the stype of pipeline.
 
     Args:
-        optimizer (Optimizer): The based optimizer, such as SGD.
-        cut_list (list of Variable list): The cut variable of the main_program.
-        place_list (list of Place): The place where the section will run on.
-        concurrency_list (list of int): The concurrency degree.
-        queue_size (int): Each section will consume scopes from its in-scope queue 
-                        and produce scopes to out-scope queue. And this parameter 
-                        specify the scope queue size. [Optional. Default: 30].
-        sync_steps (int): The synchronization steps between different cards. [Optional. Default: 1].
-        start_cpu_core_id (int): specify the first cpu core id. [Optional. Default:0].
-
+        optimizer (Optimizer): The optimizer to use, such as SGD.
+        num_microbatches (int): Number of microbatches. [Optional. Default:1].
+        start_cpu_core_id (int): The first cpu core id to use. [Optional. Default:0].
+    
     Examples:
         .. code-block:: python
 
             import paddle.fluid as fluid
             import paddle.fluid.layers as layers
 
-            x = fluid.layers.data(name='x', shape=[1], dtype='int64', lod_level=0)
-            y = fluid.layers.data(name='y', shape=[1], dtype='int64', lod_level=0)
-            emb_x = layers.embedding(input=x, param_attr=fluid.ParamAttr(name="embx"), size=[10,2], is_sparse=False)
-            emb_y = layers.embedding(input=y, param_attr=fluid.ParamAttr(name="emby",learning_rate=0.9), size=[10,2], is_sparse=False)
-            concat = layers.concat([emb_x, emb_y], axis=1)
-            fc = layers.fc(input=concat, name="fc", size=1, num_flatten_dims=1, bias_attr=False)
-            loss = layers.reduce_mean(fc)
+            with fluid.device_guard("gpu:0"):
+                x = fluid.layers.data(name='x', shape=[1], dtype='int64', lod_level=0)
+                y = fluid.layers.data(name='y', shape=[1], dtype='int64', lod_level=0)
+                data_loader = fluid.io.DataLoader.from_generator(
+                    feed_list=[x, y],
+                    capacity=64,
+                    use_double_buffer=True,
+                    iterable=False)
+
+                emb_x = layers.embedding(input=x, param_attr=fluid.ParamAttr(name="embx"), size=[10,2], is_sparse=False)
+                emb_y = layers.embedding(input=y, param_attr=fluid.ParamAttr(name="emby",learning_rate=0.9), size=[10,2], is_sparse=False)
+
+            with fluid.device_guard("gpu:1"):
+                concat = layers.concat([emb_x, emb_y], axis=1)
+                fc = layers.fc(input=concat, name="fc", size=1, num_flatten_dims=1, bias_attr=False)
+                loss = layers.reduce_mean(fc)
             optimizer = fluid.optimizer.SGD(learning_rate=0.5)
-            optimizer = fluid.optimizer.PipelineOptimizer(optimizer,
-                    cut_list=[[emb_x, emb_y], [loss]],
-                    place_list=[fluid.CPUPlace(), fluid.CUDAPlace(0), fluid.CPUPlace()],
-                    concurrency_list=[1, 1, 4],
-                    queue_size=2,
-                    sync_steps=1,
-                    )
+            optimizer = fluid.optimizer.PipelineOptimizer(optimizer)
             optimizer.minimize(loss)
-            place = fluid.CPUPlace()
+
+            def train_reader():
+                for _ in range(4):
+                    x = np.random.random(size=[1]).astype('int64')
+                    y = np.random.random(size=[1]).astype('int64')
+                    yield x, y
+            data_loader.set_sample_generator(train_reader, batch_size=1)
+
+            place = fluid.CUDAPlace(0)
             exe = fluid.Executor(place)
             exe.run(fluid.default_startup_program())
+            batch_size = 1
             filelist = [] # you should set your own filelist, e.g. filelist = ["dataA.txt"]
             dataset = fluid.DatasetFactory().create_dataset("FileInstantDataset")
             dataset.set_use_var([x,y])
             dataset.set_batch_size(batch_size)
             dataset.set_filelist(filelist)
+            data_loader.start()
             exe.train_from_dataset(
-                        fluid.default_main_program(),
-                        dataset,
-                        thread=2,
-                        debug=False,
-                        fetch_list=[],
-                        fetch_info=[],
-                        print_period=1)
+                    fluid.default_main_program(),
+                    dataset)
+            data_loader.reset()
     """
 
-    def __init__(self,
-                 optimizer,
-                 cut_list=None,
-                 place_list=None,
-                 concurrency_list=None,
-                 queue_size=30,
-                 sync_steps=1,
-                 start_cpu_core_id=0):
+    def __init__(self, optimizer, num_microbatches=1, start_cpu_core_id=0):
         if framework.in_dygraph_mode():
             raise Exception("In dygraph, don't support PipelineOptimizer.")
-        # TODO: check properties
+        if not isinstance(optimizer, Optimizer):
+            raise ValueError("The 'optimizer' parameter for "
+                             "PipelineOptimizer must be an instance of "
+                             "Optimizer, but the given type is {}.".format(
+                                 type(optimizer)))
         self._optimizer = optimizer
-        self._cut_list = cut_list
-        self._place_list = place_list
-        self._concurrency_list = concurrency_list
-        self._queue_size = queue_size
-        self._sync_steps = sync_steps
+        assert num_microbatches >= 1, (
+            "num_microbatches must be a positive value.")
+        self._num_microbatches = num_microbatches
+        assert start_cpu_core_id >= 0, (
+            "start_cpu_core_id must be greater than or equal to 0.")
         self._start_cpu_core_id = start_cpu_core_id
+        self._place_list = None
+        op_maker = core.op_proto_and_checker_maker
+        self._op_role = op_maker.OpRole
+        self._op_role_key = op_maker.kOpRoleAttrName()
+        self._op_role_var_key = op_maker.kOpRoleVarAttrName()
+        self._op_device_key = op_maker.kOpDeviceAttrName()
+        self._param_device_map = dict()
 
     def _create_vars(self, block, main_program):
+        # Create vars for block, copied from main_program's global block
         used_var_set = set()
         for op_idx in range(block.desc.op_size()):
             op_desc = block.desc.op(op_idx)
             vars = op_desc.input_arg_names() + op_desc.output_arg_names()
             for var in vars:
-                if var in used_var_set:
+                # a var whose name contains "blocking_queue" 
+                # only exists in startup program 
+                if var in used_var_set or "_blocking_queue" in var:
                     continue
                 used_var_set.add(var)
                 source_var = main_program.block(0).var(str(var))
-                block._clone_variable(source_var, False)
+                if source_var.type == core.VarDesc.VarType.READER:
+                    block.create_var(name=var, type=core.VarDesc.VarType.READER)
+                else:
+                    block._clone_variable(source_var, False)
 
-    def _extract_section_opt_ops(self, ops, cut_point_name):
+    def _is_loss_grad_op(self, op):
+        if self._op_role_key not in op.attr_names:
+            return False
+        op_role = int(op.all_attrs()[self._op_role_key])
+        return op_role & int(self._op_role.Backward) and op_role & int(
+            self._op_role.Loss)
+
+    def _is_backward_op(self, op):
+        return self._op_role_key in op.attr_names and int(op.all_attrs()[
+            self._op_role_key]) & int(self._op_role.Backward)
+
+    def _is_optimize_op(self, op):
+        return self._op_role_key in op.attr_names and int(op.all_attrs()[
+            self._op_role_key]) & int(self._op_role.Optimize)
+
+    def _is_update_op(self, op):
+        return 'Param' in op.input_names and 'Grad' in op.input_names and (
+            "LearningRate" in op.input_names)
+
+    def _split_program(self, main_program):
         """
-        Extract opt ops in the given section
+        Split a program into sections according to devices that ops run on.
+
+        Args:
+            main_program (Program): the main program
         """
-        output_names = set(cut_point_name)
-        relevant_op_flags = [True] * len(ops)
-        for i, op in reversed(list(enumerate(ops))):
-            if _some_in_set_(op.desc.output_arg_names(), output_names):
-                for name in op.desc.input_arg_names():
-                    output_names.add(name)
-            else:
-                relevant_op_flags[i] = False
+        programs = []
+        # Map from device to its corresponding section program info
+        device_program_map = dict()
+        block = main_program.block(0)
 
-        op_path = [ops[i] for i in range(len(ops)) if relevant_op_flags[i]]
-        return op_path
+        for op in block.ops:
+            device = op.attr(self._op_device_key)
 
-    def _find_input_output(self, ops, name, is_forward=True):
+            if device not in device_program_map:
+                program = {"program": Program()}
+                device_program_map[device] = program
+            program = device_program_map[device]
+            op_desc = op.desc
+            ap_op = program["program"].block(0).desc.append_op()
+            ap_op.copy_from(op_desc)
+
+        for key in sorted(device_program_map.keys()):
+            program = device_program_map[key]
+            program['program']._sync_with_cpp()
+            programs.append(program)
+
+        return programs
+
+    def _find_post_op(self, ops, cur_op, var_name):
         """
-        Find the inputs or outputs of a section
+        Find the real post op that has variable named var_name as input.
+
+        Args:
+            ops (list): A list of ops.
+            cur_op (Operator): Current operator which has variable named
+                               var_name as output.
+            var_name (string): Variable name.
         """
-        all_set = set()
-        part_set = set()
+        post_op = []
+        before = True
         for op in ops:
-            if is_forward:
-                part_set.update(op.desc.output_arg_names())
-            else:
-                part_set.update(op.desc.input_arg_names())
-            all_set.update(op.desc.output_arg_names())
-            all_set.update(op.desc.input_arg_names())
-        return all_set - part_set
-
-    def _find_persistable_vars(self, ops, whole_parameters):
+            if op == cur_op:
+                before = False
+                continue
+            if before:
+                continue
+            for in_var_name in op.input_arg_names:
+                if in_var_name == var_name:
+                    post_op.append(op)
+        if post_op:
+            if not len(post_op) == 1:
+                raise ValueError("Each op can only have one post op.")
+            return post_op[0]
+        return None
+
+    def _find_real_prev_op(self, ops, cur_op, var_name):
         """
-        find the persistable input vars in current section
+        Find the real previous op that outputs variable named var_name.
+
+        Args:
+            ops (list): A list of ops.
+            cur_op (Operator): Current operator which has variable named
+                               var_name as input.
+            var_name (string): Variable name.
         """
-        res = set()
+        prev_op = []
         for op in ops:
-            vars = op.desc.input_arg_names()
-            for var in vars:
-                if var in whole_parameters:
-                    res.add(var)
-        return res
+            if op == cur_op:
+                break
+            for out_var_name in op.output_arg_names:
+                if out_var_name == var_name:
+                    prev_op.append(op)
+        if prev_op:
+            # A op may have more than one prev op,
+            # e.g., for 'learning_rate', there may be multiple ops have it as
+            # output.
+            return prev_op[-1]
+        return None
+
+    def _rename_arg(self, op, old_name, new_name):
+        op_desc = op.desc
+        if isinstance(op_desc, tuple):
+            op_desc = op_desc[0]
+        op_desc._rename_input(old_name, new_name)
+        op_desc._rename_output(old_name, new_name)
+
+    def _create_var(self, block, ref_var, name):
+        """
+        Create a new var for block, which has the same type,
+        shape and dtype as ref_var, then rename it with the
+        name `name`.
+        """
+        new_var = block.create_var(
+            name=name,
+            shape=ref_var.shape,
+            dtype=ref_var.dtype,
+            type=ref_var.type,
+            lod_level=ref_var.lod_level,
+            persistable=False,
+            is_data=False,
+            need_check_feed=ref_var.desc.need_check_feed())
+        return new_var
+
+    def _get_data_var_info(self, block):
+        """
+        Get all vars whose is_data attribute are true and then rename them.
 
-    def _is_opt_role_op(self, op):
-        op_maker = core.op_proto_and_checker_maker
-        optimize_role = core.op_proto_and_checker_maker.OpRole.Optimize
-        if op_maker.kOpRoleAttrName() in op.attr_names and \
-                int(op.all_attrs()[op_maker.kOpRoleAttrName()]) & int(optimize_role) != 0:
-            return True
-        return False
+        For PipelineTrainer, all data vars are binded to
+        minibatch scope, so we have to feed them to the microbatch
+        to avoid conflicts. The vars feeded to microbatch have to
+        be renamed.
+        """
+        # A map from var name to the renamed name.
+        raw_name_new_name_map = dict()
+        # Because we will create vars in block, it is more safe
+        # to get all var_names before iteration.
+        var_names = list(block.vars.keys())
+        for var_name in var_names:
+            var = block.var(var_name)
+            if not var.is_data:
+                continue
+            assert var_name not in raw_name_new_name_map, (
+                "{} has already been processed.".format(var_name))
+            new_name = unique_name.generate(var_name)
+            raw_name_new_name_map[var_name] = new_name
+            new_var = self._create_var(block, var, new_name)
+            new_var.is_data = False
+
+        # map of data to devices that that data on
+        data_devices_map = dict()
+        for op in block.ops:
+            dev_spec = op.attr(self._op_device_key)
+            for var_name in op.input_arg_names:
+                if var_name not in raw_name_new_name_map:
+                    continue
+                if not var_name in data_devices_map:
+                    data_devices_map[var_name] = []
+                if not dev_spec in data_devices_map[var_name]:
+                    data_devices_map[var_name].append(dev_spec)
+                new_name = raw_name_new_name_map[var_name]
+                #self._rename_arg(op, var_name, new_name)
+        return data_devices_map, raw_name_new_name_map
+
+    def _rename_var_in_block(self, block, raw_name_new_name_map):
+        """
+        Rename vars whose names in raw_name_new_name_map to the corresponding
+        new names.
+        """
+        for op in block.ops:
+            if op.type == "enqueue" or op.type == "dequeue":
+                continue
+            for var_name in op.input_arg_names:
+                if var_name in raw_name_new_name_map:
+                    new_name = raw_name_new_name_map[var_name]
+                    self._rename_arg(op, var_name, new_name)
 
-    def _is_lr_role_op(self, op):
-        op_maker = core.op_proto_and_checker_maker
-        optimize_role = core.op_proto_and_checker_maker.OpRole.LRSched
-        if op_maker.kOpRoleAttrName() in op.attr_names and \
-                int(op.all_attrs()[op_maker.kOpRoleAttrName()]) == int(optimize_role):
-            return True
-        return False
+    def _insert_enq_deq_for_data_var(self, main_block, programs, startup,
+                                     devices):
+        """
+        Insert enqueue and dequeue ops for data var
 
-    def _extract_section_ops(self, ops, cut_point_name):
+        Args:
+            main_block (Block): Global block for main program
+            programs (dict): Dictionary for section params
+            startup (Program): Startup program
+            devices (list): List of devices in the format (dev:dev_index)
+        """
+        main_program = main_block.program
+        data_devices_map, raw_name_new_name_map = self._get_data_var_info(
+            main_block)
+
+        first_prog = programs[0]['program']
+        first_block = first_prog.block(0)
+        enqueue_index = 0
+        if first_block.ops[0].type == "create_py_reader" or (
+                first_block.ops[1].type == "create_py_reader"):
+            for op in first_block.ops:
+                if op.type == "read":
+                    enqueue_index += 1
+                    break
+                enqueue_index += 1
+        first_dev_spec = devices[0]
+        for var_name in data_devices_map.keys():
+            for device in data_devices_map[var_name]:
+                # step1: generate queue for each pair of data var and device
+                # that that data on
+                queue_name = var_name + "_blocking_queue"
+                queue_name = unique_name.generate(queue_name)
+                queue_var = startup.block(0).create_var(
+                    name=queue_name,
+                    persistable=True,
+                    type=core.VarDesc.VarType.RAW)
+                startup.block(0).append_op(
+                    type='queue_generator',
+                    attrs={
+                        'names': [queue_name],
+                        'capacity': self._num_microbatches
+                    })
+                main_var = main_block.var(var_name)
+                assert main_var.is_data
+                if not var_name in first_block.vars:
+                    self._create_var(first_block, main_var, var_name)
+                first_block._insert_op(
+                    index=enqueue_index,
+                    type='enqueue',
+                    inputs={'X': first_block.var(var_name)},
+                    attrs={
+                        'queue_name': queue_name,
+                        self._op_device_key: first_dev_spec,
+                        self._op_role_key: self._op_role.Forward
+                    })
+                # Get the device that that data on
+                assert device in devices
+                prog_index = devices.index(device)
+                prog = programs[prog_index]['program']
+                block = prog.block(0)
+                index = 0
+                if device == first_dev_spec:
+                    index = enqueue_index + 1
+                new_name = raw_name_new_name_map[var_name]
+                source_var = main_program.block(0).var(var_name)
+                new_var = self._create_var(block, source_var, new_name)
+                block._insert_op(
+                    index=index,
+                    type='dequeue',
+                    outputs={'Out': [new_var]},
+                    attrs={
+                        self._op_device_key: device,
+                        self._op_role_key: self._op_role.Forward,
+                        'queue_name': queue_name,
+                    })
+                self._rename_var_in_block(block, raw_name_new_name_map)
+
+    def _strip_grad_suffix(self, name):
+        """
+        Strip the grad suffix from the given variable name
+        """
+        pos = name.find(core.grad_var_suffix())
+        return name[:pos] if pos != -1 else name
+
+    def _append_grad_suffix(self, name):
+        """
+        Append grad suffix to the given variable name
         """
-        Extract ops in the given section 
+        return name + core.grad_var_suffix()
+
+    def _update_param_device_map(self, params_grads, block):
+        for param_grad in params_grads:
+            if not param_grad[0].trainable: continue
+            param_name = param_grad[0].name
+            ops = block.ops
+            for op in ops:
+                input_arg_names = op.input_arg_names
+                if param_name in input_arg_names:
+                    self._param_device_map[param_name] = op.attr(
+                        self._op_device_key)
+                    break
+
+    def _add_opdevice_attr_for_regularization_clip(self, block):
         """
-        output_names = set(cut_point_name)
-        relevant_op_flags = [True] * len(ops)
-        for i, op in reversed(list(enumerate(ops))):
-            if not self._is_opt_role_op(op) and _some_in_set_(
-                    op.desc.output_arg_names(), output_names):
-                for name in op.desc.input_arg_names():
-                    output_names.add(name)
-            elif op.desc.type() == "print" and op.desc.input_arg_names()[
-                    0] in output_names:
+        Add op_device attribute for regulization and clip ops.
+        """
+        for op in block.ops:
+            # role for regularization and clip ops is optimize
+            if int(op.attr(self._op_role_key)) != int(self._op_role.Optimize):
                 continue
-            else:
-                relevant_op_flags[i] = False
+            if op.has_attr(self._op_device_key) and (
+                    op.attr(self._op_device_key) != ""):
+                continue
+            assert self._op_role_var_key in op.attr_names
+            op_role_var = op.all_attrs()[self._op_role_var_key]
+            assert len(op_role_var) == 2
+            param_name = block.vars[op_role_var[0]].name
+            device = self._param_device_map[param_name]
+            op._set_attr(self._op_device_key, device)
 
-        op_path = [ops[i] for i in range(len(ops)) if relevant_op_flags[i]]
-        return op_path
+    def _add_default_opdevice_attr(self, block):
+        """
+        1. Add default op_device attribute for lr-related ops.
+           The default value is the one that of the first place.
+        2. Add default op_device attribute for sum ops added during
+           backward. For these ops, we set the op_device attribute
+           as the one of its post op, i.e, which op has the output of the
+           sum op as an input.
+        """
+        first_devcie = ""
+
+        # Get the device spec of the first place.
+        # device_spec: 'cpu' for cpu device and 'gpu:id' for gpu device,
+        # e.g. 'gpu:0', 'gpu:1', etc.
+        for op in block.ops:
+            if op.has_attr(self._op_device_key) and (
+                    op.attr(self._op_device_key) != ""):
+                first_device = op.attr(self._op_device_key)
+                break
+        assert first_device
+
+        # set op_device attr for lr-related ops
+        lrsched_role = int(self._op_role.LRSched)
+        for op in block.ops:
+            if not op.has_attr(self._op_device_key) or (
+                    op.attr(self._op_device_key) == ""):
+                if op.type == "sum":
+                    # For sum ops that compute the sum of @RENAMED@ vars
+                    for name in op.desc.input_arg_names():
+                        assert '@RENAME@' in name
+                    assert len(op.desc.output_arg_names()) == 1
+                    out_name = op.desc.output_arg_names()[0]
+                    post_op = self._find_post_op(block.ops, op, out_name)
+                    device = post_op.attr(self._op_device_key)
+                    assert device
+                    op._set_attr(self._op_device_key, device)
+                    continue
 
-    def _find_section_opt(self, ops, params):
-        res = self._extract_section_opt_ops(ops, params)
-        return res
+                assert op.attr(self._op_role_key) == lrsched_role, (
+                    "Op whose op_device attr has not been set for pipeline"
+                    " must be of the role LRSched.")
+                op._set_attr(self._op_device_key, first_device)
 
-    def _split_program(self, main_program, cut_list):
-        programs = []
-        block = main_program.block(0)
-        whole_parameters = [e.name for e in block.all_parameters()]
-        cut_var_names = []
-        cut_len = len(cut_list)
-        sec_params = []
-        for i, cut_vars in enumerate(cut_list[:-1]):
-            cut_var_names.append([cut_var.name for cut_var in cut_vars])
-        for i, cut_vars in reversed(list(enumerate(cut_list[:-1]))):
-            cut_var_names.append(
-                [_append_grad_suffix_(cut_var.name) for cut_var in cut_vars])
-            if i == 0:
-                cut_var_names[-1] += [var.name for var in cut_list[-1]]
-        ops = block.ops[:]
-        for i, cut_vars in enumerate(cut_var_names):
-            program = {
-                "program": Program(),
-                "input_set": set(),
-                "output_set": set()
-            }
-            cur_ops = self._extract_section_ops(ops, cut_vars)
-            if i == 0:
-                for op in ops:
-                    if self._is_lr_role_op(op):
-                        cur_ops.append(op)
-            #prevent inplace in/out
-            program["input_set"].update(
-                self._find_input_output(
-                    cur_ops, [], is_forward=True))
-            for e in cur_ops:
-                ops.remove(e)
-
-            if i < cut_len:
-                sec_params.append(
-                    self._find_persistable_vars(cur_ops, whole_parameters))
-            if i >= cut_len - 1:
-                opt_ops = self._find_section_opt(
-                    ops, sec_params[2 * cut_len - 2 - i])
-
-                for e in opt_ops:
-                    ops.remove(e)
-                cur_ops += opt_ops
-
-            op_descs = [op.desc for op in cur_ops]
-            for op_desc in op_descs:
-                ap_op = program["program"].block(0).desc.append_op()
-                ap_op.copy_from(op_desc)
-            program["input_set"].update(
-                self._find_input_output(
-                    cur_ops, cut_vars, is_forward=True))
-            program["input_set"].update(sec_params[min(i, 2 * cut_len - 2 - i)])
-            program["output_set"].update(
-                self._find_input_output(
-                    cur_ops, cut_vars, is_forward=False))
-            programs.append(program)
-        program = {
-            "program": Program(),
-            "input_set": set(),
-            "output_set": set()
-        }
-        op_descs = [op.desc for op in ops]
-        for op_desc in op_descs:
-            ap_op = program["program"].block(0).desc.append_op()
-            ap_op.copy_from(op_desc)
-        program["input_set"].update(
-            [cut_var.name + "@GRAD" for cut_var in cut_list[0]])
-        program["input_set"].update(
-            self._find_input_output(
-                ops, [], is_forward=True))
-        program["input_set"].update(sec_params[0])
-        programs.append(program)
-        inputs = set()
-        for program in reversed(list(programs)):
-            output_list = list(program["output_set"])
-            for output in output_list:
-                if output not in inputs:
-                    program["output_set"].remove(output)
-            inputs.update(program["input_set"])
-        return programs
+    def _check_validation(self, block):
+        """
+        Check whether ops in a block are all validate (i.e., the 
+        op_device attribute has been set).
+        Then, return all device specifications in order.
+        """
+        device_specs = []
+        for op in block.ops:
+            type = op.type
+            if not op._has_kernel(type):
+                assert op.type == "conditional_block" and (
+                    op.attr(self._op_role_key) == int(self._op_role.LRSched)), (
+                        "Now, the only supported op without kernel is "
+                        "conditional_block, and its op role must be LRSched.")
+            assert op.has_attr(self._op_device_key), (
+                "op ({}) has no {} attribute.".format(op.type,
+                                                      self._op_device_key))
+            dev_spec = op.attr(self._op_device_key)
+            assert dev_spec, ("op_device attribute for op "
+                              "{} has not been set.".format(op.type))
+            if not dev_spec in device_specs:
+                device_specs.append(dev_spec)
+        return device_specs
+
+    def _insert_enq_deq_ops_for_boundaries(self, block, origin_block,
+                                           startup_program):
+        """
+        Insert a pair of enqueue and dequeue ops for every two
+        consecutive ops on different devices.
+        """
+        startup_block = startup_program.global_block()
+        extra_index = 0
+
+        # A map from var to device spec where op takes it as input,
+        # avoiding multiple enqueue and dequeue ops.
+        var_devspec = dict()
+
+        for index, op in list(enumerate(origin_block.ops)):
+            cur_device_spec = op.attr(self._op_device_key)
+            for var_name in op.input_arg_names:
+                # i.e., lod_tensor_blocking_queue created by DataLoader,
+                # which only exists in startup program.
+                if not var_name in origin_block.vars: continue
+                var = block.var(var_name)
+                # skip data, because we will process it later
+                if var.is_data: continue
+                prev_op = self._find_real_prev_op(origin_block.ops, op,
+                                                  var_name)
+                if prev_op is None:
+                    continue
+                prev_device_spec = prev_op.attr(self._op_device_key)
+
+                if prev_device_spec != cur_device_spec:
+                    if var_name not in var_devspec:
+                        var_devspec[var_name] = []
+                    if cur_device_spec in var_devspec[var_name]: continue
+                    var_devspec[var_name].append(cur_device_spec)
+
+                    queue_name = var_name + "_blocking_queue"
+                    queue_name = unique_name.generate(queue_name)
+                    queue_var = startup_block.create_var(
+                        name=queue_name,
+                        persistable=True,
+                        type=core.VarDesc.VarType.RAW)
+                    startup_block.append_op(
+                        type='queue_generator',
+                        attrs={
+                            'names': [queue_name],
+                            'capacity': self._num_microbatches
+                        })
+                    op_role = op.all_attrs()[self._op_role_key]
+                    var = block.vars[var_name]
+                    block._insert_op(
+                        index=index + extra_index,
+                        type='enqueue',
+                        inputs={'X': var},
+                        attrs={
+                            'queue_name': queue_name,
+                            self._op_device_key: prev_device_spec,
+                            self._op_role_key: op_role
+                        })
+                    extra_index += 1
+                    block._insert_op(
+                        index=index + extra_index,
+                        type='dequeue',
+                        outputs={'Out': [var]},
+                        attrs={
+                            self._op_device_key: cur_device_spec,
+                            'queue_name': queue_name,
+                            self._op_role_key: op_role
+                        })
+                    extra_index += 1
+
+    def _add_dequeue_ops_for_optimize(self, block, startup_program):
+        startup_block = startup_program.global_block()
+        grad_queue_map = dict()
+        grad_device_map = dict()
+        optimize_index = None
+        grad_names_to_dequeue = []
+
+        for index, op in reversed(list(enumerate(block.ops))):
+            device = op.attr(self._op_device_key)
+            # Optimizer pass
+            if not self._is_optimize_op(op):
+                optimize_index = index + 1
+                break
+            if not self._is_update_op(op): continue
+            assert self._op_role_var_key in op.attr_names
+            op_role_var = op.all_attrs()[self._op_role_var_key]
+            assert len(op_role_var) == 2
+            grad_name = op_role_var[1]
+            assert grad_name not in grad_device_map
+            assert grad_name not in grad_names_to_dequeue
+            grad_device_map[grad_name] = device
+            grad_names_to_dequeue.append(grad_name)
+
+        for grad_name in grad_names_to_dequeue:
+            device = grad_device_map[grad_name]
+            grad_names = []
+            grads = []
+            queue_name = grad_name + "_blocking_queue"
+            queue_name = unique_name.generate(queue_name)
+            grad_queue_map[grad_name] = queue_name
+            ref_var = block.vars[grad_name]
+            queue_var = startup_block.create_var(
+                name=queue_name,
+                persistable=True,
+                type=core.VarDesc.VarType.RAW)
+            startup_block.append_op(
+                type='queue_generator',
+                attrs={
+                    'names': [queue_name],
+                    'capacity': self._num_microbatches
+                })
+            orig_var_name = self._strip_grad_suffix(grad_name)
+            for _ in range(self._num_microbatches):
+                u_name = unique_name.generate(orig_var_name)
+                u_grad_name = self._append_grad_suffix(u_name)
+                grad_var = self._create_var(block, ref_var, u_grad_name)
+                grad_names.append(u_grad_name)
+                grads.append(grad_var)
+            block._insert_op(
+                index=optimize_index,
+                type='dequeue',
+                outputs={'Out': grads},
+                attrs={
+                    self._op_device_key: device,
+                    'queue_name': queue_name,
+                    self._op_role_key: self._op_role.Optimize
+                })
+            block._insert_op(
+                index=optimize_index + 1,
+                type='sum',
+                inputs={'X': grad_names},
+                outputs={'Out': ref_var},
+                attrs={
+                    self._op_device_key: device,
+                    self._op_role_key: self._op_role.Optimize
+                })
+        return grad_queue_map
+
+    def _insert_enq_deq_ops_for_update(self, block, startup_program):
+        """
+        Insert enqueue and dequeue ops for gradients of parameters.
+        """
+        startup_block = startup_program.global_block()
+        grad_queue_map = self._add_dequeue_ops_for_optimize(block,
+                                                            startup_program)
+
+        for index, op in reversed(list(enumerate(block.ops))):
+            offset = index
+            device = op.attr(self._op_device_key)
+
+            # Backward pass
+            if self._is_loss_grad_op(op):
+                loss_grad_var = block.vars[op.output_arg_names[0]]
+                scale_factor = self._num_microbatches
+                block._insert_op(
+                    index=index + 1,
+                    type='scale',
+                    inputs={'X': loss_grad_var},
+                    outputs={'Out': loss_grad_var},
+                    attrs={
+                        'scale': 1.0 / scale_factor,
+                        self._op_device_key: device,
+                        self._op_role_key: self._op_role.Backward
+                    })
+                break
+            if self._is_backward_op(op) and (
+                    self._op_role_var_key in op.attr_names):
+                op_role_var = op.all_attrs()[self._op_role_var_key]
+
+                if len(op_role_var) == 0:
+                    continue
+                assert len(op_role_var) % 2 == 0
+                for i in range(0, len(op_role_var), 2):
+                    grad_name = op_role_var[i + 1]
+                    grad_var = block.vars[grad_name]
+                    assert grad_name in grad_queue_map
+                    queue_name = grad_queue_map[grad_name]
+                    block._insert_op(
+                        index=offset + 1,
+                        type='enqueue',
+                        inputs={'X': block.vars[grad_name]},
+                        attrs={
+                            'queue_name': queue_name,
+                            self._op_device_key: device,
+                            self._op_role_key: self._op_role.Backward
+                        })
+                    offset += 1
+
+    def _add_sub_blocks(self, main_block, program_list):
+        main_program = main_block.program
+        for prog_info in program_list:
+            prog = prog_info['program']
+            for op in prog.block(0).ops:
+                if not op.has_attr('sub_block'):
+                    continue
+                origin_sub_block_id = op.attr('sub_block').id
+                origin_sub_block = main_program.block(origin_sub_block_id)
+                new_sub_block = prog._create_block(parent_idx=0)
+                for op in origin_sub_block.ops:
+                    op_desc = op.desc
+                    ap_op = new_sub_block.desc.append_op()
+                    ap_op.copy_from(op_desc)
+                new_sub_block._sync_with_cpp()
+                op._set_attr('sub_block:', new_sub_block)
+
+    def _get_device_info(self, block):
+        for op in block.ops:
+            if not op._has_kernel(op.type): continue
+            op_device = op.attr(self._op_device_key)
+            return op_device
+
+    def _process_persistable_vars_in_multi_sections(self, main_program,
+                                                    startup_prog, program_list):
+        """
+        Special Case: process persistable vars that exist in
+        multiple sections, e.g., shared weight
+        """
+        # var_info = {var_name: [program1, program2...]},
+        # persistable var only
+        var_info = dict()
+        for prog_info in program_list:
+            prog = prog_info['program']
+            block = prog.block(0)
+            for var_name in block.vars:
+                var = block.var(var_name)
+                if not var.persistable: continue
+                if not var_name in var_info:
+                    var_info[var_name] = []
+                if not prog in var_info[var_name]:
+                    var_info[var_name].append(prog)
+        for var_name in list(var_info.keys()):
+            if len(var_info[var_name]) == 1:
+                var_info.pop(var_name)
+
+        # write_info = {var_name: program}, where program is the only program
+        # in which the var named var_name is written.
+        write_info = dict()
+        for var_name in var_info.keys():
+            for prog in var_info[var_name]:
+                block = prog.block(0)
+                for op in block.ops:
+                    if op.type == "dequeue": continue
+                    # We have processed lr related vars
+                    if op.attr(self._op_role_key) == int(
+                            self._op_role.Optimize.LRSched):
+                        continue
+                    if var_name in op.desc.output_arg_names():
+                        assert var_name not in write_info, (
+                            "two sections write the same var({}): second "
+                            "op {}.".format(var_name, op))
+                        write_info[var_name] = prog
+                        break
+
+        for var_name in var_info.keys():
+            # Case 1: read only variables, no special process
+            if not var_name in write_info: continue
+
+            # Case 2: one write multiple reads
+            write_prog = write_info[var_name]
+            write_block = write_prog.block(0)
+            write_device = self._get_device_info(write_block)
+            all_progs = var_info[var_name]
+            for prog in all_progs:
+                if prog == write_prog: continue
+
+                queue_name = var_name + "_blocking_queue"
+                queue_name = unique_name.generate(queue_name)
+                queue_var = startup_prog.block(0).create_var(
+                    name=queue_name,
+                    persistable=True,
+                    type=core.VarDesc.VarType.RAW)
+                startup_prog.block(0).append_op(
+                    type='queue_generator',
+                    attrs={
+                        'names': [queue_name],
+                        'capacity': self._num_microbatches
+                    })
+                write_block._insert_op(
+                    index=0,
+                    type='enqueue',
+                    inputs={'X': write_block.var(var_name), },
+                    attrs={
+                        'queue_name': queue_name,
+                        self._op_device_key: write_device,
+                        # A trick to make the role LRSched to avoid copy every
+                        # microbatch
+                        self._op_role_key: self._op_role.LRSched
+                    })
+                read_block = prog.block(0)
+                read_device = self._get_device_info(read_block)
+                read_block._insert_op(
+                    index=0,
+                    type='dequeue',
+                    outputs={'Out': [read_block.var(var_name)]},
+                    attrs={
+                        self._op_device_key: read_device,
+                        # A trick to make the role LRSched to avoid copy every
+                        # microbatch
+                        self._op_role_key: self._op_role.LRSched,
+                        'queue_name': queue_name,
+                    })
 
     def minimize(self,
                  loss,
                  startup_program=None,
                  parameter_list=None,
                  no_grad_set=None):
-        self._optimizer.minimize(loss, startup_program, parameter_list,
-                                 no_grad_set)
-        program = loss.block.program
-        if len(self._cut_list) == 0:
+        main_block = loss.block
+        if startup_program is None:
+            startup_program = default_startup_program()
+        optimize_ops, params_grads = self._optimizer.minimize(
+            loss, startup_program, parameter_list, no_grad_set)
+        self._update_param_device_map(params_grads, main_block)
+
+        # Step1: add default op_device attribute for regulization and clip ops
+        self._add_opdevice_attr_for_regularization_clip(main_block)
+
+        # Step2: add default op_device attribute for ops whose op_device
+        # attribute have not been set yet.
+        self._add_default_opdevice_attr(main_block)
+        device_specs = self._check_validation(main_block)
+
+        # Step3: add enqueue and dequeue ops between section boundaries
+        origin_prog = main_block.program.clone(for_test=False)
+        origin_main_block = origin_prog.global_block()
+        self._insert_enq_deq_ops_for_boundaries(main_block, origin_main_block,
+                                                startup_program)
+
+        # Step4: add a pair of enqueue and dequeueN for parameter gradients
+        self._insert_enq_deq_ops_for_update(main_block, startup_program)
+
+        main_program = main_block.program
+
+        place_list = []
+        place_id_list = []
+        for dev_spec in device_specs:
+            if dev_spec == "cpu":
+                place_list.append(core.CPUPlace())
+                place_id_list.append(-1)
+            elif "gpu" in dev_spec and ":" in dev_spec:
+                dev_index = dev_spec.split(":")[1]
+                place_list.append(core.CUDAPlace(int(dev_index)))
+                place_id_list.append(int(dev_index))
+            else:
+                raise ValueError("Unknown device type: %s", dev_spec)
+
+        # Step5: split program into sections and add pairs of
+        # enqueue and dequeue ops for data var.
+        if len(place_list) == 0:
             program_list = []
-            ptmp = {"program": program, "input_set": set(), "output_set": set()}
+            ptmp = {
+                "program": main_program,
+                "input_set": set(),
+                "output_set": set()
+            }
             program_list.append(ptmp)
         else:
-            program_list = self._split_program(program, self._cut_list)
+            program_list = self._split_program(main_program)
             for p in program_list:
-                self._create_vars(p["program"].block(0), program)
-        whole_parameters = [e.name for e in program.block(0).all_parameters()]
-        param_need_sync = []
-        for i, section_p in enumerate(program_list):
-            if not isinstance(self._place_list[i], core.CUDAPlace):
-                continue
-            section_var = [e for e in section_p["program"].block(0).vars]
-            for p in section_var:
-                if p in whole_parameters:
-                    param_need_sync.append(p)
-        program._pipeline_opt = {
+                self._create_vars(p["program"].block(0), main_program)
+        self._insert_enq_deq_for_data_var(main_block, program_list,
+                                          startup_program, device_specs)
+
+        # Step6: Special Case: process persistable vars that exist in
+        # multiple sections
+        self._process_persistable_vars_in_multi_sections(
+            main_program, startup_program, program_list)
+
+        # Step7: Add sub blocks for section programs
+        self._add_sub_blocks(main_block, program_list)
+
+        main_program._pipeline_opt = {
             "trainer": "PipelineTrainer",
             "device_worker": "Section",
             "section_program_list": program_list,
-            "place_list": self._place_list,
-            "concurrency_list": self._concurrency_list,
-            "queue_size": self._queue_size,
+            "place_list": place_list,
+            "place_id_list": place_id_list,
+            "sync_steps": -1,
+            "num_microbatches": self._num_microbatches,
             "start_cpu_core_id": self._start_cpu_core_id,
-            "sync_steps": self._sync_steps,
-            "param_need_sync": param_need_sync
         }
+        return optimize_ops, params_grads, program_list
 
 
 class RecomputeOptimizer(Optimizer):
diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py
index 06d74386b88a0b0466394cebd9076ea05bf470c5..0289ecea34acf65d01aa13b555ee523f7127b48d 100644
--- a/python/paddle/fluid/reader.py
+++ b/python/paddle/fluid/reader.py
@@ -23,7 +23,7 @@ from .executor import global_scope
 from .data_feeder import DataFeeder, BatchedTensorProvider
 from .multiprocess_utils import multiprocess_queue_set, CleanupFuncRegistrar, _cleanup_mmap, _cleanup, _set_SIGCHLD_handler
 from .dataloader import BatchSampler, Dataset
-from .dataloader.dataloader_iter import _DataLoaderIterSingleProcess, _DataLoaderIterMultiProcess
+from .dataloader.dataloader_iter import _DataLoaderIterSingleProcess, _DataLoaderIterMultiProcess, default_collate_fn
 from .layers.io import monkey_patch_reader_methods, _copy_reader_var_, double_buffer
 from .unique_name import UniqueNameGenerator
 import logging
@@ -44,7 +44,7 @@ else:
 # NOTE: [ avoid hanging & failed quickly ] These value is used in getting data from another process
 QUEUE_GET_TIMEOUT = 60
 
-__all__ = ['PyReader', 'DataLoader']
+__all__ = ['PyReader', 'DataLoader', 'default_collate_fn']
 
 data_loader_unique_name_generator = UniqueNameGenerator()
 
@@ -97,6 +97,18 @@ class DataLoaderBase(object):
     def __next__(self):
         raise NotImplementedError()
 
+    @classmethod
+    def _check_input_array(cls, item):
+        arr = np.asarray(item)
+        if arr.dtype == np.object:
+            raise TypeError(
+                "\n\tFaild to convert input data to a regular ndarray :\n\t* Usually "
+                "this means the input data contains nested lists with different lengths. "
+                "\n\t* Check the reader function passed to 'decorate_batch_generator'"
+                " to locate the data causes this issue.\n\t* Please consider using "
+                "'fluid.create_lod_tensor' to convert it to a LoD-Tensor.")
+        return arr
+
 
 class DataLoader(object):
     """
@@ -807,17 +819,6 @@ class DygraphGeneratorLoader(DataLoaderBase):
             self._reset()
             six.reraise(*sys.exc_info())
 
-    @classmethod
-    def _check_input_array(cls, item):
-        arr = np.array(item)
-        if arr.dtype == np.object:
-            raise TypeError(
-                "\n\tFaild to convert input data to a regular ndarray :\n\t* Usually "
-                "this means the input data contains nested lists with different lengths. "
-                "\n\t* Check the reader function passed to 'decorate_batch_generator'"
-                " to locate the data causes this issue.\n\t* Please consider using "
-                "'fluid.create_lod_tensor' to convert it to a LoD-Tensor.")
-
     def _exit_thread_expectedly(self):
         self._thread_done_event.set()
         self._blocking_queue.close()
@@ -894,7 +895,7 @@ class DygraphGeneratorLoader(DataLoaderBase):
                 array = core.LoDTensorArray()
                 for item in sample:
                     if not isinstance(item, core.LoDTensor):
-                        self._check_input_array(item)
+                        item = self._check_input_array(item)
                         tmp = core.LoDTensor()
                         tmp.set(item, core.CPUPlace())
                         item = tmp
@@ -1115,19 +1116,6 @@ class GeneratorLoader(DataLoaderBase):
         assert not self._iterable, "reset() cannot be called when DataLoader is iterable"
         self._reset()
 
-    @classmethod
-    def _check_input_array(cls, item):
-        arr = np.array(item)
-        if arr.dtype == np.object:
-            raise TypeError((
-                "\n\tFaild to convert input data to a regular ndarray :\n\t* Usually "
-                "this means the input data contains nested lists with different lengths. "
-                "\n\t* Check the reader function passed to 'decorate_batch_generator'"
-                " to locate the data causes this issue.\n\t* Please consider using "
-                "'fluid.create_lod_tensor' to convert it to a LoD-Tensor."))
-
-        return arr
-
     def _start(self):
         def __thread_main__():
             try:
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 7aa11d95d78e0bf24dc0078bd30c1ceb393275d9..c95577561f45158ce4de80753e8f3725cd8673e0 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -45,13 +45,10 @@ if(NOT WITH_GPU OR WIN32)
     LIST(REMOVE_ITEM TEST_OPS test_reducescatter_api)
 endif()
 
-#TODO(malin): Fix this unitest failed on GCC8.
-LIST(REMOVE_ITEM TEST_OPS test_roll_op)
 #TODO(sunxiaolong01): Fix this unitest failed on GCC8.
 LIST(REMOVE_ITEM TEST_OPS test_conv2d_transpose_op)
 if(WIN32)
     LIST(REMOVE_ITEM TEST_OPS test_boxps)
-    LIST(REMOVE_ITEM TEST_OPS test_paddlebox_datafeed)
     LIST(REMOVE_ITEM TEST_OPS test_trainer_desc)
     LIST(REMOVE_ITEM TEST_OPS test_multiprocess_reader_exception)
     LIST(REMOVE_ITEM TEST_OPS test_avoid_twice_initialization)
@@ -61,7 +58,6 @@ if(WIN32)
     LIST(REMOVE_ITEM TEST_OPS test_debugger)
     list(REMOVE_ITEM TEST_OPS test_desc_clone)
     list(REMOVE_ITEM TEST_OPS test_fake_init_op)
-    list(REMOVE_ITEM TEST_OPS test_hsigmoid_op)
     list(REMOVE_ITEM TEST_OPS test_merge_ids_op)
     list(REMOVE_ITEM TEST_OPS test_split_ids_op)
     list(REMOVE_ITEM TEST_OPS test_program_code)
@@ -69,6 +65,11 @@ if(WIN32)
     LIST(REMOVE_ITEM TEST_OPS test_math_op_patch_var_base)
 endif()
 
+if(APPLE OR WIN32)
+    LIST(REMOVE_ITEM TEST_OPS test_hdfs)
+    LIST(REMOVE_ITEM TEST_OPS test_fs_interface)
+endif()
+
 if (NOT ${WITH_GPU})
     LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op)
     LIST(REMOVE_ITEM TEST_OPS test_rank_attention_op) # TODO(shenliang03): rank_attention_op support CPU device in future
@@ -91,7 +92,6 @@ endif()
 if(NOT WITH_GPU OR WIN32)
     LIST(REMOVE_ITEM TEST_OPS test_pipeline)
     LIST(REMOVE_ITEM TEST_OPS test_boxps)
-    LIST(REMOVE_ITEM TEST_OPS test_paddlebox_datafeed)
 endif()
 list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290
 list(REMOVE_ITEM TEST_OPS test_lstm_unit_op) # # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5185
@@ -205,6 +205,7 @@ endfunction()
 
 list(REMOVE_ITEM TEST_OPS test_warpctc_op)
 list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf)
+list(REMOVE_ITEM TEST_OPS test_parallel_executor_profiler)
 list(REMOVE_ITEM TEST_OPS test_data_norm_op)
 list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed)
 list(REMOVE_ITEM TEST_OPS test_parallel_executor_transformer)
@@ -232,6 +233,8 @@ list(REMOVE_ITEM TEST_OPS test_imperative_debug_string)
 list(REMOVE_ITEM TEST_OPS test_fuse_bn_act_pass)
 list(REMOVE_ITEM TEST_OPS test_imperative_static_runner_mnist)
 list(REMOVE_ITEM TEST_OPS test_imperative_static_runner_while)
+list(REMOVE_ITEM TEST_OPS test_conv3d_transpose_op)
+list(REMOVE_ITEM TEST_OPS test_pipeline)
 
 # disable this unittest temporarily
 list(REMOVE_ITEM TEST_OPS test_imperative_data_loader_exception)
@@ -321,6 +324,7 @@ if(WITH_DISTRIBUTE)
     # FIXME(typhoonzero): add these tests back
     list(REMOVE_ITEM DIST_TEST_OPS "test_dist_transformer")
     list(REMOVE_ITEM DIST_TEST_OPS "test_dist_transpiler")
+    list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_ctr")
 
     #not need
     list(REMOVE_ITEM DIST_TEST_OPS "test_dist_base")
@@ -354,7 +358,7 @@ if(WITH_DISTRIBUTE)
         endif()
         bash_test_modules(test_launch_ps MODULES test_launch_ps.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
 
-        set(dist_ut_port 1000)
+        set(dist_ut_port 20001)
         foreach(TEST_OP ${DIST_TEST_OPS})
             bash_test_modules(${TEST_OP} MODULES dist_test.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}")
             MATH(EXPR dist_ut_port "${dist_ut_port}+50")
@@ -363,22 +367,29 @@ if(WITH_DISTRIBUTE)
 endif()
 
 py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf)
+py_test_modules(test_parallel_executor_profiler MODULES test_parallel_executor_profiler)
 py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer)
 py_test_modules(test_parallel_executor_transformer_auto_growth MODULES test_parallel_executor_transformer_auto_growth ENVS FLAGS_allocator_strategy=auto_growth)
 
 py_test_modules(test_data_norm_op MODULES test_data_norm_op)
 py_test_modules(test_fuse_bn_act_pass MODULES test_fuse_bn_act_pass ENVS FLAGS_cudnn_deterministic=1 FLAGS_cudnn_batchnorm_spatial_persistent=1 FLAGS_conv_workspace_size_limit=1000)
 
-if(NOT WIN32)
-    # TODO: fix these unittests failure on Windows
+# NOTE: These unittests will appear NaN steadily in windows CI. After analysis,
+# it is found that windows CI will run all the training unittests with the ON_INFER option turned on, 
+# which will not appear in other CIs. The calculation behavior of some ops in inference mode is 
+# inconsistent with that in non-inference mode.
+if(NOT ON_INFER)
     py_test_modules(test_parallel_executor_seresnext_base_cpu MODULES test_parallel_executor_seresnext_base_cpu)
     py_test_modules(test_parallel_executor_seresnext_with_reduce_cpu MODULES test_parallel_executor_seresnext_with_reduce_cpu)
     py_test_modules(test_parallel_executor_seresnext_with_fuse_all_reduce_cpu MODULES test_parallel_executor_seresnext_with_fuse_all_reduce_cpu)
-    py_test_modules(test_layers MODULES test_layers ENVS FLAGS_cudnn_deterministic=1)
     set_tests_properties(test_parallel_executor_seresnext_base_cpu PROPERTIES TIMEOUT 900)
     set_tests_properties(test_parallel_executor_seresnext_with_reduce_cpu PROPERTIES TIMEOUT 750)
     set_tests_properties(test_parallel_executor_seresnext_with_fuse_all_reduce_cpu PROPERTIES TIMEOUT 750)
+endif()
 
+if(NOT WIN32)
+    # TODO: fix these unittests failure on Windows
+    py_test_modules(test_layers MODULES test_layers ENVS FLAGS_cudnn_deterministic=1)
     py_test_modules(test_ir_memory_optimize_transformer MODULES test_ir_memory_optimize_transformer)
     # FIXME(zcd): temporally disable test_parallel_executor_fetch_feed in Windows CI because of the random failure.
     py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed)
@@ -414,6 +425,7 @@ set_tests_properties(test_parallel_executor_crf test_sync_batch_norm_op test_inp
         test_parallel_executor_seresnext_base_gpu
         test_parallel_executor_seresnext_with_reduce_gpu
         test_parallel_executor_seresnext_with_fuse_all_reduce_gpu
+        test_parallel_executor_profiler
         PROPERTIES LABELS "RUN_TYPE=DIST")
 
 if(NOT WIN32 AND NOT APPLE)
diff --git a/python/paddle/fluid/tests/unittests/ctr_dataset_reader.py b/python/paddle/fluid/tests/unittests/ctr_dataset_reader.py
index d5a29b925649be0752ac8d2b14d6119ec88618be..fe7513ae84238527d25cc28fa40b01f1f099f1c8 100644
--- a/python/paddle/fluid/tests/unittests/ctr_dataset_reader.py
+++ b/python/paddle/fluid/tests/unittests/ctr_dataset_reader.py
@@ -66,15 +66,19 @@ class CtrReader(object):
         pass
 
     def _reader_creator(self, filelist):
+        def get_rand(low=0.0, high=1.0):
+            return random.random()
+
         def reader():
             for file in filelist:
                 with open(file, 'r') as f:
                     for line in f:
-                        fs = line.strip().split('\t')
-                        dnn_input = load_dnn_input_record(fs[0])
-                        lr_input = load_lr_input_record(fs[1])
-                        click = [int(fs[2])]
-                        yield [dnn_input] + [lr_input] + [click]
+                        if get_rand() < 0.05:
+                            fs = line.strip().split('\t')
+                            dnn_input = load_dnn_input_record(fs[0])
+                            lr_input = load_lr_input_record(fs[1])
+                            click = [int(fs[2])]
+                            yield [dnn_input] + [lr_input] + [click]
 
         return reader
 
diff --git a/python/paddle/fluid/tests/unittests/dist_simnet_bow.py b/python/paddle/fluid/tests/unittests/dist_simnet_bow.py
index 09afae6114e2b6cc8bce9b2be3b221ba9825db8c..9fcba2aede1cea3c78108e7daa8eb34a1ab80048 100644
--- a/python/paddle/fluid/tests/unittests/dist_simnet_bow.py
+++ b/python/paddle/fluid/tests/unittests/dist_simnet_bow.py
@@ -92,8 +92,8 @@ def train_network(batch_size,
     # query
     q = fluid.layers.data(
         name="query_ids", shape=[1], dtype="int64", lod_level=1)
-    ## embedding
-    q_emb = fluid.layers.embedding(
+    # embedding
+    q_emb = fluid.embedding(
         input=q,
         is_distributed=is_distributed,
         size=[dict_dim, emb_dim],
@@ -104,10 +104,11 @@ def train_network(batch_size,
                 initializer=fluid.initializer.Constant(value=0.01),
                 name="__emb__"),
         is_sparse=is_sparse)
-    ## vsum
+    q_emb = fluid.layers.reshape(q_emb, [-1, emb_dim])
+    # vsum
     q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum')
     q_ss = fluid.layers.softsign(q_sum)
-    ## fc layer after conv
+    # fc layer after conv
     q_fc = fluid.layers.fc(
         input=q_ss,
         size=hid_dim,
@@ -120,8 +121,8 @@ def train_network(batch_size,
     # pt
     pt = fluid.layers.data(
         name="pos_title_ids", shape=[1], dtype="int64", lod_level=1)
-    ## embedding
-    pt_emb = fluid.layers.embedding(
+    # embedding
+    pt_emb = fluid.embedding(
         input=pt,
         is_distributed=is_distributed,
         size=[dict_dim, emb_dim],
@@ -132,10 +133,11 @@ def train_network(batch_size,
                 initializer=fluid.initializer.Constant(value=0.01),
                 name="__emb__"),
         is_sparse=is_sparse)
-    ## vsum
+    pt_emb = fluid.layers.reshape(pt_emb, [-1, emb_dim])
+    # vsum
     pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum')
     pt_ss = fluid.layers.softsign(pt_sum)
-    ## fc layer
+    # fc layer
     pt_fc = fluid.layers.fc(
         input=pt_ss,
         size=hid_dim,
@@ -147,8 +149,8 @@ def train_network(batch_size,
     # nt
     nt = fluid.layers.data(
         name="neg_title_ids", shape=[1], dtype="int64", lod_level=1)
-    ## embedding
-    nt_emb = fluid.layers.embedding(
+    # embedding
+    nt_emb = fluid.embedding(
         input=nt,
         is_distributed=is_distributed,
         size=[dict_dim, emb_dim],
@@ -159,10 +161,11 @@ def train_network(batch_size,
                 initializer=fluid.initializer.Constant(value=0.01),
                 name="__emb__"),
         is_sparse=is_sparse)
-    ## vsum
+    nt_emb = fluid.layers.reshape(nt_emb, [-1, emb_dim])
+    # vsum
     nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum')
     nt_ss = fluid.layers.softsign(nt_sum)
-    ## fc layer
+    # fc layer
     nt_fc = fluid.layers.fc(
         input=nt_ss,
         size=hid_dim,
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_dygraph_model.py
index df0afe8b5647ca45c870a9e40e0122a78764c858..b302dd37794fd05f6ca9ca76694070708a3d9549 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_dygraph_model.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_dygraph_model.py
@@ -247,8 +247,11 @@ class BertModelLayer(Layer):
 
         enc_output = self._encoder(emb_out, n_head_self_attn_mask)
 
-        if not self.return_pooled_out:
-            return enc_output
+        # TODO(zhhsplendid): uncomment this in next PR which we support various
+        # length of early return
+        #
+        #if not self.return_pooled_out:
+        #    return enc_output
         next_sent_feat = fluid.layers.slice(
             input=enc_output, axes=[1], starts=[0], ends=[1])
         next_sent_feat = self.pooled_fc(next_sent_feat)
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py
index 809e4d51a70881154cc27dcf00221940195ad40b..0a3be4478125cdb5c3256090033f6975b3f4cda9 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py
@@ -329,8 +329,8 @@ class BaseModel(fluid.dygraph.Layer):
 
         # beam search
         batch_beam_shape = (self.batch_size, self.beam_size)
-        vocab_size_tensor = to_variable(
-            np.full((1), self.tar_vocab_size).astype("int64"))
+        vocab_size_tensor = to_variable(np.full((
+            1), self.tar_vocab_size)).astype("int64")
         start_token_tensor = to_variable(
             np.full(
                 batch_beam_shape, self.beam_start_token, dtype='int64'))
@@ -448,3 +448,293 @@ class BaseModel(fluid.dygraph.Layer):
         predicted_ids = fluid.layers.gather_tree(predicted_ids, parent_ids)
         predicted_ids = self._transpose_batch_time(predicted_ids)
         return predicted_ids
+
+
+class AttentionModel(fluid.dygraph.Layer):
+    def __init__(self,
+                 hidden_size,
+                 src_vocab_size,
+                 tar_vocab_size,
+                 batch_size,
+                 num_layers=1,
+                 init_scale=0.1,
+                 dropout=None,
+                 beam_size=1,
+                 beam_start_token=1,
+                 beam_end_token=2,
+                 beam_max_step_num=2,
+                 mode='train'):
+        super(AttentionModel, self).__init__()
+        self.hidden_size = hidden_size
+        self.src_vocab_size = src_vocab_size
+        self.tar_vocab_size = tar_vocab_size
+        self.batch_size = batch_size
+        self.num_layers = num_layers
+        self.init_scale = init_scale
+        self.dropout = dropout
+        self.beam_size = beam_size
+        self.beam_start_token = beam_start_token
+        self.beam_end_token = beam_end_token
+        self.beam_max_step_num = beam_max_step_num
+        self.mode = mode
+        self.kinf = 1e9
+
+        param_attr = ParamAttr(initializer=uniform_initializer(self.init_scale))
+        bias_attr = ParamAttr(initializer=zero_constant)
+        forget_bias = 1.0
+
+        self.src_embeder = Embedding(
+            size=[self.src_vocab_size, self.hidden_size],
+            param_attr=fluid.ParamAttr(
+                name='source_embedding',
+                initializer=uniform_initializer(init_scale)))
+
+        self.tar_embeder = Embedding(
+            size=[self.tar_vocab_size, self.hidden_size],
+            is_sparse=False,
+            param_attr=fluid.ParamAttr(
+                name='target_embedding',
+                initializer=uniform_initializer(init_scale)))
+
+        self.enc_units = []
+        for i in range(num_layers):
+            self.enc_units.append(
+                self.add_sublayer(
+                    "enc_units_%d" % i,
+                    BasicLSTMUnit(
+                        hidden_size=self.hidden_size,
+                        input_size=self.hidden_size,
+                        param_attr=param_attr,
+                        bias_attr=bias_attr,
+                        forget_bias=forget_bias)))
+
+        self.dec_units = []
+        for i in range(num_layers):
+            if i == 0:
+                self.dec_units.append(
+                    self.add_sublayer(
+                        "dec_units_%d" % i,
+                        BasicLSTMUnit(
+                            hidden_size=self.hidden_size,
+                            input_size=self.hidden_size * 2,
+                            param_attr=ParamAttr(
+                                name="dec_units_%d" % i,
+                                initializer=uniform_initializer(
+                                    self.init_scale)),
+                            bias_attr=bias_attr,
+                            forget_bias=forget_bias)))
+            else:
+                self.dec_units.append(
+                    self.add_sublayer(
+                        "dec_units_%d" % i,
+                        BasicLSTMUnit(
+                            hidden_size=self.hidden_size,
+                            input_size=self.hidden_size,
+                            param_attr=ParamAttr(
+                                name="dec_units_%d" % i,
+                                initializer=uniform_initializer(
+                                    self.init_scale)),
+                            bias_attr=bias_attr,
+                            forget_bias=forget_bias)))
+
+        self.attn_fc = fluid.dygraph.nn.Linear(
+            self.hidden_size,
+            self.hidden_size,
+            param_attr=ParamAttr(
+                name="self_attn_fc",
+                initializer=uniform_initializer(self.init_scale)),
+            bias_attr=False)
+
+        self.concat_fc = fluid.dygraph.nn.Linear(
+            2 * self.hidden_size,
+            self.hidden_size,
+            param_attr=ParamAttr(
+                name="self_concat_fc",
+                initializer=uniform_initializer(self.init_scale)),
+            bias_attr=False)
+
+        self.fc = fluid.dygraph.nn.Linear(
+            self.hidden_size,
+            self.tar_vocab_size,
+            param_attr=ParamAttr(
+                name="self_fc",
+                initializer=uniform_initializer(self.init_scale)),
+            bias_attr=False)
+
+    def _transpose_batch_time(self, x):
+        return fluid.layers.transpose(x, [1, 0] + list(range(2, len(x.shape))))
+
+    def _merge_batch_beams(self, x):
+        return fluid.layers.reshape(x, shape=(-1, x.shape[2]))
+
+    def tile_beam_merge_with_batch(self, x):
+        x = fluid.layers.unsqueeze(x, [1])  # [batch_size, 1, ...]
+        expand_times = [1] * len(x.shape)
+        expand_times[1] = self.beam_size
+        x = fluid.layers.expand(x, expand_times)  # [batch_size, beam_size, ...]
+        x = fluid.layers.transpose(x, list(range(2, len(x.shape))) +
+                                   [0, 1])  # [..., batch_size, beam_size]
+        # use 0 to copy to avoid wrong shape
+        x = fluid.layers.reshape(
+            x, shape=[0] *
+            (len(x.shape) - 2) + [-1])  # [..., batch_size * beam_size]
+        x = fluid.layers.transpose(
+            x, [len(x.shape) - 1] +
+            list(range(0, len(x.shape) - 1)))  # [batch_size * beam_size, ...]
+        return x
+
+    def _split_batch_beams(self, x):
+        return fluid.layers.reshape(x, shape=(-1, self.beam_size, x.shape[1]))
+
+    def _expand_to_beam_size(self, x):
+        x = fluid.layers.unsqueeze(x, [1])
+        expand_times = [1] * len(x.shape)
+        expand_times[1] = self.beam_size
+        x = fluid.layers.expand(x, expand_times)
+        return x
+
+    def _real_state(self, state, new_state, step_mask):
+        new_state = fluid.layers.elementwise_mul(new_state, step_mask, axis=0) - \
+                    fluid.layers.elementwise_mul(state, (step_mask - 1), axis=0)
+        return new_state
+
+    def _gather(self, x, indices, batch_pos):
+        topk_coordinates = fluid.layers.stack([batch_pos, indices], axis=2)
+        return fluid.layers.gather_nd(x, topk_coordinates)
+
+    def attention(self, query, enc_output, mask=None):
+        query = fluid.layers.unsqueeze(query, [1])
+        memory = self.attn_fc(enc_output)
+        attn = fluid.layers.matmul(query, memory, transpose_y=True)
+
+        if mask is not None:
+            attn = fluid.layers.transpose(attn, [1, 0, 2])
+            attn = fluid.layers.elementwise_add(attn, mask * 1000000000, -1)
+            attn = fluid.layers.transpose(attn, [1, 0, 2])
+        weight = fluid.layers.softmax(attn)
+        weight_memory = fluid.layers.matmul(weight, memory)
+
+        return weight_memory
+
+    def _change_size_for_array(self, func, array):
+        print(" ^" * 10, "_change_size_for_array")
+        print("array : ", array)
+        for i, state in enumerate(array):
+            fluid.layers.array_write(func(state), i, array)
+
+        return array
+
+    @declarative
+    def forward(self, inputs):
+        src, tar, label, src_sequence_length, tar_sequence_length = inputs
+        if src.shape[0] < self.batch_size:
+            self.batch_size = src.shape[0]
+
+        src_emb = self.src_embeder(self._transpose_batch_time(src))
+
+        # NOTE: modify model code about `enc_hidden` and `enc_cell` to transforme dygraph code successfully.
+        # Because nested list can't be transformed now.
+        enc_hidden_0 = to_variable(
+            np.zeros(
+                (self.batch_size, self.hidden_size), dtype='float32'))
+        enc_hidden_0.stop_gradient = True
+        enc_cell_0 = to_variable(
+            np.zeros(
+                (self.batch_size, self.hidden_size), dtype='float32'))
+        enc_hidden_0.stop_gradient = True
+        zero = fluid.layers.zeros(shape=[1], dtype="int64")
+        enc_hidden = fluid.layers.create_array(dtype="float32")
+        enc_cell = fluid.layers.create_array(dtype="float32")
+        for i in range(self.num_layers):
+            index = zero + i
+            enc_hidden = fluid.layers.array_write(
+                enc_hidden_0, index, array=enc_hidden)
+            enc_cell = fluid.layers.array_write(
+                enc_cell_0, index, array=enc_cell)
+
+        max_seq_len = src_emb.shape[0]
+
+        enc_len_mask = fluid.layers.sequence_mask(
+            src_sequence_length, maxlen=max_seq_len, dtype="float32")
+        enc_padding_mask = (enc_len_mask - 1.0)
+        enc_len_mask = fluid.layers.transpose(enc_len_mask, [1, 0])
+
+        enc_outputs = []
+        # TODO: Because diff exits if call while_loop in static graph.
+        # In while block, a Variable created in parent block participates in the calculation of gradient,
+        # the gradient is wrong because each step scope always returns the same value generated by last step.
+        for p in range(max_seq_len):
+            k = 0 + p
+            enc_step_input = src_emb[k]
+            step_mask = enc_len_mask[k]
+            new_enc_hidden, new_enc_cell = [], []
+            for i in range(self.num_layers):
+                enc_new_hidden, enc_new_cell = self.enc_units[i](
+                    enc_step_input, enc_hidden[i], enc_cell[i])
+                if self.dropout != None and self.dropout > 0.0:
+                    enc_step_input = fluid.layers.dropout(
+                        enc_new_hidden,
+                        dropout_prob=self.dropout,
+                        dropout_implementation='upscale_in_train')
+                else:
+                    enc_step_input = enc_new_hidden
+
+                new_enc_hidden.append(
+                    self._real_state(enc_hidden[i], enc_new_hidden, step_mask))
+                new_enc_cell.append(
+                    self._real_state(enc_cell[i], enc_new_cell, step_mask))
+            enc_outputs.append(enc_step_input)
+            enc_hidden, enc_cell = new_enc_hidden, new_enc_cell
+
+        enc_outputs = fluid.layers.stack(enc_outputs)
+        enc_outputs = self._transpose_batch_time(enc_outputs)
+
+        # train
+        input_feed = to_variable(
+            np.zeros(
+                (self.batch_size, self.hidden_size), dtype='float32'))
+        # NOTE: set stop_gradient here, otherwise grad var is null
+        input_feed.stop_gradient = True
+        dec_hidden, dec_cell = enc_hidden, enc_cell
+        tar_emb = self.tar_embeder(self._transpose_batch_time(tar))
+        max_seq_len = tar_emb.shape[0]
+        dec_output = []
+
+        for step_idx in range(max_seq_len):
+            j = step_idx + 0
+            step_input = tar_emb[j]
+            step_input = fluid.layers.concat([step_input, input_feed], 1)
+            new_dec_hidden, new_dec_cell = [], []
+            for i in range(self.num_layers):
+                new_hidden, new_cell = self.dec_units[i](
+                    step_input, dec_hidden[i], dec_cell[i])
+                new_dec_hidden.append(new_hidden)
+                new_dec_cell.append(new_cell)
+                if self.dropout != None and self.dropout > 0.0:
+                    step_input = fluid.layers.dropout(
+                        new_hidden,
+                        dropout_prob=self.dropout,
+                        dropout_implementation='upscale_in_train')
+                else:
+                    step_input = new_hidden
+            dec_att = self.attention(step_input, enc_outputs, enc_padding_mask)
+            dec_att = fluid.layers.squeeze(dec_att, [1])
+            concat_att_out = fluid.layers.concat([dec_att, step_input], 1)
+            out = self.concat_fc(concat_att_out)
+            input_feed = out
+            dec_output.append(out)
+            dec_hidden, dec_cell = new_dec_hidden, new_dec_cell
+
+        dec_output = fluid.layers.stack(dec_output)
+        dec_output = self.fc(self._transpose_batch_time(dec_output))
+        loss = fluid.layers.softmax_with_cross_entropy(
+            logits=dec_output, label=label, soft_label=False)
+        loss = fluid.layers.squeeze(loss, axes=[2])
+        max_tar_seq_len = fluid.layers.shape(tar)[1]
+        tar_mask = fluid.layers.sequence_mask(
+            tar_sequence_length, maxlen=max_tar_seq_len, dtype='float32')
+        loss = loss * tar_mask
+        loss = fluid.layers.reduce_mean(loss, dim=[0])
+        loss = fluid.layers.reduce_sum(loss)
+
+        return loss
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_utils.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_utils.py
index 7f9766c5357effcd2fd5a9b6b91cbfbf63d8d2ee..821fea3a67ddbf45db73c337637d0ebbffeefbd3 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_utils.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_utils.py
@@ -125,11 +125,13 @@ class Seq2SeqModelHyperParams(object):
     max_grad_norm = 5.0
 
     # model path for model to save
-    model_path = "dy2stat/model/seq2seq"
+
+    base_model_path = "dy2stat/model/base_seq2seq"
+    attn_model_path = "dy2stat/model/attn_seq2seq"
 
     # reload model to inference
     reload_model = "model/epoch_0.pdparams"
 
-    beam_size = 10
+    beam_size = 4
 
     max_seq_len = 3
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..331eeeb3de6810433b75d307f761660f352a1949
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py
@@ -0,0 +1,516 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid as fluid
+import paddle.fluid.param_attr as attr
+
+from functools import reduce
+from paddle.fluid.dygraph import declarative, to_variable
+from paddle.fluid.dygraph import Embedding, Layer, Linear
+
+
+class EmbeddingLayer(object):
+    """
+    Embedding Layer class
+    """
+
+    def __init__(self, dict_size, emb_dim, name="emb", padding_idx=None):
+        """
+        initialize
+        """
+        self.dict_size = dict_size
+        self.emb_dim = emb_dim
+        self.name = name
+        self.padding_idx = padding_idx
+
+    def ops(self):
+        """
+        operation
+        """
+        # TODO(huihuangzheng): The original code set the is_sparse=True, but it
+        # causes crush in dy2stat. Set it to True after fixing it.
+        emb = Embedding(
+            size=[self.dict_size, self.emb_dim],
+            is_sparse=True,
+            padding_idx=self.padding_idx,
+            param_attr=attr.ParamAttr(
+                name=self.name, initializer=fluid.initializer.Xavier()))
+
+        return emb
+
+
+class FCLayer(object):
+    """
+    Fully Connect Layer class
+    """
+
+    def __init__(self, fc_dim, act, name="fc"):
+        """
+        initialize
+        """
+        self.fc_dim = fc_dim
+        self.act = act
+        self.name = name
+
+    def ops(self):
+        """
+        operation
+        """
+        fc = FC(size=self.fc_dim,
+                param_attr=attr.ParamAttr(name="%s.w" % self.name),
+                bias_attr=attr.ParamAttr(name="%s.b" % self.name),
+                act=self.act)
+        return fc
+
+
+class ConcatLayer(object):
+    """
+    Connection Layer class
+    """
+
+    def __init__(self, axis):
+        """
+        initialize
+        """
+        self.axis = axis
+
+    def ops(self, inputs):
+        """
+        operation
+        """
+        concat = fluid.layers.concat(inputs, axis=self.axis)
+        return concat
+
+
+class ReduceMeanLayer(object):
+    """
+    Reduce Mean Layer class
+    """
+
+    def __init__(self):
+        """
+        initialize
+        """
+        pass
+
+    def ops(self, input):
+        """
+        operation
+        """
+        mean = fluid.layers.reduce_mean(input)
+        return mean
+
+
+class CosSimLayer(object):
+    """
+    Cos Similarly Calculate Layer
+    """
+
+    def __init__(self):
+        """
+        initialize
+        """
+        pass
+
+    def ops(self, x, y):
+        """
+        operation
+        """
+        sim = fluid.layers.cos_sim(x, y)
+        return sim
+
+
+class ElementwiseMaxLayer(object):
+    """
+    Elementwise Max Layer class
+    """
+
+    def __init__(self):
+        """
+        initialize
+        """
+        pass
+
+    def ops(self, x, y):
+        """
+        operation
+        """
+        max = fluid.layers.elementwise_max(x, y)
+        return max
+
+
+class ElementwiseAddLayer(object):
+    """
+    Elementwise Add Layer class
+    """
+
+    def __init__(self):
+        """
+        initialize
+        """
+        pass
+
+    def ops(self, x, y):
+        """
+        operation
+        """
+        add = fluid.layers.elementwise_add(x, y)
+        return add
+
+
+class ElementwiseSubLayer(object):
+    """
+    Elementwise Add Layer class
+    """
+
+    def __init__(self):
+        """
+        initialize
+        """
+        pass
+
+    def ops(self, x, y):
+        """
+        operation
+        """
+        sub = fluid.layers.elementwise_sub(x, y)
+        return sub
+
+
+class ConstantLayer(object):
+    """
+    Generate A Constant Layer class
+    """
+
+    def __init__(self):
+        """
+        initialize
+        """
+        pass
+
+    def ops(self, input, shape, dtype, value):
+        """
+        operation
+        """
+        shape = list(shape)
+        input_shape = fluid.layers.shape(input)
+        shape[0] = input_shape[0]
+        constant = fluid.layers.fill_constant(shape, dtype, value)
+        return constant
+
+
+class SoftsignLayer(object):
+    """
+    Softsign Layer class
+    """
+
+    def __init__(self):
+        """
+        initialize
+        """
+        pass
+
+    def ops(self, input):
+        """
+        operation
+        """
+        softsign = fluid.layers.softsign(input)
+        return softsign
+
+
+class FC(Layer):
+    """
+    This interface is used to construct a callable object of the ``FC`` class.
+    For more details, refer to code examples.
+    It creates a fully connected layer in the network. It can take
+    one or multiple ``Tensor`` as its inputs. It creates a Variable called weights for each input tensor,
+    which represents a fully connected weight matrix from each input unit to
+    each output unit. The fully connected layer multiplies each input tensor
+    with its corresponding weight to produce an output Tensor with shape [N, `size`],
+    where N is batch size. If multiple input tensors are given, the results of
+    multiple output tensors with shape [N, `size`] will be summed up. If ``bias_attr``
+    is not None, a bias variable will be created and added to the output.
+    Finally, if ``act`` is not None, it will be applied to the output as well.
+    When the input is single ``Tensor`` :
+    .. math::
+        Out = Act({XW + b})
+    When the input are multiple ``Tensor`` :
+    .. math::
+        Out = Act({\sum_{i=0}^{N-1}X_iW_i + b})
+    In the above equation:
+    * :math:`N`: Number of the input. N equals to len(input) if input is list of ``Tensor`` .
+    * :math:`X_i`: The i-th input ``Tensor`` .
+    * :math:`W_i`: The i-th weights matrix corresponding i-th input tensor.
+    * :math:`b`: The bias parameter created by this layer (if needed).
+    * :math:`Act`: The activation function.
+    * :math:`Out`: The output ``Tensor`` .
+    See below for an example.
+    .. code-block:: text
+        Given:
+            data_1.data = [[[0.1, 0.2]]]
+            data_1.shape = (1, 1, 2) # 1 is batch_size
+            data_2.data = [[[0.1, 0.2, 0.3]]]
+            data_2.shape = (1, 1, 3) # 1 is batch_size
+            fc = FC("fc", 2, num_flatten_dims=2)
+            out = fc(input=[data_1, data_2])
+        Then:
+            out.data = [[[0.182996 -0.474117]]]
+            out.shape = (1, 1, 2)
+    Parameters:
+        
+        size(int): The number of output units in this layer.
+        num_flatten_dims (int, optional): The fc layer can accept an input tensor with more than
+            two dimensions. If this happens, the multi-dimension tensor will first be flattened
+            into a 2-dimensional matrix. The parameter `num_flatten_dims` determines how the input
+            tensor is flattened: the first `num_flatten_dims` (inclusive, index starts from 1)
+            dimensions will be flatten to form the first dimension of the final matrix (height of
+            the matrix), and the rest `rank(X) - num_flatten_dims` dimensions are flattened to
+            form the second dimension of the final matrix (width of the matrix). For example, suppose
+            `X` is a 5-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3.
+            Then, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30]. Default: 1
+        param_attr (ParamAttr or list of ParamAttr, optional): The parameter attribute for learnable
+            weights(Parameter) of this layer. Default: None.
+        bias_attr (ParamAttr or list of ParamAttr, optional): The attribute for the bias
+            of this layer. If it is set to False, no bias will be added to the output units.
+            If it is set to None, the bias is initialized zero. Default: None.
+        act (str, optional): Activation to be applied to the output of this layer. Default: None.
+        is_test(bool, optional): A flag indicating whether execution is in test phase. Default: False.
+        dtype(str, optional): Dtype used for weight, it can be "float32" or "float64". Default: "float32".
+    Attribute:
+        **weight** (list of Parameter): the learnable weights of this layer.
+        **bias** (Parameter or None): the learnable bias of this layer.
+    Returns:
+        None
+    
+    Examples:
+        .. code-block:: python
+          from paddle.fluid.dygraph.base import to_variable
+          import paddle.fluid as fluid
+          from paddle.fluid.dygraph import FC
+          import numpy as np
+          data = np.random.uniform(-1, 1, [30, 10, 32]).astype('float32')
+          with fluid.dygraph.guard():
+              fc = FC("fc", 64, num_flatten_dims=2)
+              data = to_variable(data)
+              conv = fc(data)
+    """
+
+    def __init__(self,
+                 size,
+                 num_flatten_dims=1,
+                 param_attr=None,
+                 bias_attr=None,
+                 act=None,
+                 is_test=False,
+                 dtype="float32"):
+        super(FC, self).__init__(dtype)
+
+        self._size = size
+        self._num_flatten_dims = num_flatten_dims
+        self._dtype = dtype
+        self._param_attr = param_attr
+        self._bias_attr = bias_attr
+        self._act = act
+        self.__w = list()
+
+    def _build_once(self, input):
+        i = 0
+        for inp, param in self._helper.iter_inputs_and_params(input,
+                                                              self._param_attr):
+            input_shape = inp.shape
+
+            param_shape = [
+                reduce(lambda a, b: a * b, input_shape[self._num_flatten_dims:],
+                       1)
+            ] + [self._size]
+            self.__w.append(
+                self.add_parameter(
+                    '_w%d' % i,
+                    self.create_parameter(
+                        attr=param,
+                        shape=param_shape,
+                        dtype=self._dtype,
+                        is_bias=False)))
+            i += 1
+
+        size = list([self._size])
+        self._b = self.create_parameter(
+            attr=self._bias_attr, shape=size, dtype=self._dtype, is_bias=True)
+
+    # TODO(songyouwei): We should remove _w property
+    @property
+    def _w(self, i=0):
+        return self.__w[i]
+
+    @_w.setter
+    def _w(self, value, i=0):
+        assert isinstance(self.__w[i], Variable)
+        self.__w[i].set_value(value)
+
+    @property
+    def weight(self):
+        if len(self.__w) > 1:
+            return self.__w
+        else:
+            return self.__w[0]
+
+    @weight.setter
+    def weight(self, value):
+        if len(self.__w) == 1:
+            self.__w[0] = value
+
+    @property
+    def bias(self):
+        return self._b
+
+    @bias.setter
+    def bias(self, value):
+        self._b = value
+
+    def forward(self, input):
+        mul_results = list()
+        i = 0
+        for inp, param in self._helper.iter_inputs_and_params(input,
+                                                              self._param_attr):
+            tmp = self._helper.create_variable_for_type_inference(self._dtype)
+            self._helper.append_op(
+                type="mul",
+                inputs={"X": inp,
+                        "Y": self.__w[i]},
+                outputs={"Out": tmp},
+                attrs={
+                    "x_num_col_dims": self._num_flatten_dims,
+                    "y_num_col_dims": 1
+                })
+            i += 1
+            mul_results.append(tmp)
+
+        if len(mul_results) == 1:
+            pre_bias = mul_results[0]
+        else:
+            pre_bias = self._helper.create_variable_for_type_inference(
+                self._dtype)
+            self._helper.append_op(
+                type="sum",
+                inputs={"X": mul_results},
+                outputs={"Out": pre_bias},
+                attrs={"use_mkldnn": False})
+
+        if self._b is not None:
+            pre_activation = self._helper.create_variable_for_type_inference(
+                dtype=self._dtype)
+            self._helper.append_op(
+                type='elementwise_add',
+                inputs={'X': [pre_bias],
+                        'Y': [self._b]},
+                outputs={'Out': [pre_activation]},
+                attrs={'axis': self._num_flatten_dims})
+        else:
+            pre_activation = pre_bias
+        # Currently, we don't support inplace in dygraph mode
+        return self._helper.append_activation(pre_activation, act=self._act)
+
+
+class HingeLoss(object):
+    """
+    Hing Loss Calculate class
+    """
+
+    def __init__(self, conf_dict):
+        """
+        initialize
+        """
+        self.margin = conf_dict["loss"]["margin"]
+
+    def compute(self, pos, neg):
+        """
+        compute loss
+        """
+        elementwise_max = ElementwiseMaxLayer()
+        elementwise_add = ElementwiseAddLayer()
+        elementwise_sub = ElementwiseSubLayer()
+        constant = ConstantLayer()
+        reduce_mean = ReduceMeanLayer()
+        loss = reduce_mean.ops(
+            elementwise_max.ops(
+                constant.ops(neg, neg.shape, "float32", 0.0),
+                elementwise_add.ops(
+                    elementwise_sub.ops(neg, pos),
+                    constant.ops(neg, neg.shape, "float32", self.margin))))
+        return loss
+
+
+class BOW(Layer):
+    """
+    BOW
+    """
+
+    def __init__(self, conf_dict):
+        """
+        initialize
+        """
+        super(BOW, self).__init__()
+        self.dict_size = conf_dict["dict_size"]
+        self.task_mode = conf_dict["task_mode"]
+        self.emb_dim = conf_dict["net"]["emb_dim"]
+        self.bow_dim = conf_dict["net"]["bow_dim"]
+        self.seq_len = conf_dict["seq_len"]
+        self.emb_layer = EmbeddingLayer(self.dict_size, self.emb_dim,
+                                        "emb").ops()
+        self.bow_layer = Linear(self.bow_dim, self.bow_dim)
+        self.bow_layer_po = FCLayer(self.bow_dim, None, "fc").ops()
+        self.softmax_layer = FCLayer(2, "softmax", "cos_sim").ops()
+
+    @declarative
+    def forward(self, left, right):
+        """
+        Forward network
+        """
+
+        # embedding layer
+        left_emb = self.emb_layer(left)
+        right_emb = self.emb_layer(right)
+        left_emb = fluid.layers.reshape(
+            left_emb, shape=[-1, self.seq_len, self.bow_dim])
+        right_emb = fluid.layers.reshape(
+            right_emb, shape=[-1, self.seq_len, self.bow_dim])
+
+        bow_left = fluid.layers.reduce_sum(left_emb, dim=1)
+        bow_right = fluid.layers.reduce_sum(right_emb, dim=1)
+        softsign_layer = SoftsignLayer()
+        left_soft = softsign_layer.ops(bow_left)
+        right_soft = softsign_layer.ops(bow_right)
+
+        left_bow = self.bow_layer(left_soft)
+        right_bow = self.bow_layer(right_soft)
+        cos_sim_layer = CosSimLayer()
+        pred = cos_sim_layer.ops(left_bow, right_bow)
+        return left_bow, pred
+
+        # TODO(huihuangzheng): uncomment the following return statements after
+        # we fix it.
+        #
+        # matching layer
+        #if self.task_mode == "pairwise":
+        #    left_bow = self.bow_layer(left_soft)
+        #    right_bow = self.bow_layer(right_soft)
+        #    cos_sim_layer = CosSimLayer()
+        #    pred = cos_sim_layer.ops(left_bow, right_bow)
+        #    return left_bow, pred
+        #else:
+        #    concat_layer = ConcatLayer(1)
+        #    concat = concat_layer.ops([left_soft, right_soft])
+        #    concat_fc = self.bow_layer_po(concat)
+        #    pred = self.softmax_layer(concat_fc)
+        #    return left_soft, pred
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py
index 0e0084aca349e870154c828c7aeb55b017a3ba03..5896d3a29294861bde07a025678a9d78bebf5a6b 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py
@@ -186,11 +186,11 @@ class BMN(fluid.dygraph.Layer):
             act="relu")
 
         # init to speed up
-        self.sample_mask = get_interp1d_mask(
-            self.tscale, self.dscale, self.prop_boundary_ratio, self.num_sample,
-            self.num_sample_perbin)
-        # self.sample_mask = fluid.dygraph.base.to_variable(sample_mask)
-        # self.sample_mask.stop_gradient = True
+        sample_mask = get_interp1d_mask(self.tscale, self.dscale,
+                                        self.prop_boundary_ratio,
+                                        self.num_sample, self.num_sample_perbin)
+        self.sample_mask = fluid.dygraph.base.to_variable(sample_mask)
+        self.sample_mask.stop_gradient = True
 
         self.p_conv3d1 = fluid.dygraph.Conv3D(
             num_channels=128,
@@ -241,12 +241,6 @@ class BMN(fluid.dygraph.Layer):
 
     @declarative
     def forward(self, x):
-        # TODO(Aurelius84): sample_mask is created in `__init__`,
-        # but currently we don't support that. The two lines code
-        # will be removed when support creating var outside of forward.
-        sample_mask = to_variable(self.sample_mask)
-        sample_mask.stop_gradient = True
-
         # Base Module
         x = self.b_conv1(x)
         x = self.b_conv2(x)
@@ -262,7 +256,7 @@ class BMN(fluid.dygraph.Layer):
         # PEM
         xp = self.p_conv1(x)
         # BM layer
-        xp = fluid.layers.matmul(xp, sample_mask)
+        xp = fluid.layers.matmul(xp, self.sample_mask)
         xp = fluid.layers.reshape(
             xp, shape=[0, 0, -1, self.dscale, self.tscale])
 
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cast.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cast.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4cc38b3a601b97d81a556dc37028ef7418dbf26
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cast.py
@@ -0,0 +1,173 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import paddle.fluid as fluid
+from paddle.fluid.dygraph import declarative
+
+SEED = 2020
+np.random.seed(SEED)
+
+
+@declarative
+def test_bool_cast(x):
+    x = fluid.dygraph.to_variable(x)
+    x = bool(x)
+    return x
+
+
+@declarative
+def test_int_cast(x):
+    x = fluid.dygraph.to_variable(x)
+    x = int(x)
+    return x
+
+
+@declarative
+def test_float_cast(x):
+    x = fluid.dygraph.to_variable(x)
+    x = float(x)
+    return x
+
+
+@declarative
+def test_not_var_cast(x):
+    x = int(x)
+    return x
+
+
+@declarative
+def test_mix_cast(x):
+    x = fluid.dygraph.to_variable(x)
+    x = int(x)
+    x = float(x)
+    x = bool(x)
+    x = float(x)
+    return x
+
+
+class TestCastBase(unittest.TestCase):
+    def setUp(self):
+        self.place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        self.prepare()
+        self.set_func()
+
+    def prepare(self):
+        self.input_shape = (16, 32)
+        self.input_dtype = 'float32'
+        self.input = np.random.binomial(
+            4, 0.3, size=np.product(self.input_shape)).reshape(
+                self.input_shape).astype(self.input_dtype)
+        self.cast_dtype = 'bool'
+
+    def set_func(self):
+        self.func = test_bool_cast
+
+    def do_test(self):
+        with fluid.dygraph.guard():
+            res = self.func(self.input)
+            return res
+
+    def test_cast_result(self):
+        res = self.do_test().numpy()
+        self.assertTrue(
+            res.dtype == self.cast_dtype,
+            msg='The target dtype is {}, but the casted dtype is {}.'.format(
+                self.cast_dtype, res.dtype))
+        ref_val = self.input.astype(self.cast_dtype)
+        self.assertTrue(
+            np.allclose(res, ref_val),
+            msg='The casted value is {}.\nThe correct value is {}.'.format(
+                res, ref_val))
+
+
+class TestIntCast(TestCastBase):
+    def prepare(self):
+        self.input_shape = (1, )
+        self.input_dtype = 'float32'
+        self.input = np.random.normal(
+            loc=6, scale=10, size=np.product(self.input_shape)).reshape(
+                self.input_shape).astype(self.input_dtype)
+        self.cast_dtype = 'int32'
+
+    def set_func(self):
+        self.func = test_int_cast
+
+
+class TestFloatCast(TestCastBase):
+    def prepare(self):
+        self.input_shape = (8, 16)
+        self.input_dtype = 'bool'
+        self.input = np.random.binomial(
+            2, 0.5, size=np.product(self.input_shape)).reshape(
+                self.input_shape).astype(self.input_dtype)
+        self.cast_dtype = 'float32'
+
+    def set_func(self):
+        self.func = test_float_cast
+
+
+class TestMixCast(TestCastBase):
+    def prepare(self):
+        self.input_shape = (8, 32)
+        self.input_dtype = 'float32'
+        self.input = np.random.normal(
+            loc=6, scale=10, size=np.product(self.input_shape)).reshape(
+                self.input_shape).astype(self.input_dtype)
+        self.cast_int = 'int'
+        self.cast_float = 'float32'
+        self.cast_bool = 'bool'
+        self.cast_dtype = 'float32'
+
+    def set_func(self):
+        self.func = test_mix_cast
+
+    def test_cast_result(self):
+        res = self.do_test().numpy()
+        self.assertTrue(
+            res.dtype == self.cast_dtype,
+            msg='The target dtype is {}, but the casted dtype is {}.'.format(
+                self.cast_dtype, res.dtype))
+        ref_val = self.input.astype(self.cast_int).astype(
+            self.cast_float).astype(self.cast_bool).astype(self.cast_dtype)
+        self.assertTrue(
+            np.allclose(res, ref_val),
+            msg='The casted value is {}.\nThe correct value is {}.'.format(
+                res, ref_val))
+
+
+class TestNotVarCast(TestCastBase):
+    def prepare(self):
+        self.input = 3.14
+        self.cast_dtype = 'int'
+
+    def set_func(self):
+        self.func = test_not_var_cast
+
+    def test_cast_result(self):
+        res = self.do_test()
+        self.assertTrue(type(res) == int, msg='The casted dtype is not int.')
+        ref_val = int(self.input)
+        self.assertTrue(
+            res == ref_val,
+            msg='The casted value is {}.\nThe correct value is {}.'.format(
+                res, ref_val))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cycle_gan.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cycle_gan.py
index 844438eaf6f7818fe1d4d01bfd5cdcf18a39b688..d6840ed62810e13648b869340a41691aa0e89101 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cycle_gan.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_cycle_gan.py
@@ -32,6 +32,10 @@ import unittest
 import numpy as np
 from PIL import Image, ImageOps
 
+import os
+# Use GPU:0 to elimate the influence of other tasks.
+os.environ["CUDA_VISIBLE_DEVICES"] = "1"
+
 import paddle
 import paddle.fluid as fluid
 from paddle.fluid.dygraph import to_variable, declarative, ProgramTranslator
@@ -40,10 +44,13 @@ from paddle.fluid.dygraph.nn import Conv2D, Conv2DTranspose, BatchNorm
 # Note: Set True to eliminate randomness.
 #     1. For one operation, cuDNN has several algorithms,
 #        some algorithm results are non-deterministic, like convolution algorithms.
+#     2. If include BatchNorm, please set `use_global_stats=True` to avoid using
+#        cudnnBatchNormalizationBackward which is non-deterministic.
 if fluid.is_compiled_with_cuda():
     fluid.set_flags({'FLAGS_cudnn_deterministic': True})
 
-use_cudnn = True
+# set False to speed up training.
+use_cudnn = False
 step_per_epoch = 10
 lambda_A = 10.0
 lambda_B = 10.0
@@ -110,7 +117,7 @@ class Cycle_Gan(fluid.dygraph.Layer):
         return fake_A, fake_B, cyc_A, cyc_B, g_A_loss, g_B_loss, idt_loss_A, idt_loss_B, cyc_A_loss, cyc_B_loss, g_loss
 
     @declarative
-    def disriminatorA(self, input_A, input_B):
+    def discriminatorA(self, input_A, input_B):
         """
         Discriminator A of GAN model.
         """
@@ -324,8 +331,14 @@ class conv2d(fluid.dygraph.Layer):
                 initializer=fluid.initializer.NormalInitializer(
                     loc=0.0, scale=stddev)),
             bias_attr=con_bias_attr)
+        # Note(Aurelius84): The calculation of GPU kernel in BN is non-deterministic, 
+        # failure rate is 1/100 in Dev but seems incremental in CE platform.
+        # If on GPU, we disable BN temporarily.
+        if fluid.is_compiled_with_cuda():
+            norm = False
         if norm:
             self.bn = BatchNorm(
+                use_global_stats=True,  # set True to use deterministic algorithm
                 num_channels=num_filters,
                 param_attr=fluid.ParamAttr(
                     initializer=fluid.initializer.NormalInitializer(1.0, 0.02)),
@@ -379,8 +392,11 @@ class DeConv2D(fluid.dygraph.Layer):
                 initializer=fluid.initializer.NormalInitializer(
                     loc=0.0, scale=stddev)),
             bias_attr=de_bias_attr)
+        if fluid.is_compiled_with_cuda():
+            norm = False
         if norm:
             self.bn = BatchNorm(
+                use_global_stats=True,  # set True to use deterministic algorithm
                 num_channels=num_filters,
                 param_attr=fluid.ParamAttr(
                     initializer=fluid.initializer.NormalInitializer(1.0, 0.02)),
@@ -429,7 +445,6 @@ class ImagePool(object):
 
 
 def reader_creater():
-    # local_random = np.random.RandomState(SEED)
     def reader():
         while True:
             fake_image = np.uint8(
@@ -480,13 +495,8 @@ def optimizer_setting(parameters):
 
 
 def train(args, to_static):
-    # FIXME(Aurelius84): Found diff just on GPU and it disappears when we remove the BatchNorm layers.
-    # In dygraph mode, it still exists with different output while executing the every time.
-
-    # place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() \
-    #     else fluid.CPUPlace()
-
-    place = fluid.CPUPlace()
+    place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() \
+        else fluid.CPUPlace()
 
     program_translator.enable(to_static)
 
@@ -553,8 +563,8 @@ def train(args, to_static):
                 fake_pool_A = to_variable(fake_pool_A)
 
                 # optimize the d_A network
-                rec_B, fake_pool_rec_B = cycle_gan.disriminatorA(data_B,
-                                                                 fake_pool_B)
+                rec_B, fake_pool_rec_B = cycle_gan.discriminatorA(data_B,
+                                                                  fake_pool_B)
                 d_loss_A = (fluid.layers.square(fake_pool_rec_B) +
                             fluid.layers.square(rec_B - 1)) / 2.0
                 d_loss_A = fluid.layers.reduce_mean(d_loss_A)
@@ -581,7 +591,6 @@ def train(args, to_static):
                     idt_loss_A, g_B_loss, cyc_B_loss, idt_loss_B
                 ]
                 cur_batch_loss = [x.numpy()[0] for x in cur_batch_loss]
-                loss_data.append(cur_batch_loss)
 
                 batch_time = time.time() - s_time
                 t_time += batch_time
@@ -593,6 +602,7 @@ def train(args, to_static):
                 if batch_id > args.train_step:
                     break
 
+                loss_data.append(cur_batch_loss)
         return np.array(loss_data)
 
 
@@ -607,8 +617,16 @@ class TestCycleGANModel(unittest.TestCase):
     def test_train(self):
         st_out = self.train(to_static=True)
         dy_out = self.train(to_static=False)
+
+        assert_func = np.allclose
+        # Note(Aurelius84): Because we disable BN on GPU, 
+        # but here we enhance the check on CPU by `np.array_equal`
+        # which means the dy_out and st_out shall be exactly same.
+        if not fluid.is_compiled_with_cuda():
+            assert_func = np.array_equal
+
         self.assertTrue(
-            np.allclose(dy_out, st_out),
+            assert_func(dy_out, st_out),
             msg="dy_out:\n {}\n st_out:\n{}".format(dy_out, st_out))
 
 
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py
new file mode 100644
index 0000000000000000000000000000000000000000..8141f9f462c1682188189ef3cfcef37f576f504c
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py
@@ -0,0 +1,573 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import math
+import time
+import numpy as np
+import unittest
+
+import os
+os.environ["CUDA_VISIBLE_DEVICES"] = "2"
+
+import paddle.fluid as fluid
+from paddle.fluid.dygraph import to_variable
+from paddle.fluid.dygraph import Embedding, Linear, GRUUnit
+from paddle.fluid.dygraph import declarative, ProgramTranslator
+
+SEED = 2020
+
+program_translator = ProgramTranslator()
+
+
+class DynamicGRU(fluid.dygraph.Layer):
+    def __init__(self,
+                 size,
+                 h_0=None,
+                 param_attr=None,
+                 bias_attr=None,
+                 is_reverse=False,
+                 gate_activation='sigmoid',
+                 candidate_activation='tanh',
+                 origin_mode=False,
+                 init_size=None):
+        super(DynamicGRU, self).__init__()
+
+        self.gru_unit = GRUUnit(
+            size * 3,
+            param_attr=param_attr,
+            bias_attr=bias_attr,
+            activation=candidate_activation,
+            gate_activation=gate_activation,
+            origin_mode=origin_mode)
+
+        self.size = size
+        self.h_0 = h_0
+        self.is_reverse = is_reverse
+
+    def forward(self, inputs):
+        # Use `to_variable` to create a copy of global h_0 created not in `DynamicGRU`,
+        # to avoid modify it because `h_0` is both used in other `DynamicGRU`.
+        hidden = to_variable(self.h_0)
+        hidden.stop_gradient = True
+
+        res = []
+        for i in range(inputs.shape[1]):
+            if self.is_reverse:
+                j = fluid.layers.shape(inputs)[1] - 1 - i
+            else:
+                # TODO(Aurelius84): In while block, if the var created in parent block
+                # participates in the calculation of gradient, the result of gradient
+                # is incorrect because each step scope always returns the same value
+                # generated by last step. Here we add 0 to create `j` in while block to
+                # avoid this bug, and working on fixing it in next PR.
+                j = i + 0
+            # FIXME(Aurelius84): see above explanation.
+            hidden = fluid.layers.scale(hidden, 1)
+
+            # See above explanation.
+            # input_ = inputs[:, i:i+1, :]  # original code
+            input_ = fluid.layers.slice(
+                inputs, axes=[1], starts=[j], ends=[j + 1])
+            input_ = fluid.layers.reshape(
+                input_, [-1, input_.shape[2]], inplace=False)
+            hidden, reset, gate = self.gru_unit(input_, hidden)
+            hidden_ = fluid.layers.reshape(
+                hidden, [-1, 1, hidden.shape[1]], inplace=False)
+            res.append(hidden_)
+
+        if self.is_reverse:
+            res = res[::-1]
+        res = fluid.layers.concat(res, axis=1)
+        return res
+
+
+class BiGRU(fluid.dygraph.Layer):
+    def __init__(self, input_dim, grnn_hidden_dim, init_bound, h_0=None):
+        super(BiGRU, self).__init__()
+
+        self.pre_gru = Linear(
+            input_dim=input_dim,
+            output_dim=grnn_hidden_dim * 3,
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Uniform(
+                    low=-init_bound, high=init_bound),
+                regularizer=fluid.regularizer.L2DecayRegularizer(
+                    regularization_coeff=1e-4)))
+
+        self.gru = DynamicGRU(
+            size=grnn_hidden_dim,
+            h_0=h_0,
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Uniform(
+                    low=-init_bound, high=init_bound),
+                regularizer=fluid.regularizer.L2DecayRegularizer(
+                    regularization_coeff=1e-4)))
+
+        self.pre_gru_r = Linear(
+            input_dim=input_dim,
+            output_dim=grnn_hidden_dim * 3,
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Uniform(
+                    low=-init_bound, high=init_bound),
+                regularizer=fluid.regularizer.L2DecayRegularizer(
+                    regularization_coeff=1e-4)))
+
+        self.gru_r = DynamicGRU(
+            size=grnn_hidden_dim,
+            is_reverse=True,
+            h_0=h_0,
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Uniform(
+                    low=-init_bound, high=init_bound),
+                regularizer=fluid.regularizer.L2DecayRegularizer(
+                    regularization_coeff=1e-4)))
+
+    def forward(self, input_feature):
+        res_pre_gru = self.pre_gru(input_feature)
+        res_gru = self.gru(res_pre_gru)
+
+        res_pre_gru_r = self.pre_gru_r(input_feature)
+        res_gru_r = self.gru_r(res_pre_gru_r)
+
+        bi_merge = fluid.layers.concat(input=[res_gru, res_gru_r], axis=-1)
+        return bi_merge
+
+
+class LinearChainCRF(fluid.dygraph.Layer):
+    def __init__(self, param_attr, size=None, is_test=False, dtype='float32'):
+        super(LinearChainCRF, self).__init__()
+
+        self._param_attr = param_attr
+        self._dtype = dtype
+        self._size = size
+        self._is_test = is_test
+        self._transition = self.create_parameter(
+            attr=self._param_attr,
+            shape=[self._size + 2, self._size],
+            dtype=self._dtype)
+
+    @property
+    def weight(self):
+        return self._transition
+
+    @weight.setter
+    def weight(self, value):
+        self._transition = value
+
+    def forward(self, input, label, length=None):
+
+        alpha = self._helper.create_variable_for_type_inference(
+            dtype=self._dtype)
+        emission_exps = self._helper.create_variable_for_type_inference(
+            dtype=self._dtype)
+        transition_exps = self._helper.create_variable_for_type_inference(
+            dtype=self._dtype)
+        log_likelihood = self._helper.create_variable_for_type_inference(
+            dtype=self._dtype)
+        this_inputs = {
+            "Emission": [input],
+            "Transition": self._transition,
+            "Label": [label]
+        }
+        if length is not None:
+            this_inputs['Length'] = [length]
+        self._helper.append_op(
+            type='linear_chain_crf',
+            inputs=this_inputs,
+            outputs={
+                "Alpha": [alpha],
+                "EmissionExps": [emission_exps],
+                "TransitionExps": transition_exps,
+                "LogLikelihood": log_likelihood
+            },
+            attrs={"is_test": self._is_test, })
+        return log_likelihood
+
+
+class CRFDecoding(fluid.dygraph.Layer):
+    def __init__(self, param_attr, size=None, is_test=False, dtype='float32'):
+        super(CRFDecoding, self).__init__()
+
+        self._dtype = dtype
+        self._size = size
+        self._is_test = is_test
+        self._param_attr = param_attr
+        self._transition = self.create_parameter(
+            attr=self._param_attr,
+            shape=[self._size + 2, self._size],
+            dtype=self._dtype)
+
+    @property
+    def weight(self):
+        return self._transition
+
+    @weight.setter
+    def weight(self, value):
+        self._transition = value
+
+    def forward(self, input, label=None, length=None):
+
+        viterbi_path = self._helper.create_variable_for_type_inference(
+            dtype=self._dtype)
+        this_inputs = {
+            "Emission": [input],
+            "Transition": self._transition,
+            "Label": label
+        }
+        if length is not None:
+            this_inputs['Length'] = [length]
+        self._helper.append_op(
+            type='crf_decoding',
+            inputs=this_inputs,
+            outputs={"ViterbiPath": [viterbi_path]},
+            attrs={"is_test": self._is_test, })
+        return viterbi_path
+
+
+class ChunkEval(fluid.dygraph.Layer):
+    def __init__(self, num_chunk_types, chunk_scheme,
+                 excluded_chunk_types=None):
+        super(ChunkEval, self).__init__()
+        self.num_chunk_types = num_chunk_types
+        self.chunk_scheme = chunk_scheme
+        self.excluded_chunk_types = excluded_chunk_types
+
+    def forward(self, input, label, seq_length=None):
+
+        precision = self._helper.create_variable_for_type_inference(
+            dtype="float32")
+        recall = self._helper.create_variable_for_type_inference(
+            dtype="float32")
+        f1_score = self._helper.create_variable_for_type_inference(
+            dtype="float32")
+        num_infer_chunks = self._helper.create_variable_for_type_inference(
+            dtype="int64")
+        num_label_chunks = self._helper.create_variable_for_type_inference(
+            dtype="int64")
+        num_correct_chunks = self._helper.create_variable_for_type_inference(
+            dtype="int64")
+
+        this_input = {"Inference": [input], "Label": [label]}
+        if seq_length is not None:
+            this_input["SeqLength"] = [seq_length]
+
+        self._helper.append_op(
+            type='chunk_eval',
+            inputs=this_input,
+            outputs={
+                "Precision": [precision],
+                "Recall": [recall],
+                "F1-Score": [f1_score],
+                "NumInferChunks": [num_infer_chunks],
+                "NumLabelChunks": [num_label_chunks],
+                "NumCorrectChunks": [num_correct_chunks]
+            },
+            attrs={
+                "num_chunk_types": self.num_chunk_types,
+                "chunk_scheme": self.chunk_scheme,
+                "excluded_chunk_types": self.excluded_chunk_types or []
+            })
+        return (precision, recall, f1_score, num_infer_chunks, num_label_chunks,
+                num_correct_chunks)
+
+
+class LexNet(fluid.dygraph.Layer):
+    def __init__(self, args, length=None):
+        super(LexNet, self).__init__()
+        """
+        define the lexical analysis network structure
+        word: stores the input of the model
+        for_infer: a boolean value, indicating if the model to be created is for training or predicting.
+
+        return:
+            for infer: return the prediction
+            otherwise: return the prediction
+        """
+        self.word_emb_dim = args.word_emb_dim
+        self.vocab_size = args.vocab_size
+        self.num_labels = args.num_labels
+        self.grnn_hidden_dim = args.grnn_hidden_dim
+        self.emb_lr = args.emb_learning_rate if 'emb_learning_rate' in dir(
+            args) else 1.0
+        self.crf_lr = args.emb_learning_rate if 'crf_learning_rate' in dir(
+            args) else 1.0
+        self.bigru_num = args.bigru_num
+        self.init_bound = 0.1
+
+        self.word_embedding = Embedding(
+            size=[self.vocab_size, self.word_emb_dim],
+            dtype='float32',
+            param_attr=fluid.ParamAttr(
+                learning_rate=self.emb_lr,
+                name="word_emb",
+                initializer=fluid.initializer.Uniform(
+                    low=-self.init_bound, high=self.init_bound)))
+
+        h_0 = np.zeros((args.batch_size, self.grnn_hidden_dim), dtype="float32")
+        h_0 = to_variable(h_0)
+
+        self.bigru_units = []
+        for i in range(self.bigru_num):
+            if i == 0:
+                self.bigru_units.append(
+                    self.add_sublayer(
+                        "bigru_units%d" % i,
+                        BiGRU(
+                            self.grnn_hidden_dim,
+                            self.grnn_hidden_dim,
+                            self.init_bound,
+                            h_0=h_0)))
+            else:
+                self.bigru_units.append(
+                    self.add_sublayer(
+                        "bigru_units%d" % i,
+                        BiGRU(
+                            self.grnn_hidden_dim * 2,
+                            self.grnn_hidden_dim,
+                            self.init_bound,
+                            h_0=h_0)))
+
+        self.fc = Linear(
+            input_dim=self.grnn_hidden_dim * 2,
+            output_dim=self.num_labels,
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Uniform(
+                    low=-self.init_bound, high=self.init_bound),
+                regularizer=fluid.regularizer.L2DecayRegularizer(
+                    regularization_coeff=1e-4)))
+
+        self.linear_chain_crf = LinearChainCRF(
+            param_attr=fluid.ParamAttr(
+                name='linear_chain_crfw', learning_rate=self.crf_lr),
+            size=self.num_labels)
+
+        self.crf_decoding = CRFDecoding(
+            param_attr=fluid.ParamAttr(
+                name='crfw', learning_rate=self.crf_lr),
+            size=self.num_labels)
+        # share weight
+        self.crf_decoding.weight = self.linear_chain_crf.weight
+
+    @declarative
+    def forward(self, word, target, length=None):
+        """
+        Configure the network
+        """
+        word_embed = self.word_embedding(word)
+        input_feature = word_embed
+
+        for i in range(self.bigru_num):
+            bigru_output = self.bigru_units[i](input_feature)
+            input_feature = bigru_output
+
+        emission = self.fc(bigru_output)
+
+        crf_cost = self.linear_chain_crf(
+            input=emission, label=target, length=length)
+        avg_cost = fluid.layers.mean(x=crf_cost)
+        crf_decode = self.crf_decoding(input=emission, length=length)
+        return avg_cost, crf_decode
+
+
+class Args(object):
+    epoch = 1
+    batch_size = 4
+    vocab_size = 100
+    num_labels = 10
+    word_emb_dim = 128
+    grnn_hidden_dim = 128
+    base_learning_rate = 0.01
+    bigru_num = 2
+    print_steps = 1
+    model_save_dir = "./lac_model"
+    dy_param_path = "./lac_dy_param"
+
+
+def get_random_input_data(batch_size, vocab_size, num_labels, max_seq_len=64):
+    local_random = np.random.RandomState(SEED)
+    padding_id = np.int64(0)
+    iter_num = 5
+
+    def __reader__():
+        batch, init_lens = [], []
+        for i in range(iter_num * batch_size):
+            cur_len = local_random.randint(3, max_seq_len)
+            word_ids = local_random.randint(0, vocab_size,
+                                            [cur_len]).astype('int64').tolist()
+            label_ids = local_random.randint(0, num_labels,
+                                             [cur_len]).astype('int64').tolist()
+            batch.append((word_ids, label_ids))
+            init_lens.append(cur_len)
+            if len(batch) == batch_size:
+                batch_max_len = min(max(init_lens), max_seq_len)
+                new_batch = []
+                for words_len, (word_ids, label_ids) in zip(init_lens, batch):
+                    word_ids = word_ids[0:batch_max_len]
+                    words_len = np.int64(len(word_ids))
+                    word_ids += [
+                        padding_id for _ in range(batch_max_len - words_len)
+                    ]
+                    label_ids = label_ids[0:batch_max_len]
+                    label_ids += [
+                        padding_id for _ in range(batch_max_len - words_len)
+                    ]
+                    assert len(word_ids) == len(label_ids)
+                    new_batch.append((word_ids, label_ids, words_len))
+                yield new_batch
+                batch, init_lens = [], []
+
+    return __reader__
+
+
+def create_dataloader(reader, place):
+    data_loader = fluid.io.DataLoader.from_generator(
+        capacity=16, use_double_buffer=True, iterable=True)
+
+    data_loader.set_sample_list_generator(reader, places=place)
+
+    return data_loader
+
+
+def do_train(args, to_static):
+    program_translator.enable(to_static)
+    place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda(
+    ) else fluid.CPUPlace()
+    with fluid.dygraph.guard(place):
+        fluid.default_startup_program().random_seed = SEED
+        fluid.default_main_program().random_seed = SEED
+
+        reader = get_random_input_data(args.batch_size, args.vocab_size,
+                                       args.num_labels)
+        train_loader = create_dataloader(reader, place)
+
+        model = LexNet(args)
+        optimizer = fluid.optimizer.AdamOptimizer(
+            learning_rate=args.base_learning_rate,
+            parameter_list=model.parameters())
+        chunk_eval = ChunkEval(
+            int(math.ceil((args.num_labels - 1) / 2.0)), "IOB")
+
+        step = 0
+        chunk_evaluator = fluid.metrics.ChunkEvaluator()
+        chunk_evaluator.reset()
+
+        loss_data = []
+        for epoch_id in range(args.epoch):
+            for batch in train_loader():
+                words, targets, length = batch
+                start_time = time.time()
+                avg_cost, crf_decode = model(words, targets, length)
+                loss_data.append(avg_cost.numpy()[0])
+
+                # backward and optimization
+                avg_cost.backward()
+                optimizer.minimize(avg_cost)
+                model.clear_gradients()
+                end_time = time.time()
+
+                if step % args.print_steps == 0:
+                    (precision, recall, f1_score, num_infer_chunks,
+                     num_label_chunks, num_correct_chunks) = chunk_eval(
+                         input=crf_decode, label=targets, seq_length=length)
+                    outputs = [avg_cost, precision, recall, f1_score]
+                    avg_cost, precision, recall, f1_score = [
+                        np.mean(x.numpy()) for x in outputs
+                    ]
+
+                    print(
+                        "[train] step = %d, loss = %f, P: %f, R: %f, F1: %f, elapsed time %f"
+                        % (step, avg_cost, precision, recall, f1_score,
+                           end_time - start_time))
+
+                step += 1
+        # save inference model
+        if to_static:
+            program_translator.save_inference_model(
+                dirname=args.model_save_dir, feed=[0, 2], fetch=[1])
+        else:
+            fluid.dygraph.save_dygraph(model.state_dict(), args.dy_param_path)
+
+        return np.array(loss_data)
+
+
+class TestLACModel(unittest.TestCase):
+    def setUp(self):
+        self.args = Args()
+        self.place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+
+    def train(self, to_static):
+        out = do_train(self.args, to_static)
+        return out
+
+    def test_train(self):
+        st_out = self.train(to_static=True)
+        dy_out = self.train(to_static=False)
+        self.assertTrue(
+            np.allclose(dy_out, st_out),
+            msg="dygraph output:\n{},\nstatic output:\n {}.".format(dy_out,
+                                                                    st_out))
+        # Prediction needs trained models, so put `test_predict` at last of `test_train`
+        self.verify_predict()
+
+    def verify_predict(self):
+        reader = get_random_input_data(
+            self.args.batch_size, self.args.vocab_size, self.args.num_labels)
+        for batch in reader():
+            batch = [np.vstack(var) for var in zip(*batch)]
+            dy_pre = self.predict_dygraph(batch)
+            st_pre = self.predict_static(batch)
+            self.assertTrue(
+                np.allclose(dy_pre, st_pre),
+                msg="dy_pre:\n {}\n, st_pre: \n{}.".format(dy_pre, st_pre))
+
+    def predict_dygraph(self, batch):
+        words, targets, length = batch
+        program_translator.enable(False)
+        with fluid.dygraph.guard(self.place):
+            model = LexNet(self.args)
+            # load dygraph trained parameters
+            model_dict, _ = fluid.load_dygraph(self.args.dy_param_path +
+                                               ".pdparams")
+            model.set_dict(model_dict)
+            model.eval()
+
+            _, pred_res = model(
+                to_variable(words), to_variable(targets), to_variable(length))
+
+            return pred_res.numpy()
+
+    def predict_static(self, batch):
+        """
+        LAC model contains h_0 created in `__init__` that is necessary for inferring.
+        Load inference model to test it's ok for prediction.
+        """
+        exe = fluid.Executor(self.place)
+        # load inference model
+        [inference_program, feed_target_names,
+         fetch_targets] = fluid.io.load_inference_model(
+             self.args.model_save_dir, executor=exe)
+
+        words, targets, length = batch
+        pred_res = exe.run(
+            inference_program,
+            feed={feed_target_names[0]: words,
+                  feed_target_names[1]: length},
+            fetch_list=fetch_targets)
+        return pred_res[0]
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py
index 71aed18c105b858b9defb70e0daf46af5f6ce804..09be10e6c8a7e9b676e434b410f702c3fe7bdb91 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py
@@ -26,6 +26,7 @@ from paddle.fluid.dygraph import to_variable
 from paddle.fluid.dygraph.nn import Conv2D, Linear, Pool2D
 from paddle.fluid.optimizer import AdamOptimizer
 from paddle.fluid.dygraph.jit import declarative
+from paddle.fluid.dygraph.io import VARIABLE_FILENAME
 from paddle.fluid.dygraph.dygraph_to_static import ProgramTranslator
 
 SEED = 2020
@@ -107,12 +108,9 @@ class MNIST(fluid.dygraph.Layer):
             loss = fluid.layers.cross_entropy(x, label)
             avg_loss = fluid.layers.mean(loss)
 
-        # TODO: Uncomment code after "return" statement can be transformed correctly.
-
-        #     return x, acc, avg_loss
-        # else:
-        #     return x
-        return x, acc, avg_loss
+            return x, acc, avg_loss
+        else:
+            return x
 
     def inference(self, inputs):
         x = self._simple_img_conv_pool_1(inputs)
@@ -201,6 +199,9 @@ class TestMNISTWithDeclarative(TestMNIST):
                         self.check_save_inference_model([dy_x_data, y_data],
                                                         prog_trans, to_static,
                                                         prediction)
+                        # new save load check
+                        self.check_jit_save_load(mnist, [dy_x_data], [img],
+                                                 to_static, prediction)
                         break
         return loss_data
 
@@ -224,6 +225,45 @@ class TestMNISTWithDeclarative(TestMNIST):
 
         return np.array(results[0])
 
+    def check_jit_save_load(self, model, inputs, input_spec, to_static, gt_out):
+        if to_static:
+            infer_model_path = "./test_mnist_inference_model_by_jit_save"
+            configs = fluid.dygraph.jit.SaveLoadConfig()
+            configs.output_spec = [gt_out]
+            fluid.dygraph.jit.save(
+                layer=model,
+                model_path=infer_model_path,
+                input_spec=input_spec,
+                configs=configs)
+            # load in static mode
+            static_infer_out = self.jit_load_and_run_inference_static(
+                infer_model_path, inputs)
+            self.assertTrue(np.allclose(gt_out.numpy(), static_infer_out))
+            # load in dygraph mode
+            dygraph_infer_out = self.jit_load_and_run_inference_dygraph(
+                infer_model_path, inputs)
+            self.assertTrue(np.allclose(gt_out.numpy(), dygraph_infer_out))
+
+    @switch_to_static_graph
+    def jit_load_and_run_inference_static(self, model_path, inputs):
+        exe = fluid.Executor(self.place)
+        [inference_program, feed_target_names,
+         fetch_targets] = fluid.io.load_inference_model(
+             dirname=model_path,
+             executor=exe,
+             params_filename=VARIABLE_FILENAME)
+        assert len(inputs) == len(feed_target_names)
+        results = exe.run(inference_program,
+                          feed=dict(zip(feed_target_names, inputs)),
+                          fetch_list=fetch_targets)
+
+        return np.array(results[0])
+
+    def jit_load_and_run_inference_dygraph(self, model_path, inputs):
+        infer_net = fluid.dygraph.jit.load(model_path)
+        pred = infer_net(inputs[0])
+        return pred.numpy()
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py
index 88e57e206143ab88932160303bd284f312cb1edf..33b5860d7fd1fcdfa13743fc8d1edce6d00e77a4 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py
@@ -407,12 +407,14 @@ def create_optimizer(args, parameter_list):
     return optimizer
 
 
-def fake_data_reader(batch_size, lable_size):
+def fake_data_reader(batch_size, label_size):
+    local_random = np.random.RandomState(SEED)
+
     def reader():
         batch_data = []
         while True:
-            img = np.random.random([3, 224, 224]).astype('float32')
-            label = np.random.randint(0, lable_size, [1]).astype('int64')
+            img = local_random.random_sample([3, 224, 224]).astype('float32')
+            label = local_random.randint(0, label_size, [1]).astype('int64')
             batch_data.append([img, label])
             if len(batch_data) == batch_size:
                 yield batch_data
@@ -517,10 +519,10 @@ class TestMobileNet(unittest.TestCase):
             np.allclose(dy_out, st_out),
             msg="dy_out: {}, st_out: {}".format(dy_out, st_out))
 
-    def test_mobileNetV1(self):
+    def test_mobileNet(self):
+        # MobileNet-V1
         self.assert_same_loss("MobileNetV1")
-
-    def test_mobileNetV2(self):
+        # MobileNet-V2
         self.assert_same_loss("MobileNetV2")
 
 
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py
index 3cf8f5b71d7760e9cfea11049f02e07ee31a8087..873d9ecb53549e9d6a3982ca4528e63526bd3a0d 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py
@@ -63,6 +63,14 @@ def get_source_code(func):
 class StaticCode1():
     # TODO: Transform return statement
     def dyfunc_with_if_else(x_v, label=None):
+        __return_1 = fluid.layers.fill_constant(
+            shape=[1], dtype='bool', value=False)
+        __return_0 = fluid.layers.fill_constant(
+            shape=[1], dtype='bool', value=False)
+        __return_value_init_0 = fluid.layers.fill_constant(
+            shape=[1], dtype='float64', value=0.0)
+        __return_value_0 = __return_value_init_0
+
         def true_fn_0(x_v):
             x_v = x_v - 1
             return x_v
@@ -75,45 +83,95 @@ class StaticCode1():
             fluid.layers.mean(x_v)[0] > 5, true_fn_0, false_fn_0, (x_v, ),
             (x_v, ), (x_v, ))
 
-        def true_fn_1(label, x_v):
+        def true_fn_1(__return_0, __return_value_0, label, x_v):
             loss = fluid.layers.cross_entropy(x_v, label)
-            return loss
-            return
-
-        def false_fn_1():
-            return
-
-        fluid.dygraph.dygraph_to_static.convert_operators.convert_ifelse(
-            label is not None, true_fn_1, false_fn_1, (label, x_v), (), ())
-        return x_v
+            __return_0 = fluid.layers.fill_constant(
+                shape=[1], dtype='bool', value=True)
+            __return_value_0 = loss
+            return __return_0, __return_value_0
+
+        def false_fn_1(__return_0, __return_value_0):
+            return __return_0, __return_value_0
+
+        __return_0, __return_value_0 = (
+            fluid.dygraph.dygraph_to_static.convert_operators.convert_ifelse(
+                label is not None, true_fn_1, false_fn_1,
+                (__return_0, __return_value_0, label, x_v),
+                (__return_0, __return_value_0), (__return_0, __return_value_0)))
+
+        def true_fn_2(__return_1, __return_value_0, x_v):
+            __return_1 = fluid.layers.fill_constant(
+                shape=[1], dtype='bool', value=True)
+            __return_value_0 = x_v
+            return __return_1, __return_value_0
+
+        def false_fn_2(__return_1, __return_value_0):
+            return __return_1, __return_value_0
+
+        __return_1, __return_value_0 = (
+            fluid.dygraph.dygraph_to_static.convert_operators.convert_ifelse(
+                fluid.dygraph.dygraph_to_static.convert_operators.
+                convert_logical_not(__return_0), true_fn_2, false_fn_2,
+                (__return_1, __return_value_0, x_v),
+                (__return_1, __return_value_0), (__return_1, __return_value_0)))
+        return __return_value_0
 
 
 class StaticCode2():
     # TODO: Transform return statement
     def dyfunc_with_if_else(x_v, label=None):
-        def true_fn_2(x_v):
+        __return_3 = fluid.layers.fill_constant(
+            shape=[1], dtype='bool', value=False)
+        __return_2 = fluid.layers.fill_constant(
+            shape=[1], dtype='bool', value=False)
+        __return_value_init_1 = fluid.layers.fill_constant(
+            shape=[1], dtype='float64', value=0.0)
+        __return_value_1 = __return_value_init_1
+
+        def true_fn_3(x_v):
             x_v = x_v - 1
             return x_v
 
-        def false_fn_2(x_v):
+        def false_fn_3(x_v):
             x_v = x_v + 1
             return x_v
 
         x_v = fluid.dygraph.dygraph_to_static.convert_operators.convert_ifelse(
-            fluid.layers.mean(x_v)[0] > 5, true_fn_2, false_fn_2, (x_v, ),
+            fluid.layers.mean(x_v)[0] > 5, true_fn_3, false_fn_3, (x_v, ),
             (x_v, ), (x_v, ))
 
-        def true_fn_3(label, x_v):
+        def true_fn_4(__return_2, __return_value_1, label, x_v):
             loss = fluid.layers.cross_entropy(x_v, label)
-            return loss
-            return
-
-        def false_fn_3():
-            return
-
-        fluid.dygraph.dygraph_to_static.convert_operators.convert_ifelse(
-            label is not None, true_fn_3, false_fn_3, (label, x_v), (), ())
-        return x_v
+            __return_2 = fluid.layers.fill_constant(
+                shape=[1], dtype='bool', value=True)
+            __return_value_1 = loss
+            return __return_2, __return_value_1
+
+        def false_fn_4(__return_2, __return_value_1):
+            return __return_2, __return_value_1
+
+        __return_2, __return_value_1 = (
+            fluid.dygraph.dygraph_to_static.convert_operators.convert_ifelse(
+                label is not None, true_fn_4, false_fn_4,
+                (__return_2, __return_value_1, label, x_v),
+                (__return_2, __return_value_1), (__return_2, __return_value_1)))
+
+        def true_fn_5(__return_3, __return_value_1, x_v):
+            __return_3 = fluid.layers.fill_constant(
+                shape=[1], dtype='bool', value=True)
+            __return_value_1 = x_v
+            return __return_3, __return_value_1
+
+        def false_fn_5(__return_3, __return_value_1):
+            return __return_3, __return_value_1
+
+        __return_3, __return_value_1 = (
+            fluid.dygraph.dygraph_to_static.convert_operators.convert_ifelse(
+                fluid.dygraph.dygraph_to_static.convert_operators.
+                convert_logical_not(__return_2), true_fn_5, false_fn_5,
+                (__return_3, __return_value_1, x_v),
+                (__return_3, __return_value_1), (__return_3, __return_value_1)))
+        return __return_value_1
 
 
 class NetWithError(fluid.dygraph.layers.Layer):
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_reinforcement_learning.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_reinforcement_learning.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f753cd5cfc49c08f82b3594f594c1f9a5c2d48c
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_reinforcement_learning.py
@@ -0,0 +1,218 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gym
+import math
+import itertools
+import numpy as np
+import paddle.fluid as fluid
+import paddle.fluid.dygraph.nn as nn
+from paddle.fluid.dygraph import to_variable, Layer
+from paddle.fluid.dygraph import declarative, ProgramTranslator
+
+import unittest
+
+SEED = 2020
+program_translator = ProgramTranslator()
+
+
+class Policy(Layer):
+    def __init__(self):
+        super(Policy, self).__init__()
+
+        self.affine1 = nn.Linear(4, 128)
+        self.affine2 = nn.Linear(128, 2)
+        self.dropout_ratio = 0.6
+
+        self.saved_log_probs = []
+        self.rewards = []
+
+    @declarative
+    def forward(self, x):
+        x = fluid.layers.reshape(x, shape=[1, 4])
+        x = self.affine1(x)
+        x = fluid.layers.dropout(x, self.dropout_ratio)
+        x = fluid.layers.relu(x)
+        action_scores = self.affine2(x)
+
+        log_prob = fluid.layers.softmax(action_scores, axis=1)
+
+        return log_prob
+
+
+class Args(object):
+    gamma = 0.99
+    log_interval = 1
+    train_step = 10
+
+
+def train(args, place, to_static):
+    program_translator.enable(to_static)
+
+    env = gym.make('CartPole-v0')
+    env.seed(SEED)
+
+    with fluid.dygraph.guard(place):
+        fluid.default_main_program().random_seed = SEED
+        fluid.default_startup_program().random_seed = SEED
+        local_random = np.random.RandomState(SEED)
+
+        policy = Policy()
+
+        eps = np.finfo(np.float32).eps.item()
+        optimizer = fluid.optimizer.AdamaxOptimizer(
+            learning_rate=1e-2, parameter_list=policy.parameters())
+
+        def get_mean_and_std(values=[]):
+            n = 0.
+            s = 0.
+            for val in values:
+                s += val
+                n += 1
+            mean = s / n
+
+            std = 0.
+            for val in values:
+                std += (val - mean) * (val - mean)
+            std /= n
+            std = math.sqrt(std)
+
+            return mean, std
+
+        def sample_action(probs):
+            sample = local_random.random_sample()
+            idx = 0
+
+            while idx < len(probs) and sample > probs[idx]:
+                sample -= probs[idx]
+                idx += 1
+            mask = [0.] * len(probs)
+            mask[idx] = 1.
+
+            return idx, np.array([mask]).astype("float32")
+
+        def choose_best_action(probs):
+            idx = 0 if probs[0] > probs[1] else 1
+            mask = [1., 0.] if idx == 0 else [0., 1.]
+
+            return idx, np.array([mask]).astype("float32")
+
+        def select_action(state):
+            state = to_variable(state)
+            state.stop_gradient = True
+            loss_probs = policy(state)
+            # print(loss_probs.name)
+            probs = loss_probs.numpy()
+
+            action, _mask = sample_action(probs[0])
+            mask = to_variable(_mask)
+            mask.stop_gradient = True
+
+            loss_probs = fluid.layers.log(loss_probs)
+            loss_probs = fluid.layers.elementwise_mul(loss_probs, mask)
+            loss_probs = fluid.layers.reduce_sum(loss_probs, dim=-1)
+
+            policy.saved_log_probs.append(loss_probs)
+            return action, loss_probs
+
+        def finish_episode():
+            R = 0
+            policy_loss = []
+            returns = []
+            for r in policy.rewards[::-1]:
+                R = r + args.gamma * R
+                returns.insert(0, R)
+
+            mean, std = get_mean_and_std(returns)
+
+            returns = np.array(returns).astype("float32")
+            returns = (returns - mean) / (std + eps)
+
+            # calculate policy loss of each step.
+            for log_prob, R in zip(policy.saved_log_probs, returns):
+                log_prob_numpy = log_prob.numpy()
+
+                R_numpy = np.ones_like(log_prob_numpy).astype("float32")
+                _R = -1 * R * R_numpy
+                _R = to_variable(_R)
+                _R.stop_gradient = True
+                cur_loss = fluid.layers.elementwise_mul(_R, log_prob)
+                policy_loss.append(cur_loss)
+
+            policy_loss = fluid.layers.concat(policy_loss)
+            policy_loss = fluid.layers.reduce_sum(policy_loss)
+
+            policy_loss.backward()
+            optimizer.minimize(policy_loss)
+            policy.clear_gradients()
+
+            del policy.rewards[:]
+            del policy.saved_log_probs[:]
+
+            return returns
+
+        loss_data = []
+        running_reward = 10
+        for i_episode in itertools.count(1):
+            state, ep_reward = env.reset(), 0
+            # TODO(Aurelius84): In RL, we continuously select actions with multiple steps, 
+            # then accumulate loss to apply optimization. But currently all vars shared with 
+            # the same inner scope, which has problem in backward. I will fix it in next PR.
+            for t in range(1, 2):  # default 1000
+                state = np.array(state).astype("float32")
+                action, loss = select_action(state)
+                state, reward, done, _ = env.step(action)
+
+                # log loss_probs
+                loss_data.append(loss.numpy()[0])
+
+                policy.rewards.append(reward)
+                ep_reward += reward
+
+                if done:
+                    break
+
+            # sum loss and apply optimization
+            returns = finish_episode()
+
+            running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
+            if i_episode % args.log_interval == 0:
+                print(
+                    'Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}\t loss_probs: {}'.
+                    format(i_episode, ep_reward, running_reward,
+                           loss.numpy()[0]))
+
+            if i_episode > args.train_step:
+                break
+
+        return np.array(loss_data)
+
+
+class TestDeclarative(unittest.TestCase):
+    def setUp(self):
+        self.place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() \
+            else fluid.CPUPlace()
+
+        self.args = Args()
+
+    def test_train(self):
+        st_out = train(self.args, self.place, to_static=True)
+        dy_out = train(self.args, self.place, to_static=False)
+        self.assertTrue(
+            np.allclose(st_out, dy_out),
+            msg="dy_out:\n {}\n st_out:\n{}\n".format(dy_out, st_out))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_return.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_return.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f4f82146645ded6f345abc7d17b1724d9c3a8b9
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_return.py
@@ -0,0 +1,259 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.dygraph import declarative
+from paddle.fluid.dygraph import ProgramTranslator
+
+from ifelse_simple_func import dyfunc_with_if_else
+
+SEED = 2020
+np.random.seed(SEED)
+
+
+@declarative
+def test_return_base(x):
+    x = fluid.dygraph.to_variable(x)
+    return x
+
+
+@declarative
+def test_inside_func_base(x):
+    x = fluid.dygraph.to_variable(x)
+
+    def inner_func(x):
+        return x
+
+    return inner_func(x)
+
+
+@declarative
+def test_return_if(x):
+    x = fluid.dygraph.to_variable(x)
+    if x < 0:
+        x -= 1
+        return -x
+    x += 3
+    return x
+
+
+@declarative
+def test_return_if_else(x):
+    x = fluid.dygraph.to_variable(x)
+    if x > 0:
+        x += 10086
+        return x
+        x -= 3  # useless statement to test our code can handle it.
+    else:
+        x += 6666
+        return x
+        x -= 8888  # useless statement to test our code can handle it.
+
+
+@declarative
+def test_return_in_while(x):
+    x = fluid.dygraph.to_variable(x)
+    i = fluid.layers.fill_constant(shape=[1], dtype='int32', value=0)
+    while i < 10:
+        i += 1
+        if i > 5:
+            x += 110
+            return x
+        x += i
+    return x
+
+
+@declarative
+def test_return_in_for(x):
+    x = fluid.dygraph.to_variable(x)
+    for i in range(10):
+        if i <= 4:
+            x += 1
+            continue
+        else:
+            return x + 10086
+    return x - 1
+
+
+@declarative
+def test_recursive_return(x):
+    x = fluid.dygraph.to_variable(x)
+    return dyfunc_with_if_else(x)
+
+
+@declarative
+def test_return_different_length_if_body(x):
+    x = fluid.dygraph.to_variable(x)
+    y = x + 1
+    if x > 0:
+        # x = to_variable(np.ones(1)) so it will return here
+        return x, y
+    else:
+        return x
+
+
+@declarative
+def test_return_different_length_else(x):
+    x = fluid.dygraph.to_variable(x)
+    y = x + 1
+    if x < 0:
+        return x, y
+    else:
+        # x = to_variable(np.ones(1)) so it will return here
+        return x
+
+
+@declarative
+def test_no_return(x):
+    x = fluid.dygraph.to_variable(x)
+    y = x + 1
+
+
+@declarative
+def test_return_none(x):
+    x = fluid.dygraph.to_variable(x)
+    y = x + 1
+    if x > 0:
+        # x = to_variable(np.ones(1)) so it will return here
+        return None
+    else:
+        return x, y
+
+
+@declarative
+def test_return_no_variable(x):
+    x = fluid.dygraph.to_variable(x)
+    y = x + 1
+    if x < 0:
+        return x, y
+    else:
+        # x = to_variable(np.ones(1)) so it will return here
+        return
+
+
+class TestReturnBase(unittest.TestCase):
+    def setUp(self):
+        self.input = np.ones((1)).astype('int32')
+        self.place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        self.init_dygraph_func()
+        self.program_translator = ProgramTranslator()
+
+    def init_dygraph_func(self):
+        self.dygraph_func = test_return_base
+
+    def run_dygraph_mode(self):
+        self.program_translator.enable(False)
+        with fluid.dygraph.guard():
+            res = self.dygraph_func(self.input)
+            if isinstance(res, (tuple)):
+                return tuple(r.numpy() for r in res)
+            elif isinstance(res, core.VarBase):
+                return res.numpy()
+            return res
+
+    def run_static_mode(self):
+        self.program_translator.enable(True)
+        with fluid.dygraph.guard():
+            res = self.dygraph_func(self.input)
+            if isinstance(res, tuple):
+                return tuple(r.numpy() for r in res)
+            elif isinstance(res, core.VarBase):
+                return res.numpy()
+            return res
+
+    def test_transformed_static_result(self):
+        dygraph_res = self.run_dygraph_mode()
+        static_res = self.run_static_mode()
+        if isinstance(dygraph_res, tuple):
+            self.assertTrue(isinstance(static_res, tuple))
+            self.assertEqual(len(dygraph_res), len(static_res))
+            for i in range(len(dygraph_res)):
+                self.assertTrue(
+                    np.allclose(dygraph_res[i], static_res[i]),
+                    msg='dygraph res is {}\nstatic_res is {}'.format(
+                        dygraph_res[i], static_res[i]))
+
+        elif isinstance(dygraph_res, np.ndarray):
+            self.assertTrue(
+                np.allclose(dygraph_res, static_res),
+                msg='dygraph res is {}\nstatic_res is {}'.format(dygraph_res,
+                                                                 static_res))
+        else:
+            self.assertEqual(dygraph_res, static_res)
+
+
+class TestInsideFuncBase(TestReturnBase):
+    def init_dygraph_func(self):
+        self.dygraph_func = test_inside_func_base
+
+
+class TestReturnIf(TestReturnBase):
+    def init_dygraph_func(self):
+        self.dygraph_func = test_return_if
+
+
+class TestReturnIfElse(TestReturnBase):
+    def init_dygraph_func(self):
+        self.dygraph_func = test_return_if_else
+
+
+class TestReturnInWhile(TestReturnBase):
+    def init_dygraph_func(self):
+        self.dygraph_func = test_return_in_while
+
+
+class TestReturnInFor(TestReturnBase):
+    def init_dygraph_func(self):
+        self.dygraph_func = test_return_in_for
+
+
+class TestRecursiveReturn(TestReturnBase):
+    def init_dygraph_func(self):
+        self.input = self.input.astype(np.float32)
+        self.dygraph_func = test_recursive_return
+
+
+class TestReturnDifferentLengthIfBody(TestReturnBase):
+    def init_dygraph_func(self):
+        self.dygraph_func = test_return_different_length_if_body
+
+
+class TestReturnDifferentLengthElse(TestReturnBase):
+    def init_dygraph_func(self):
+        self.dygraph_func = test_return_different_length_else
+
+
+class TestNoReturn(TestReturnBase):
+    def init_dygraph_func(self):
+        self.dygraph_func = test_no_return
+
+
+class TestReturnNone(TestReturnBase):
+    def init_dygraph_func(self):
+        self.dygraph_func = test_return_none
+
+
+class TestReturnNoVariable(TestReturnBase):
+    def init_dygraph_func(self):
+        self.dygraph_func = test_return_no_variable
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd5a58be26be43996bbb1f80557512bf974de52f
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py
@@ -0,0 +1,357 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import time
+import unittest
+import numpy as np
+
+import paddle.fluid as fluid
+from paddle.fluid.dygraph.nn import Conv2D, Linear, Embedding
+from paddle.fluid.dygraph import to_variable, ProgramTranslator, declarative
+
+from test_lac import DynamicGRU
+
+SEED = 2020
+program_translator = ProgramTranslator()
+
+# Note: Set True to eliminate randomness.
+#     1. For one operation, cuDNN has several algorithms,
+#        some algorithm results are non-deterministic, like convolution algorithms.
+if fluid.is_compiled_with_cuda():
+    fluid.set_flags({'FLAGS_cudnn_deterministic': True})
+
+
+class SimpleConvPool(fluid.dygraph.Layer):
+    def __init__(self,
+                 num_channels,
+                 num_filters,
+                 filter_size,
+                 use_cudnn=True,
+                 batch_size=None):
+        super(SimpleConvPool, self).__init__()
+        self.batch_size = batch_size
+        self._conv2d = Conv2D(
+            num_channels=num_channels,
+            num_filters=num_filters,
+            filter_size=filter_size,
+            padding=[1, 1],
+            use_cudnn=use_cudnn,
+            act='tanh')
+
+    def forward(self, inputs):
+        x = self._conv2d(inputs)
+        x = fluid.layers.reduce_max(x, dim=-1)
+        x = fluid.layers.reshape(x, shape=[self.batch_size, -1])
+        return x
+
+
+class CNN(fluid.dygraph.Layer):
+    def __init__(self, dict_dim, batch_size, seq_len):
+        super(CNN, self).__init__()
+        self.dict_dim = dict_dim
+        self.emb_dim = 128
+        self.hid_dim = 128
+        self.fc_hid_dim = 96
+        self.class_dim = 2
+        self.channels = 1
+        self.win_size = [3, self.hid_dim]
+        self.batch_size = batch_size
+        self.seq_len = seq_len
+        self.embedding = Embedding(
+            size=[self.dict_dim + 1, self.emb_dim],
+            dtype='float32',
+            is_sparse=False)
+        self._simple_conv_pool_1 = SimpleConvPool(
+            self.channels,
+            self.hid_dim,
+            self.win_size,
+            batch_size=self.batch_size)
+        self._fc1 = Linear(
+            input_dim=self.hid_dim * self.seq_len,
+            output_dim=self.fc_hid_dim,
+            act="softmax")
+        self._fc_prediction = Linear(
+            input_dim=self.fc_hid_dim, output_dim=self.class_dim, act="softmax")
+
+    @declarative
+    def forward(self, inputs, label=None):
+        emb = self.embedding(inputs)
+        o_np_mask = (
+            fluid.layers.reshape(inputs, [-1, 1]) != self.dict_dim).astype(
+                dtype='float32')
+        mask_emb = fluid.layers.expand(o_np_mask, [1, self.hid_dim])
+        emb = emb * mask_emb
+        emb = fluid.layers.reshape(
+            emb, shape=[-1, self.channels, self.seq_len, self.hid_dim])
+        conv_3 = self._simple_conv_pool_1(emb)
+        fc_1 = self._fc1(conv_3)
+        prediction = self._fc_prediction(fc_1)
+
+        cost = fluid.layers.cross_entropy(input=prediction, label=label)
+        avg_cost = fluid.layers.mean(x=cost)
+        acc = fluid.layers.accuracy(input=prediction, label=label)
+        return avg_cost, prediction, acc
+
+
+class BOW(fluid.dygraph.Layer):
+    def __init__(self, dict_dim, batch_size, seq_len):
+        super(BOW, self).__init__()
+        self.dict_dim = dict_dim
+        self.emb_dim = 128
+        self.hid_dim = 128
+        self.fc_hid_dim = 96
+        self.class_dim = 2
+        self.batch_size = batch_size
+        self.seq_len = seq_len
+        self.embedding = Embedding(
+            size=[self.dict_dim + 1, self.emb_dim],
+            dtype='float32',
+            is_sparse=False)
+        self._fc1 = Linear(
+            input_dim=self.hid_dim, output_dim=self.hid_dim, act="tanh")
+        self._fc2 = Linear(
+            input_dim=self.hid_dim, output_dim=self.fc_hid_dim, act="tanh")
+        self._fc_prediction = Linear(
+            input_dim=self.fc_hid_dim, output_dim=self.class_dim, act="softmax")
+
+    @declarative
+    def forward(self, inputs, label=None):
+        emb = self.embedding(inputs)
+        o_np_mask = (
+            fluid.layers.reshape(inputs, [-1, 1]) != self.dict_dim).astype(
+                dtype='float32')
+        mask_emb = fluid.layers.expand(o_np_mask, [1, self.hid_dim])
+        emb = emb * mask_emb
+        emb = fluid.layers.reshape(emb, shape=[-1, self.seq_len, self.hid_dim])
+        bow_1 = fluid.layers.reduce_sum(emb, dim=1)
+        bow_1 = fluid.layers.tanh(bow_1)
+        fc_1 = self._fc1(bow_1)
+        fc_2 = self._fc2(fc_1)
+        prediction = self._fc_prediction(fc_2)
+
+        cost = fluid.layers.cross_entropy(input=prediction, label=label)
+        avg_cost = fluid.layers.mean(x=cost)
+        acc = fluid.layers.accuracy(input=prediction, label=label)
+        return avg_cost, prediction, acc
+
+
+class GRU(fluid.dygraph.Layer):
+    def __init__(self, dict_dim, batch_size, seq_len):
+        super(GRU, self).__init__()
+        self.dict_dim = dict_dim
+        self.emb_dim = 128
+        self.hid_dim = 128
+        self.fc_hid_dim = 96
+        self.class_dim = 2
+        self.batch_size = batch_size
+        self.seq_len = seq_len
+        self.embedding = Embedding(
+            size=[self.dict_dim + 1, self.emb_dim],
+            dtype='float32',
+            param_attr=fluid.ParamAttr(learning_rate=30),
+            is_sparse=False)
+        h_0 = np.zeros((self.batch_size, self.hid_dim), dtype="float32")
+        h_0 = to_variable(h_0)
+        self._fc1 = Linear(input_dim=self.hid_dim, output_dim=self.hid_dim * 3)
+        self._fc2 = Linear(
+            input_dim=self.hid_dim, output_dim=self.fc_hid_dim, act="tanh")
+        self._fc_prediction = Linear(
+            input_dim=self.fc_hid_dim, output_dim=self.class_dim, act="softmax")
+        self._gru = DynamicGRU(size=self.hid_dim, h_0=h_0)
+
+    @declarative
+    def forward(self, inputs, label=None):
+        emb = self.embedding(inputs)
+        o_np_mask = (fluid.layers.reshape(inputs, [-1, 1]) != self.dict_dim
+                     ).astype('float32')
+        mask_emb = fluid.layers.expand(o_np_mask, [1, self.hid_dim])
+        emb = emb * mask_emb
+        emb = fluid.layers.reshape(
+            emb, shape=[self.batch_size, -1, self.hid_dim])
+        fc_1 = self._fc1(emb)
+        gru_hidden = self._gru(fc_1)
+        gru_hidden = fluid.layers.reduce_max(gru_hidden, dim=1)
+        tanh_1 = fluid.layers.tanh(gru_hidden)
+        fc_2 = self._fc2(tanh_1)
+        prediction = self._fc_prediction(fc_2)
+
+        cost = fluid.layers.cross_entropy(input=prediction, label=label)
+        avg_cost = fluid.layers.mean(x=cost)
+        acc = fluid.layers.accuracy(input=prediction, label=label)
+        return avg_cost, prediction, acc
+
+
+class BiGRU(fluid.dygraph.Layer):
+    def __init__(self, dict_dim, batch_size, seq_len):
+        super(BiGRU, self).__init__()
+        self.dict_dim = dict_dim
+        self.emb_dim = 128
+        self.hid_dim = 128
+        self.fc_hid_dim = 96
+        self.class_dim = 2
+        self.batch_size = batch_size
+        self.seq_len = seq_len
+        self.embedding = Embedding(
+            size=[self.dict_dim + 1, self.emb_dim],
+            dtype='float32',
+            param_attr=fluid.ParamAttr(learning_rate=30),
+            is_sparse=False)
+        h_0 = np.zeros((self.batch_size, self.hid_dim), dtype="float32")
+        h_0 = to_variable(h_0)
+        self._fc1 = Linear(input_dim=self.hid_dim, output_dim=self.hid_dim * 3)
+        self._fc2 = Linear(
+            input_dim=self.hid_dim * 2, output_dim=self.fc_hid_dim, act="tanh")
+        self._fc_prediction = Linear(
+            input_dim=self.fc_hid_dim, output_dim=self.class_dim, act="softmax")
+        self._gru_forward = DynamicGRU(
+            size=self.hid_dim, h_0=h_0, is_reverse=False)
+        self._gru_backward = DynamicGRU(
+            size=self.hid_dim, h_0=h_0, is_reverse=True)
+
+    @declarative
+    def forward(self, inputs, label=None):
+        emb = self.embedding(inputs)
+        o_np_mask = (fluid.layers.reshape(inputs, [-1, 1]) != self.dict_dim
+                     ).astype('float32')
+        mask_emb = fluid.layers.expand(o_np_mask, [1, self.hid_dim])
+        emb = emb * mask_emb
+        emb = fluid.layers.reshape(
+            emb, shape=[self.batch_size, -1, self.hid_dim])
+        fc_1 = self._fc1(emb)
+        gru_forward = self._gru_forward(fc_1)
+        gru_backward = self._gru_backward(fc_1)
+        gru_forward_tanh = fluid.layers.tanh(gru_forward)
+        gru_backward_tanh = fluid.layers.tanh(gru_backward)
+        encoded_vector = fluid.layers.concat(
+            input=[gru_forward_tanh, gru_backward_tanh], axis=2)
+        encoded_vector = fluid.layers.reduce_max(encoded_vector, dim=1)
+        fc_2 = self._fc2(encoded_vector)
+        prediction = self._fc_prediction(fc_2)
+        # TODO(Aurelius84): Uncomment the following codes when we support return variable-length vars.
+        # if label is not None:
+        cost = fluid.layers.cross_entropy(input=prediction, label=label)
+        avg_cost = fluid.layers.mean(x=cost)
+        acc = fluid.layers.accuracy(input=prediction, label=label)
+        return avg_cost, prediction, acc
+        # else:
+        #     return prediction
+
+
+def fake_data_reader(class_num, vocab_size, batch_size, padding_size):
+    local_random = np.random.RandomState(SEED)
+
+    def reader():
+        batch_data = []
+        while True:
+            label = local_random.randint(0, class_num)
+            seq_len = local_random.randint(padding_size // 2,
+                                           int(padding_size * 1.2))
+            word_ids = local_random.randint(0, vocab_size, [seq_len]).tolist()
+            word_ids = word_ids[:padding_size] + [vocab_size] * (padding_size -
+                                                                 seq_len)
+            batch_data.append((word_ids, [label], seq_len))
+            if len(batch_data) == batch_size:
+                yield batch_data
+                batch_data = []
+
+    return reader
+
+
+class Args(object):
+    epoch = 1
+    batch_size = 4
+    class_num = 2
+    lr = 0.01
+    vocab_size = 1000
+    padding_size = 50
+    log_step = 2
+    train_step = 10
+
+
+def train(args, to_static):
+    program_translator.enable(to_static)
+    place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() \
+        else fluid.CPUPlace()
+
+    with fluid.dygraph.guard(place):
+        np.random.seed(SEED)
+        fluid.default_startup_program().random_seed = SEED
+        fluid.default_main_program().random_seed = SEED
+
+        train_reader = fake_data_reader(args.class_num, args.vocab_size,
+                                        args.batch_size, args.padding_size)
+        train_loader = fluid.io.DataLoader.from_generator(capacity=24)
+        train_loader.set_sample_list_generator(train_reader)
+
+        if args.model_type == 'cnn_net':
+            model = CNN(args.vocab_size, args.batch_size, args.padding_size)
+        elif args.model_type == 'bow_net':
+            model = BOW(args.vocab_size, args.batch_size, args.padding_size)
+        elif args.model_type == 'gru_net':
+            model = GRU(args.vocab_size, args.batch_size, args.padding_size)
+        elif args.model_type == 'bigru_net':
+            model = BiGRU(args.vocab_size, args.batch_size, args.padding_size)
+        sgd_optimizer = fluid.optimizer.Adagrad(
+            learning_rate=args.lr, parameter_list=model.parameters())
+
+        loss_data = []
+        for eop in range(args.epoch):
+            time_begin = time.time()
+            for batch_id, data in enumerate(train_loader()):
+                word_ids, labels, seq_lens = data
+                doc = to_variable(word_ids.numpy().reshape(-1)).astype('int64')
+                label = labels.astype('int64')
+
+                model.train()
+                avg_cost, prediction, acc = model(doc, label)
+                loss_data.append(avg_cost.numpy()[0])
+
+                avg_cost.backward()
+                sgd_optimizer.minimize(avg_cost)
+                model.clear_gradients()
+
+                if batch_id % args.log_step == 0:
+                    time_end = time.time()
+                    used_time = time_end - time_begin
+                    print("step: %d, ave loss: %f, speed: %f steps/s" %
+                          (batch_id, avg_cost.numpy()[0],
+                           args.log_step / used_time))
+                    time_begin = time.time()
+
+                if batch_id == args.train_step:
+                    break
+                batch_id += 1
+    return loss_data
+
+
+class TestSentiment(unittest.TestCase):
+    def setUp(self):
+        self.args = Args()
+
+    def train_model(self, model_type='cnn_net'):
+        self.args.model_type = model_type
+        st_out = train(self.args, True)
+        dy_out = train(self.args, False)
+        self.assertTrue(
+            np.allclose(dy_out, st_out),
+            msg="dy_out:\n {}\n st_out:\n {}".format(dy_out, st_out))
+
+    def test_train(self):
+        model_types = ['cnn_net', 'bow_net', 'gru_net', 'bigru_net']
+        for model_type in model_types:
+            print('training %s ....' % model_type)
+            self.train_model(model_type)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_seq2seq.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_seq2seq.py
index c44b5375d263d3bf435743d1731637b511ab10ba..3c0c046191c9da482645ee93d15650e077d77b3b 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_seq2seq.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_seq2seq.py
@@ -21,7 +21,7 @@ import paddle.fluid as fluid
 from paddle.fluid.clip import GradientClipByGlobalNorm
 from paddle.fluid.dygraph.dygraph_to_static import ProgramTranslator
 
-from seq2seq_dygraph_model import BaseModel
+from seq2seq_dygraph_model import BaseModel, AttentionModel
 from seq2seq_utils import Seq2SeqModelHyperParams as args
 from seq2seq_utils import get_data_iter
 place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace(
@@ -43,19 +43,29 @@ def prepare_input(batch):
     return inputs, np.sum(tar_mask)
 
 
-def train():
+def train(attn_model=False):
     with fluid.dygraph.guard(place):
         fluid.default_startup_program().random_seed = 2020
         fluid.default_main_program().random_seed = 2020
 
-        model = BaseModel(
-            args.hidden_size,
-            args.src_vocab_size,
-            args.tar_vocab_size,
-            args.batch_size,
-            num_layers=args.num_layers,
-            init_scale=args.init_scale,
-            dropout=args.dropout)
+        if attn_model:
+            model = AttentionModel(
+                args.hidden_size,
+                args.src_vocab_size,
+                args.tar_vocab_size,
+                args.batch_size,
+                num_layers=args.num_layers,
+                init_scale=args.init_scale,
+                dropout=args.dropout)
+        else:
+            model = BaseModel(
+                args.hidden_size,
+                args.src_vocab_size,
+                args.tar_vocab_size,
+                args.batch_size,
+                num_layers=args.num_layers,
+                init_scale=args.init_scale,
+                dropout=args.dropout)
 
         gloabl_norm_clip = GradientClipByGlobalNorm(args.max_grad_norm)
         optimizer = fluid.optimizer.SGD(args.learning_rate,
@@ -88,84 +98,108 @@ def train():
                     "Batch:[%d]; Time: %.5f s; loss: %.5f; total_loss: %.5f; word num: %.5f; ppl: %.5f"
                     % (batch_id, batch_time, loss.numpy(), total_loss.numpy(),
                        word_count, np.exp(total_loss.numpy() / word_count)))
-            if batch_id + 1 >= STEP_NUM:
-                break
-        model_dir = os.path.join(args.model_path)
+
+            if attn_model:
+                # NOTE: Please see code of AttentionModel.
+                # Because diff exits if call while_loop in static graph, only run 4 batches to pass the test temporarily.
+                if batch_id + 1 >= 4:
+                    break
+            else:
+                if batch_id + 1 >= STEP_NUM:
+                    break
+
+        model_path = args.attn_model_path if attn_model else args.base_model_path
+        model_dir = os.path.join(model_path)
+
         if not os.path.exists(model_dir):
             os.makedirs(model_dir)
         fluid.save_dygraph(model.state_dict(), model_dir)
         return loss.numpy()
 
 
-def infer():
+def infer(attn_model=False):
     with fluid.dygraph.guard(place):
-        model = BaseModel(
-            args.hidden_size,
-            args.src_vocab_size,
-            args.tar_vocab_size,
-            args.batch_size,
-            beam_size=args.beam_size,
-            num_layers=args.num_layers,
-            init_scale=args.init_scale,
-            dropout=0.0,
-            mode='beam_search')
-        state_dict, _ = fluid.dygraph.load_dygraph(args.model_path)
+
+        if attn_model:
+            model = AttentionModel(
+                args.hidden_size,
+                args.src_vocab_size,
+                args.tar_vocab_size,
+                args.batch_size,
+                beam_size=args.beam_size,
+                num_layers=args.num_layers,
+                init_scale=args.init_scale,
+                dropout=0.0,
+                mode='beam_search')
+        else:
+            model = BaseModel(
+                args.hidden_size,
+                args.src_vocab_size,
+                args.tar_vocab_size,
+                args.batch_size,
+                beam_size=args.beam_size,
+                num_layers=args.num_layers,
+                init_scale=args.init_scale,
+                dropout=0.0,
+                mode='beam_search')
+
+        model_path = args.attn_model_path if attn_model else args.base_model_path
+        state_dict, _ = fluid.dygraph.load_dygraph(model_path)
         model.set_dict(state_dict)
         model.eval()
         train_data_iter = get_data_iter(args.batch_size, mode='infer')
-        batch_times = []
         for batch_id, batch in enumerate(train_data_iter):
-            batch_start_time = time.time()
             input_data_feed, word_num = prepare_input(batch)
             input_data_feed = [
                 fluid.dygraph.to_variable(np_inp) for np_inp in input_data_feed
             ]
             outputs = model.beam_search(input_data_feed)
-            batch_end_time = time.time()
-            batch_time = batch_end_time - batch_start_time
-            batch_times.append(batch_time)
-            if batch_id > STEP_NUM:
-                break
+            break
 
         return outputs.numpy()
 
 
 class TestSeq2seq(unittest.TestCase):
-    def run_dygraph(self, mode="train"):
+    def run_dygraph(self, mode="train", attn_model=False):
         program_translator.enable(False)
         if mode == "train":
-            return train()
+            return train(attn_model)
         else:
-            return infer()
+            return infer(attn_model)
 
-    def run_static(self, mode="train"):
+    def run_static(self, mode="train", attn_model=False):
         program_translator.enable(True)
         if mode == "train":
-            return train()
+            return train(attn_model)
         else:
-            return infer()
+            return infer(attn_model)
 
-    def _test_train(self):
-        dygraph_loss = self.run_dygraph(mode="train")
-        static_loss = self.run_static(mode="train")
+    def _test_train(self, attn_model=False):
+        dygraph_loss = self.run_dygraph(mode="train", attn_model=attn_model)
+        static_loss = self.run_static(mode="train", attn_model=attn_model)
         result = np.allclose(dygraph_loss, static_loss)
         self.assertTrue(
             result,
             msg="\ndygraph_loss = {} \nstatic_loss = {}".format(dygraph_loss,
                                                                 static_loss))
 
-    def _test_predict(self):
-        pred_dygraph = self.run_dygraph(mode="test")
-        pred_static = self.run_static(mode="test")
+    def _test_predict(self, attn_model=False):
+        pred_dygraph = self.run_dygraph(mode="test", attn_model=attn_model)
+        pred_static = self.run_static(mode="test", attn_model=attn_model)
         result = np.allclose(pred_static, pred_dygraph)
         self.assertTrue(
             result,
             msg="\npred_dygraph = {} \npred_static = {}".format(pred_dygraph,
                                                                 pred_static))
 
-    def test_check_result(self):
-        self._test_train()
-        self._test_predict()
+    def test_base_model(self):
+        self._test_train(attn_model=False)
+        self._test_predict(attn_model=False)
+
+    def test_attn_model(self):
+        self._test_train(attn_model=True)
+        # TODO(liym27): add predict
+        # self._test_predict(attn_model=True)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_simnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_simnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..373e942f6f342a31954d94579508256d42a18ac7
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_simnet.py
@@ -0,0 +1,173 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+import random
+import unittest
+
+from paddle.fluid.dygraph import ProgramTranslator
+from simnet_dygraph_model import BOW, HingeLoss
+
+SEED = 102
+random.seed(SEED)
+
+
+def create_conf_dict():
+    conf_dict = {}
+    conf_dict["task_mode"] = "train"
+    conf_dict["net"] = {"emb_dim": 128, "bow_dim": 128, "hidden_dim": 128}
+    conf_dict["loss"] = {"margin": 0.1}
+    return conf_dict
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--batch_size",
+        type=int,
+        default=32,
+        help="Total examples' number in batch for training.")
+    parser.add_argument(
+        "--seq_len", type=int, default=32, help="The length of each sentence.")
+    parser.add_argument(
+        "--epoch", type=int, default=1, help="The number of training epoch.")
+    parser.add_argument(
+        "--fake_sample_size",
+        type=int,
+        default=128,
+        help="The number of samples of fake data.")
+    args = parser.parse_args([])
+    return args
+
+
+args = parse_args()
+
+
+def fake_vocabulary():
+    vocab = {}
+    vocab["<unk>"] = 0
+    for i in range(26):
+        c = chr(ord('a') + i)
+        vocab[c] = i + 1
+    return vocab
+
+
+vocab = fake_vocabulary()
+
+
+class FakeReaderProcessor(object):
+    def __init__(self, args, vocab):
+        self.vocab = vocab
+        self.seq_len = args.seq_len
+        self.sample_size = args.fake_sample_size
+        self.data_samples = []
+        for i in range(self.sample_size):
+            query = [random.randint(0, 26) for i in range(self.seq_len)]
+            pos_title = query[:]
+            neg_title = [26 - q for q in query]
+            self.data_samples.append(
+                np.array([query, pos_title, neg_title]).astype(np.int64))
+
+    def get_reader(self, mode, epoch=0):
+        def reader_with_pairwise():
+            if mode == "train":
+                for i in range(self.sample_size):
+                    yield self.data_samples[i]
+
+        return reader_with_pairwise
+
+
+simnet_process = FakeReaderProcessor(args, vocab)
+
+
+def train(conf_dict, to_static):
+    """
+    train process
+    """
+    program_translator = ProgramTranslator()
+    program_translator.enable(to_static)
+
+    # Get device
+    if fluid.is_compiled_with_cuda():
+        place = fluid.CUDAPlace(0)
+    else:
+        place = fluid.CPUPlace()
+
+    with fluid.dygraph.guard(place):
+        fluid.default_startup_program().random_seed = SEED
+        fluid.default_main_program().random_seed = SEED
+
+        conf_dict['dict_size'] = len(vocab)
+        conf_dict['seq_len'] = args.seq_len
+
+        net = BOW(conf_dict)
+        loss = HingeLoss(conf_dict)
+        optimizer = fluid.optimizer.AdamOptimizer(
+            learning_rate=0.001,
+            beta1=0.9,
+            beta2=0.999,
+            epsilon=1e-08,
+            parameter_list=net.parameters())
+
+        metric = fluid.metrics.Auc(name="auc")
+
+        global_step = 0
+        losses = []
+
+        train_loader = fluid.io.DataLoader.from_generator(
+            capacity=16,
+            return_list=True,
+            iterable=True,
+            use_double_buffer=True)
+        get_train_examples = simnet_process.get_reader(
+            "train", epoch=args.epoch)
+        train_loader.set_sample_list_generator(
+            paddle.batch(
+                get_train_examples, batch_size=args.batch_size), place)
+
+        for left, pos_right, neg_right in train_loader():
+            left = fluid.layers.reshape(left, shape=[-1, 1])
+            pos_right = fluid.layers.reshape(pos_right, shape=[-1, 1])
+            neg_right = fluid.layers.reshape(neg_right, shape=[-1, 1])
+            net.train()
+            global_step += 1
+            left_feat, pos_score = net(left, pos_right)
+            pred = pos_score
+            _, neg_score = net(left, neg_right)
+            avg_cost = loss.compute(pos_score, neg_score)
+            losses.append(np.mean(avg_cost.numpy()))
+            avg_cost.backward()
+            optimizer.minimize(avg_cost)
+            net.clear_gradients()
+    return losses
+
+
+class TestSimnet(unittest.TestCase):
+    def test_dygraph_static_same_loss(self):
+        if fluid.is_compiled_with_cuda():
+            fluid.set_flags({"FLAGS_cudnn_deterministic": True})
+        conf_dict = create_conf_dict()
+        dygraph_loss = train(conf_dict, to_static=False)
+        static_loss = train(conf_dict, to_static=True)
+
+        self.assertEqual(len(dygraph_loss), len(static_loss))
+        for i in range(len(dygraph_loss)):
+            self.assertAlmostEqual(dygraph_loss[i], static_loss[i])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/yolov3.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/yolov3.py
index 3f55af964792252839a2712f7915ccd0cc0b92a0..b883f1820c1b19a5ee8513c98dbf3a9196369d3c 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/yolov3.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/yolov3.py
@@ -314,21 +314,19 @@ class YOLOv3(fluid.dygraph.Layer):
                         scores, perm=[0, 2, 1]))
             self.downsample //= 2
 
-        # TODO(liym27): Uncomment code after "return" statement can be transformed correctly.
-        # if not self.is_train:
-        #     # get pred
-        #     yolo_boxes = fluid.layers.concat(self.boxes, axis=1)
-        #     yolo_scores = fluid.layers.concat(self.scores, axis=2)
-        #
-        #     pred = fluid.layers.multiclass_nms(
-        #         bboxes=yolo_boxes,
-        #         scores=yolo_scores,
-        #         score_threshold=cfg.valid_thresh,
-        #         nms_top_k=cfg.nms_topk,
-        #         keep_top_k=cfg.nms_posk,
-        #         nms_threshold=cfg.nms_thresh,
-        #         background_label=-1)
-        #     return pred
-        # else:
-        #     return sum(self.losses)
-        return sum(self.losses)
+        if not self.is_train:
+            # get pred
+            yolo_boxes = fluid.layers.concat(self.boxes, axis=1)
+            yolo_scores = fluid.layers.concat(self.scores, axis=2)
+
+            pred = fluid.layers.multiclass_nms(
+                bboxes=yolo_boxes,
+                scores=yolo_scores,
+                score_threshold=cfg.valid_thresh,
+                nms_top_k=cfg.nms_topk,
+                keep_top_k=cfg.nms_posk,
+                nms_threshold=cfg.nms_thresh,
+                background_label=-1)
+            return pred
+        else:
+            return sum(self.losses)
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py b/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py
index 0b8ea1f9392e9264c2abf1827e39582be92988cb..d3a53bbbff98176f977692bbb5dd4002d4ead158 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py
@@ -78,7 +78,10 @@ class InferencePassTest(unittest.TestCase):
             shape = tensor_shapes[name]
             shape[0] = 1
             tensor = predictor.get_input_tensor(name)
-            tensor.copy_from_cpu(list(self.feeds.values())[i])
+            feed_data = list(self.feeds.values())[i]
+            tensor.copy_from_cpu(np.array(feed_data))
+            if type(feed_data) == fluid.LoDTensor:
+                tensor.set_lod(feed_data.lod())
 
         predictor.zero_copy_run()
 
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add2_act_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add2_act_fuse_pass.py
index 16979488a614e9e7851cba967f6ff1cd398445b1..d6dbd397b90368d5cac27c3c5d92b7a7dce9dcf5 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add2_act_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add2_act_fuse_pass.py
@@ -44,7 +44,8 @@ class ConvElementwiseAdd2ActFusePassTest(InferencePassTest):
 
     def test_check_output(self):
         if core.is_compiled_with_cuda():
-            self.check_output_with_option([True])
+            use_gpu = True
+            self.check_output_with_option(use_gpu)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_act_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_act_fuse_pass.py
index f4014f7cd42e41f0bf59fc6a5fab7f3bee0762d3..2e9035420d7ee45dd69de8d3cd8acc9bb1590c72 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_act_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_act_fuse_pass.py
@@ -46,7 +46,8 @@ class ConvElementwiseAddActFusePassTest(InferencePassTest):
 
     def test_check_output(self):
         if core.is_compiled_with_cuda():
-            self.check_output_with_option([True])
+            use_gpu = True
+            self.check_output_with_option(use_gpu)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_fuse_pass.py
index cea007d56e41ba6d1596a0b3052fa123c7373949..7c4e0d6e76ec45c4b75ba91522306a4dd0abc7c5 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_fuse_pass.py
@@ -42,7 +42,8 @@ class ConvElementwiseAddFusePassTest(InferencePassTest):
 
     def test_check_output(self):
         if core.is_compiled_with_cuda():
-            self.check_output_with_option([True])
+            use_gpu = True
+            self.check_output_with_option(use_gpu)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py
index 6444264f80fb52dd655a26cca627b26ea13a4d76..dfcd1758db2b22b211f84be528739aa71132ab8a 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py
@@ -42,7 +42,8 @@ class TransposeFlattenConcatFusePassTest(InferencePassTest):
     def test_check_output(self):
         # There is no cpu pass for transpose_flatten_concat_fuse
         if core.is_compiled_with_cuda():
-            self.check_output_with_option([True])
+            use_gpu = True
+            self.check_output_with_option(use_gpu)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_transpose_flatten_concat_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_transpose_flatten_concat_fuse_pass.py
index 41f02b0427d68216f7363236b77ddf3229e92143..4661333ffeca10b7026c68a47b44fc3be83ff093 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_transpose_flatten_concat_fuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_transpose_flatten_concat_fuse_pass.py
@@ -48,7 +48,8 @@ class TransposeFlattenConcatFusePassTRTTest(InferencePassTest):
     def test_check_output(self):
         # There is no cpu pass for transpose_flatten_concat_fuse
         if core.is_compiled_with_cuda():
-            self.check_output_with_option([True])
+            use_gpu = True
+            self.check_output_with_option(use_gpu)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_lrn_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_lrn_mkldnn_op.py
index 5c9fccd4eca5f523bcfd482936fc0e1fb42cce3a..ba7c8abc56daa91dda364713ffe7aa332610921c 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_lrn_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_lrn_mkldnn_op.py
@@ -26,8 +26,10 @@ class TestLRNMKLDNNOp(TestLRNOp):
         return attrs
 
     def test_check_output(self):
+        # We cannot validate MidOut as LRN REF has diffrent meaning in it
         # TODO(wangzhongpu): support mkldnn op in dygraph mode
-        self.check_output(atol=0.002, check_dygraph=False)
+        self.check_output(
+            atol=0.002, no_check_set=['MidOut'], check_dygraph=False)
 
     def test_check_grad_normal(self):
         # TODO(wangzhongpu): support mkldnn op in dygraph mode
diff --git a/python/paddle/fluid/tests/unittests/seresnext_net.py b/python/paddle/fluid/tests/unittests/seresnext_net.py
index 252ffe2c22ad2e9d0b48e89d9dc0a7d695e1c0a4..17e0cd0d5b18652f828af9936b07cb4122f87b97 100644
--- a/python/paddle/fluid/tests/unittests/seresnext_net.py
+++ b/python/paddle/fluid/tests/unittests/seresnext_net.py
@@ -22,6 +22,7 @@ from simple_nets import init_data
 import math
 import os
 os.environ['CPU_NUM'] = str(4)
+os.environ['FLAGS_cudnn_deterministic'] = str(1)
 
 # FIXME(zcd): If the neural net has dropout_op, the output of ParallelExecutor
 # and Executor is different. Because, for ParallelExecutor, the dropout_op of
@@ -35,14 +36,10 @@ remove_dropout = False
 # and Executor is different.
 remove_bn = False
 
-# FIXME(huihuangzheng): Temporarily disable cudnn of conv2d in unit test because
-# it will cause random test failure. We have to re-enable it after someone fixs
-# cudnn_conv
 remove_cudnn_conv = False
 
 remove_dropout = True
 remove_bn = True
-remove_cudnn_conv = True
 
 
 def squeeze_excitation(input, num_channels, reduction_ratio):
diff --git a/python/paddle/fluid/tests/unittests/simple_nets.py b/python/paddle/fluid/tests/unittests/simple_nets.py
index 5f48cc8e6a7157ecf2a2481ca7479f37f6014004..7f22df67d1b94282bb8f253151071c3d058d255c 100644
--- a/python/paddle/fluid/tests/unittests/simple_nets.py
+++ b/python/paddle/fluid/tests/unittests/simple_nets.py
@@ -18,10 +18,10 @@ import numpy as np
 
 def simple_fc_net_with_inputs(img, label, class_num=10):
     hidden = img
-    for _ in range(4):
+    for _ in range(2):
         hidden = fluid.layers.fc(
             hidden,
-            size=200,
+            size=100,
             act='relu',
             bias_attr=fluid.ParamAttr(
                 initializer=fluid.initializer.Constant(value=1.0)))
diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py
index 8dbdd2921b9a0683b60ed1b1a133381db90c73c9..5b9e7bfe62b7f4804c49d43c449d7e3e366f4942 100644
--- a/python/paddle/fluid/tests/unittests/test_activation_op.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_op.py
@@ -799,22 +799,15 @@ class TestLog1p(TestActivation):
                 shape=[11, 17],
                 append_batch_size=False,
                 dtype="float64")
-            res_log1p = fluid.layers.data(
-                name="res_log1p",
-                shape=[11, 17],
-                append_batch_size=False,
-                dtype="float64")
 
             out1 = paddle.log1p(data_x)
-            out2 = paddle.log1p(data_x, out=res_log1p)
             exe = fluid.Executor(place=fluid.CPUPlace())
             exe.run(fluid.default_startup_program())
-            res1, res_in = exe.run(fluid.default_main_program(),
-                                   feed={"data_x": input_x},
-                                   fetch_list=[out1, res_log1p])
+            res1 = exe.run(fluid.default_main_program(),
+                           feed={"data_x": input_x},
+                           fetch_list=[out1])
         expected_res = np.log1p(input_x)
-        np.testing.assert_allclose(res1, expected_res)
-        np.testing.assert_allclose(res_in, expected_res)
+        self.assertTrue(np.allclose(res1, expected_res))
 
         # dygraph
         with fluid.dygraph.guard():
@@ -823,7 +816,7 @@ class TestLog1p(TestActivation):
             z = paddle.log1p(data_x)
             np_z = z.numpy()
             z_expected = np.array(np.log1p(np_x))
-        np.testing.assert_allclose(np_z, z_expected)
+        self.assertTrue(np.allclose(np_z, z_expected))
 
 
 class TestSquare(TestActivation):
diff --git a/python/paddle/fluid/tests/unittests/test_arange.py b/python/paddle/fluid/tests/unittests/test_arange.py
index d715744b02a010e442dfe9fb4f2409d481c8f8d9..1736e49f3b67b380b88e53ac9876f3ccde53104c 100644
--- a/python/paddle/fluid/tests/unittests/test_arange.py
+++ b/python/paddle/fluid/tests/unittests/test_arange.py
@@ -15,7 +15,8 @@
 from __future__ import print_function
 
 import paddle
-import paddle.fluid as fluid
+from paddle.fluid import core
+from paddle import program_guard, Program
 import unittest
 import numpy as np
 from op_test import OpTest
@@ -44,47 +45,67 @@ class TestArangeOp(OpTest):
         self.check_output()
 
 
-class TestFloatArangeOpCase0(TestArangeOp):
+class TestFloatArangeOp(TestArangeOp):
     def init_config(self):
         self.dtype = np.float32
         self.case = (0, 5, 1)
 
 
-class TestInt32ArangeOpCase0(TestArangeOp):
+class TestInt32ArangeOp(TestArangeOp):
     def init_config(self):
         self.dtype = np.int32
         self.case = (0, 5, 2)
 
 
-class TestInt32ArangeOpCase1(TestArangeOp):
+class TestFloat64ArangeOp(TestArangeOp):
     def init_config(self):
-        self.dtype = np.int32
+        self.dtype = np.float64
         self.case = (10, 1, -2)
 
 
-class TestInt32ArangeOpCase2(TestArangeOp):
+class TestInt64ArangeOp(TestArangeOp):
     def init_config(self):
-        self.dtype = np.int32
+        self.dtype = np.int64
         self.case = (-1, -10, -2)
 
 
+class TestArangeOpError(unittest.TestCase):
+    def test_errors(self):
+        with program_guard(Program(), Program()):
+            self.assertRaises(TypeError, paddle.arange, 10, dtype='int8')
+
+
 class TestArangeAPI(unittest.TestCase):
     def test_out(self):
-        with fluid.program_guard(fluid.Program()):
-            data = paddle.arange(0, 5, 1)
-            place = fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            result, = exe.run(fetch_list=[data])
-            expected_data = np.arange(0, 5, 1).astype(np.float32)
-        self.assertEqual((result == expected_data).all(), True)
-
-        with fluid.program_guard(fluid.Program()):
-            data = paddle.arange(0.0, 5.0, 1.0, 'int32')
-            place = fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            result, = exe.run(fetch_list=[data])
-            expected_data = np.arange(0, 5, 1).astype(np.int32)
-        self.assertEqual((result == expected_data).all(), True)
+        with program_guard(Program(), Program()):
+            x1 = paddle.arange(0, 5, 1, 'float32')
+
+            place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
+            ) else paddle.CPUPlace()
+            exe = paddle.Executor(place)
+            out = exe.run(fetch_list=[x1])
+
+        expected_data = np.arange(0, 5, 1).astype(np.float32)
+        self.assertEqual((out == expected_data).all(), True)
+
+
+class TestArangeImperative(unittest.TestCase):
+    def test_out(self):
+        place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
+        ) else paddle.CPUPlace()
+        with paddle.imperative.guard(place):
+            x1 = paddle.arange(0, 5, 1)
+            x2 = paddle.tensor.arange(5)
+            x3 = paddle.tensor.creation.arange(5)
+
+            start = paddle.imperative.to_variable(np.array([0], 'float32'))
+            end = paddle.imperative.to_variable(np.array([5], 'float32'))
+            step = paddle.imperative.to_variable(np.array([1], 'float32'))
+            x4 = paddle.arange(start, end, step, 'int64')
+
+        expected_data = np.arange(0, 5, 1).astype(np.int64)
+        for i in [x1, x2, x3, x4]:
+            self.assertEqual((i.numpy() == expected_data).all(), True)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_base_layer.py b/python/paddle/fluid/tests/unittests/test_base_layer.py
index 707b0b00b4acb6ba3823e19aee88c9f0908abce2..bc666c0de5be06be7529bced39071303430c8ace 100644
--- a/python/paddle/fluid/tests/unittests/test_base_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_base_layer.py
@@ -16,6 +16,8 @@ import unittest
 import numpy as np
 
 import paddle.fluid as fluid
+from paddle.fluid.dygraph import to_variable
+from paddle.fluid.framework import ParamBase
 
 
 class L1(fluid.Layer):
@@ -85,5 +87,181 @@ class TestBaseLayer(unittest.TestCase):
             self.assertTrue(np.allclose(ret.numpy(), 0.8 * np.ones([2, 2])))
 
 
+class BufferLayer(fluid.Layer):
+    def __init__(self):
+        super(BufferLayer, self).__init__()
+        buffer_var = to_variable(np.zeros([2, 4]).astype('int32'))
+        self.register_buffer("layer_buffer", buffer_var)
+
+    def forward(self):
+        pass
+
+
+class BufferNet(fluid.Layer):
+    def __init__(self):
+        super(BufferNet, self).__init__()
+        self.buffer_layer = BufferLayer()
+        self.w1 = self.create_parameter(
+            shape=[2, 2], dtype='float32', is_bias=False)
+        buffer_var = to_variable(np.ones([2, 4]).astype('int32'))
+        self.register_buffer("net_buffer", buffer_var)
+
+        self.new_buffer = to_variable(np.ones([4, 2]).astype('int32'))
+
+    def forward(self):
+        pass
+
+
+class TestBuffer(unittest.TestCase):
+    def test_buffers_and_named_buffers(self):
+        def names(named_buffers):
+            return [name for name, _ in named_buffers]
+
+        with fluid.dygraph.guard():
+            layer = BufferLayer()
+            net = BufferNet()
+
+            self.assertEqual(len(layer.buffers()), 1)
+            self.assertEqual(names(layer.named_buffers()), ['layer_buffer'])
+
+            self.assertEqual(len(net.buffers()), 3)
+            self.assertEqual(
+                names(net.named_buffers()),
+                ['net_buffer', 'new_buffer', 'buffer_layer.layer_buffer'])
+
+            self.assertEqual(len(net.buffers(include_sublayers=False)), 2)
+            self.assertEqual(
+                names(net.named_buffers(include_sublayers=False)),
+                ['net_buffer', 'new_buffer'])
+
+    def test_register_buffer_with_error(self):
+        with fluid.dygraph.guard():
+            net = fluid.Layer()
+            var = to_variable(np.zeros([1]))
+
+            with self.assertRaisesRegexp(TypeError,
+                                         "name of buffer should be a string"):
+                net.register_buffer(12, var)
+
+            with self.assertRaisesRegexp(TypeError,
+                                         "buffer should be a core.VarBase"):
+                net.register_buffer("buffer_name", ParamBase([2, 2], 'float32'))
+
+            with self.assertRaisesRegexp(KeyError,
+                                         "name of buffer can not contain"):
+                net.register_buffer("buffer.name", var)
+
+            with self.assertRaisesRegexp(KeyError,
+                                         "name of buffer can not be empty"):
+                net.register_buffer("", var)
+
+            net.attr_name = 10
+            with self.assertRaisesRegexp(KeyError, "already exists"):
+                net.register_buffer("attr_name", var)
+
+            del net.attr_name
+            net.attr_name = ParamBase([2, 2], 'float32')
+            with self.assertRaisesRegexp(KeyError, "already exists"):
+                net.register_buffer("attr_name", var)
+
+    def test_register_buffer_same_name(self):
+        with fluid.dygraph.guard():
+            net = fluid.Layer()
+            var1 = to_variable(np.zeros([1]))
+            var2 = to_variable(np.zeros([2]))
+            var3 = to_variable(np.zeros([3]))
+
+            net.register_buffer("buffer_name", var1)
+            self.assert_var_base_equal(net.buffer_name, var1)
+            net.register_buffer("buffer_name", var2)
+            self.assert_var_base_equal(net.buffer_name, var2)
+            net.register_buffer("buffer_name", var3)
+            self.assert_var_base_equal(net.buffer_name, var3)
+
+    def test_buffer_not_persistable(self):
+        with fluid.dygraph.guard():
+            net = fluid.Layer()
+            var1 = to_variable(np.zeros([1]))
+
+            net.register_buffer("buffer_name", var1, persistable=False)
+            self.assertEqual(len(net.buffers()), 1)
+            self.assertEqual(len(net.state_dict()), 0)
+
+    def test_buffer_not_persistable_del(self):
+        with fluid.dygraph.guard():
+            net = fluid.Layer()
+            var1 = to_variable(np.zeros([1]))
+            net.register_buffer("buffer_name", var1, persistable=False)
+            del net.buffer_name
+            self.assertEqual(len(net.buffers()), 0)
+
+    def test_buffer_not_persistable_overwrite(self):
+        with fluid.dygraph.guard():
+            net = fluid.Layer()
+            var1 = to_variable(np.zeros([1]))
+            var2 = to_variable(np.zeros([2]))
+            net.register_buffer("buffer_name", var1, persistable=False)
+            net.register_buffer("buffer_name", var2)
+
+            # Allow to overwrite a non-persistable buffer with a persistable var.
+            self.assertEqual(len(net.buffers()), 1)
+            self.assertEqual(len(net.state_dict()), 1)
+
+            net.register_buffer("buffer_name", var1, persistable=False)
+            self.assertEqual(len(net.buffers()), 1)
+            self.assertEqual(len(net.state_dict()), 0)
+
+    def test_buffer_not_persistable_assign(self):
+        with fluid.dygraph.guard():
+            net = fluid.Layer()
+            var1 = to_variable(np.zeros([1]))
+            net.register_buffer("buffer_name", var1, persistable=False)
+
+            # Assigning Nones will remove the buffer, but allow to re-assign
+            # to remark it as buffer.
+            net.buffer_name = None
+            self.assertEqual(len(net.buffers()), 0)
+            self.assertEqual(len(net.state_dict()), 0)
+
+            net.buffer_name = var1
+            self.assertEqual(len(net.buffers()), 1)
+            self.assertEqual(len(net.state_dict()), 0)
+
+            # Re-assign a ParamBase will remove the buffer.
+            net.buffer_name = ParamBase([2, 2], 'float32')
+            self.assertEqual(len(net.buffers()), 0)
+            self.assertEqual(len(net.state_dict()), 1)
+
+    def test_buffer_not_persistable_load(self):
+        with fluid.dygraph.guard():
+            net = fluid.Layer()
+            var1 = to_variable(np.zeros([1]))
+            net.register_buffer("buffer_name", var1, persistable=False)
+            net.load_dict({})
+
+    def test_buffer_state_dict(self):
+        with fluid.dygraph.guard():
+            net = fluid.Layer()
+            var1 = to_variable(np.zeros([2, 3]))
+            var2 = to_variable(np.zeros([3, 2]))
+            net.register_buffer("buffer_var1", var1)
+            net.register_buffer("buffer_var2", var2, persistable=False)
+
+            self.assertEqual(len(net.state_dict()), 1)
+            self.assertEqual([name for name, _ in net.state_dict().items()],
+                             ["buffer_var1"])
+
+            # load state_dict
+            net_load = fluid.Layer()
+            var = to_variable(np.ones([2, 3]))
+            net_load.register_buffer("buffer_var1", var)
+            net_load.load_dict(net.state_dict())
+
+            self.assert_var_base_equal(net_load.buffer_var1, var1)
+
+    def assert_var_base_equal(self, var1, var2):
+        self.assertTrue(np.array_equal(var1.numpy(), var2.numpy()))
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_bilateral_slice_op.py b/python/paddle/fluid/tests/unittests/test_bilateral_slice_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..51e447dba725c03ad7eea5c94c2be70cc8ea9a7a
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_bilateral_slice_op.py
@@ -0,0 +1,194 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle
+import math
+
+
+class Gsz:
+    def __init__(self, h, w, gd, gh, gw, input_chans):
+        self.h = h
+        self.w = w
+        self.gd = gd
+        self.gh = gh
+        self.gw = gw
+        self.input_chans = input_chans
+
+
+def diff_abs(x):
+    eps = 1e-8
+    return math.sqrt(x * x + eps)
+
+
+def d_diff_abs(x):
+    eps = 1e-8
+    return x / math.sqrt(x * x + eps)
+
+
+def weight_z(x):
+    abx = diff_abs(x)
+    return max(1.0 - abx, 0.0)
+
+
+def d_weight_z(x):
+    abx = diff_abs(x)
+    if abx > 1.0:
+        return 0.0
+    else:
+        return d_diff_abs(x)
+
+
+def naive_bilateral_slice_forward(output, grid, guide, input, gsz, has_offset,
+                                  total_count, output_chans):
+    h = gsz.h
+    w = gsz.w
+    gd = gsz.gd
+    gh = gsz.gh
+    gw = gsz.gw
+    input_chans = gsz.input_chans
+    coeff_stride = input_chans
+    grid_chans = input_chans * output_chans
+
+    if has_offset:
+        grid_chans += output_chans
+        coeff_stride += 1
+
+    for idx in range(total_count):
+        x = idx % w
+        y = idx // w % h
+        out_c = (idx // (h * w)) % output_chans
+        b = (idx // (output_chans * w * h))
+
+        gx = (x + 0.5) * gw / (1.0 * w)
+        gy = (y + 0.5) * gh / (1.0 * h)
+        gz = guide[int(b), int(y), int(x)] * gd
+
+        fx = int(np.floor(gx - 0.5))
+        fy = int(np.floor(gy - 0.5))
+        fz = int(np.floor(gz - 0.5))
+
+        value = 0.0
+        for in_c in range(0, coeff_stride):
+            coeff_sample = 0.0
+
+            for xx in range(fx, fx + 2):
+                x_ = max(min(xx, gw - 1), 0)
+                wx = max(1.0 - abs(xx + 0.5 - gx), 0.0)
+
+                for yy in range(fy, fy + 2):
+                    y_ = max(min(yy, gh - 1), 0)
+                    wy = max(1.0 - abs(yy + 0.5 - gy), 0.0)
+
+                    for zz in range(fz, fz + 2):
+                        z_ = max(min(zz, gd - 1), 0)
+                        wz = weight_z(zz + 0.5 - gz)
+                        c_ = coeff_stride * out_c + in_c
+
+                        coeff_sample += grid[int(b), int(c_), int(z_), int(y_),
+                                             int(x_)] * wx * wy * wz
+
+            if in_c < input_chans:
+                value += coeff_sample * input[int(b), int(in_c), int(y), int(x)]
+            else:
+                value += coeff_sample
+        output[int(b), int(out_c), int(y), int(x)] = value
+
+
+def naive_bilateral_slice(x, guide, grid, has_offset):
+    bs = x.shape[0]
+    h = x.shape[2]
+    w = x.shape[3]
+    input_chans = x.shape[1]
+
+    coeffs_chans = grid.shape[1]
+    if has_offset:
+        output_chans = coeffs_chans // (input_chans + 1)
+    else:
+        output_chans = coeffs_chans // input_chans
+
+    output = np.zeros([bs, int(output_chans), h, w]).astype(x.dtype)
+
+    gd = grid.shape[2]
+    gh = grid.shape[3]
+    gw = grid.shape[4]
+
+    gsz = Gsz(h, w, gd, gh, gw, input_chans)
+    total_count = bs * h * w * output.shape[1]
+    naive_bilateral_slice_forward(output, grid, guide, x, gsz, has_offset,
+                                  total_count, output.shape[1])
+    return output
+
+
+@unittest.skipIf(not paddle.fluid.is_compiled_with_cuda(),
+                 'CPU testing is not supported')
+class TestBilateralSliceOp(OpTest):
+    def setUp(self):
+        self.initTestCase()
+        self.op_type = 'bilateral_slice'
+        batch_size = 3
+        h = 50
+        w = 30
+        c = 1
+        gh = 5
+        gw = 3
+        gd = 2
+        gc = 2
+        x = np.random.rand(batch_size, c, h, w).astype(self.data_type)
+        guide = np.random.rand(batch_size, h, w).astype(self.data_type)
+        grid = np.random.rand(batch_size, gc, gd, gh, gw).astype(self.data_type)
+        output_np = naive_bilateral_slice(x, guide, grid, self.has_offset)
+
+        self.inputs = {'X': x, 'Grid': grid, 'Guide': guide}
+        self.attrs = {'has_offset': self.has_offset, }
+        self.outputs = {'Out': output_np}
+
+    def test_check_output(self):
+        place = paddle.fluid.CUDAPlace(0)
+        self.check_output_with_place(place, atol=1e-5)
+        self.check_output
+
+    def test_check_grad(self):
+        place = paddle.fluid.CUDAPlace(0)
+        self.check_grad_with_place(place, ['X'], 'Out')
+
+    def initTestCase(self):
+        self.has_offset = False
+        self.data_type = 'float64'
+
+
+@unittest.skipIf(not paddle.fluid.is_compiled_with_cuda(),
+                 'CPU testing is not supported')
+class TestBilateralSliceOp1(TestBilateralSliceOp):
+    def initTestCase(self):
+        self.has_offset = True
+        self.data_type = 'float32'
+
+
+class TestBilateralSliceApi(TestBilateralSliceOp):
+    def test_api(self):
+        x = paddle.fluid.data(
+            name='x', shape=[None, 3, 25, 15], dtype='float32')
+        guide = paddle.fluid.data(
+            name='guide', shape=[None, 25, 15], dtype='float32')
+        grid = paddle.fluid.data(
+            name='grid', shape=[None, 12, 8, 5, 3], dtype='float32')
+        paddle.fluid.contrib.layers.bilateral_slice(x, guide, grid,
+                                                    self.has_offset)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_boxps.py b/python/paddle/fluid/tests/unittests/test_boxps.py
index a61cc9747e92ec88980b014bfa20820e89e7c730..0eba0e8f26ef8b6778a0b798c748aca5413935c6 100644
--- a/python/paddle/fluid/tests/unittests/test_boxps.py
+++ b/python/paddle/fluid/tests/unittests/test_boxps.py
@@ -87,117 +87,5 @@ class TestRunCmd(unittest.TestCase):
         self.assertTrue(ret2 == 0)
 
 
-class TestBoxPSPreload(unittest.TestCase):
-    """  TestCases for BoxPS Preload """
-
-    def test_boxps_cpu(self):
-        self.run_boxps_preload(True, True)
-        self.run_boxps_preload(True, False)
-
-    def test_boxps_gpu(self):
-        self.run_boxps_preload(False, True)
-        self.run_boxps_preload(False, False)
-
-    def run_boxps_preload(self, is_cpu=True, random_with_lineid=False):
-        program = fluid.Program()
-        with fluid.program_guard(program):
-            x = fluid.layers.data(
-                name='x', shape=[1], dtype='int64', lod_level=0)
-            y = fluid.layers.data(
-                name='y', shape=[1], dtype='int64', lod_level=0)
-            z = layers.data(name='z', shape=[1], dtype='int64')
-            emb_x, emb_y = _pull_box_sparse([x, y], size=2)
-            emb_xp = _pull_box_sparse(x, size=2)
-            concat = layers.concat([emb_x, emb_y], axis=1)
-            fc = layers.fc(input=concat,
-                           name="fc",
-                           size=1,
-                           num_flatten_dims=1,
-                           bias_attr=False)
-            loss = layers.reduce_mean(fc)
-            place = fluid.CPUPlace(
-            ) if is_cpu or not core.is_compiled_with_cuda(
-            ) else fluid.CUDAPlace(0)
-            exe = fluid.Executor(place)
-            batch_size = 100
-
-            def binary_print(slot, fout):
-                fout.write(str(len(slot)) + " ")
-                for e in slot:
-                    fout.write(str(e) + " ")
-
-            batch1 = np.ones(
-                (batch_size, 2, 1)).astype("int64").reshape(batch_size, 2, 1)
-            filelist = []
-            place_str = "cpu" if is_cpu else "gpu"
-            for i in range(2):
-                filelist.append("test_hdfs_" + place_str + "_" + str(i))
-            for f in filelist:
-                with open(f, "w") as fout:
-                    for ins in batch1:
-                        for slot in ins:
-                            binary_print(slot, fout)
-                    fout.write("\n")
-
-            def create_dataset():
-                dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset")
-                dataset.set_date("20190930")
-                dataset.set_use_var([x, y])
-                dataset.set_batch_size(2)
-                dataset.set_thread(1)
-                dataset.set_filelist(filelist)
-                return dataset
-
-            datasets = []
-            datasets.append(create_dataset())
-            datasets.append(create_dataset())
-            optimizer = fluid.optimizer.SGD(learning_rate=0.5)
-            optimizer = fluid.optimizer.PipelineOptimizer(
-                optimizer,
-                cut_list=[],
-                place_list=[place],
-                concurrency_list=[1],
-                queue_size=1,
-                sync_steps=-1)
-            optimizer.minimize(loss)
-
-            program._pipeline_opt["dump_fields"] = [
-                "fc.tmp_0", "fc.tmp_0@GRAD", "fake_var", "z",
-                "reduce_mean_3.tmp_0"
-            ]
-            # fake_var: not in scope
-            # z: in scope, but no initialized
-            # reduce_mean_0.tmp_0, dimension is not right
-
-            program._pipeline_opt["dump_fields_path"] = "./dump_log/"
-            program._pipeline_opt["dump_param"] = ["fc.w_0"]
-            program._pipeline_opt["enable_random_dump"] = True
-            program._pipeline_opt["dump_interval"] = 10
-            program._pipeline_opt["random_with_lineid"] = random_with_lineid
-
-            exe.run(fluid.default_startup_program())
-            datasets[0].load_into_memory()
-            datasets[0].begin_pass()
-            datasets[0].slots_shuffle([])
-            datasets[1].preload_into_memory()
-            exe.train_from_dataset(
-                program=fluid.default_main_program(),
-                dataset=datasets[0],
-                print_period=1)
-            datasets[0].end_pass(True)
-            datasets[1].wait_preload_done()
-            datasets[1].begin_pass()
-            exe.train_from_dataset(
-                program=fluid.default_main_program(),
-                dataset=datasets[1],
-                print_period=1,
-                debug=True)
-            datasets[1].end_pass(False)
-            for f in filelist:
-                os.remove(f)
-            if os.path.isdir("dump_log"):
-                shutil.rmtree("dump_log")
-
-
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_communicator_half_async.py b/python/paddle/fluid/tests/unittests/test_communicator_half_async.py
index 31b476eac0566f962ea452bf1fde62f5cb3c5169..8a7904db95f7a1b8088197fdf16969e1ccfefae2 100644
--- a/python/paddle/fluid/tests/unittests/test_communicator_half_async.py
+++ b/python/paddle/fluid/tests/unittests/test_communicator_half_async.py
@@ -94,7 +94,7 @@ class TestCommunicatorHalfAsyncEnd2End(unittest.TestCase):
             current_id=0,
             role=role_maker.Role.WORKER
             if training_role == "TRAINER" else role_maker.Role.SERVER,
-            worker_num=1,
+            worker_num=2,
             server_endpoints=["127.0.0.1:6002"])
 
         if training_role == "TRAINER":
diff --git a/python/paddle/fluid/tests/unittests/test_complex_trace_layer.py b/python/paddle/fluid/tests/unittests/test_complex_trace_layer.py
index 5293d594be993a85a54508af99cf08b4a40a380b..acc1e41b246309b312426b0a5b0bb7670c2bdfb7 100644
--- a/python/paddle/fluid/tests/unittests/test_complex_trace_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_complex_trace_layer.py
@@ -33,7 +33,7 @@ class TestComplexTraceLayer(unittest.TestCase):
         for place in self._places:
             with dg.guard(place):
                 var_x = dg.to_variable(input)
-                result = cpx.trace(var_x, offset=1, dim1=0, dim2=2).numpy()
+                result = cpx.trace(var_x, offset=1, axis1=0, axis2=2).numpy()
                 target = np.trace(input, offset=1, axis1=0, axis2=2)
                 self.assertTrue(np.allclose(result, target))
 
diff --git a/python/paddle/fluid/tests/unittests/test_data_norm_op.py b/python/paddle/fluid/tests/unittests/test_data_norm_op.py
index 0b7ed20f4b1c8f30f75613a2023eab3e1cea7855..c766cf17f422205521641ae44ab2060b4ab6e81c 100644
--- a/python/paddle/fluid/tests/unittests/test_data_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_data_norm_op.py
@@ -437,129 +437,6 @@ class TestDataNormOpWithSlotDim(OpTest):
         self.check_grad(['X'], 'Y', no_grad_set=set([]))
 
 
-class TestDataNormOpWithSyncStats(unittest.TestCase):
-    """
-    test class for data norm op
-    test forward and backward
-    """
-
-    def test_sync_stats(self):
-        if not core.is_compiled_with_cuda():
-            return
-        if os.name == 'nt':
-            print(
-                'Skip TestDataNormOpWithSyncStats because nccl is not supported on windows'
-            )
-            return
-        x = fluid.layers.data(name='x', shape=[1], dtype='int64', lod_level=0)
-        emb = layers.embedding(
-            input=x,
-            param_attr=fluid.ParamAttr(name="embx"),
-            size=[10, 2],
-            is_sparse=False)
-
-        dn = layers.data_norm(
-            input=emb,
-            name="hehe",
-            epsilon=1e-4,
-            param_attr={
-                "batch_size": 1e4,
-                "batch_sum": 1e5,
-                "batch_square": 1e4
-            },
-            summary_decay_rate=1,
-            sync_stats=True)  #[-1,3]
-        loss = layers.mean(dn)
-
-        optimizer = fluid.optimizer.SGD(learning_rate=0.5)
-        optimizer = fluid.optimizer.PipelineOptimizer(
-            optimizer,
-            cut_list=[[emb], [loss]],
-            place_list=[
-                fluid.CUDAPlace(0), fluid.CUDAPlace(0), fluid.CPUPlace()
-            ],
-            concurrency_list=[1, 1, 1],
-            queue_size=1,
-            sync_steps=10000000, )
-
-        all_p = fluid.default_main_program().global_block().all_parameters()
-        parameter_without_datanorm = []
-        for e in all_p:
-            if e.name.find("batch_size") != -1 or e.name.find(
-                    "batch_sq") != -1 or e.name.find("batch_sum") != -1:
-                continue
-            parameter_without_datanorm.append(e.name)
-        optimizer.minimize(loss, parameter_list=parameter_without_datanorm)
-        place = fluid.CUDAPlace(0)
-        exe = fluid.Executor(place)
-        #prepare data
-        batch_size = 1
-
-        def binary_print(slot, fout):
-            num = np.int16(len(slot) + 1)
-            num.tofile(fout)
-            a = np.int64(batch_size)
-            a.tofile(fout)
-            slot.tofile(fout)
-
-        #batch1 = np.array([[0,1], [1,2], [2,3]]).astype("int64").reshape(batch_size,2,1)
-        #batch2 = np.array([[1,2], [2,3], [3,4]]).astype("int64").reshape(batch_size,2,1)
-        batch1 = np.ones(
-            (batch_size, 1)).astype("int64").reshape(batch_size, 1, 1)
-        batch2 = np.ones(
-            (batch_size, 1)).astype("int64").reshape(batch_size, 1, 1)
-        data = [batch1, batch2]
-        data = [batch1]
-        filelist = []
-        for i in range(2):
-            filelist.append("test_pipeline_input_" + str(i))
-        for f in filelist:
-            with open(f, "wb") as fout:
-                for batch_data in data:
-                    for ins in batch_data:
-                        for slot in ins:
-                            binary_print(slot, fout)
-
-        dataset = fluid.DatasetFactory().create_dataset("FileInstantDataset")
-        dataset.set_use_var([x])
-        dataset.set_batch_size(batch_size)
-        dataset.set_filelist(filelist)
-
-        block = fluid.default_startup_program().global_block()
-        block.append_op(
-            type='c_comm_init_all', attrs={'ring_id': 0,
-                                           'devices': [0, 1]})
-        with open("main_program", "w") as fout:
-            fout.write(str(fluid.default_main_program()))
-        with open("startup_program", "w") as fout:
-            fout.write(str(fluid.default_startup_program()))
-        exe.run(fluid.default_startup_program())
-        emb_t = fluid.global_scope().find_var("embx").get_tensor()
-        para = np.ones((10, 2)).astype("float32")
-        emb_t.set(para, place)
-        for epoch in range(1):
-            exe.train_from_dataset(
-                fluid.default_main_program(),
-                dataset,
-                thread=2,
-                debug=False,
-                fetch_list=[],
-                fetch_info=[],
-                print_period=1)
-        batch_size = np.array(fluid.global_scope().find_var("hehe.batch_size")
-                              .get_tensor())
-        self.assertEqual(batch_size[0], 10002)
-        b = np.array(fluid.global_scope().find_var("hehe.batch_sum").get_tensor(
-        ))
-        self.assertEqual(b[0], 100002)
-        c = np.array(fluid.global_scope().find_var("hehe.batch_square_sum")
-                     .get_tensor())
-        self.assertEqual(c[0], 10162)
-
-        for f in filelist:
-            os.remove(f)
-
-
 class TestDataNormOpErrorr(unittest.TestCase):
     def test_errors(self):
         with program_guard(Program(), Program()):
diff --git a/python/paddle/fluid/tests/unittests/test_dataloader_keep_order.py b/python/paddle/fluid/tests/unittests/test_dataloader_keep_order.py
index e3fec5a62feeea9cc40d6e348cf99bc922f465dc..5796e13336ccf680930e0704e5f1fe5eca623937 100644
--- a/python/paddle/fluid/tests/unittests/test_dataloader_keep_order.py
+++ b/python/paddle/fluid/tests/unittests/test_dataloader_keep_order.py
@@ -32,7 +32,7 @@ def create_reader(shape, batch_number):
 class DataLoaderKeepOrderTestBase(unittest.TestCase):
     def initParameters(self):
         self.iterable = False
-        self.break_num = 10000
+        self.break_num = 100
 
     def setUp(self):
         self.epoch_num = 3
@@ -151,7 +151,7 @@ class DataLoaderKeepOrderTestBase(unittest.TestCase):
 class IterableDataLoaderKeepOrderTest2(DataLoaderKeepOrderTestBase):
     def initParameters(self):
         self.iterable = True
-        self.break_num = 10000
+        self.break_num = 100
 
 
 class IterableDataLoaderKeepOrderTest3(DataLoaderKeepOrderTestBase):
diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py
index ac0713d65ecc4477c9f50b386e8dbf739208729f..ba292f2d87c376ace317fc3fb9b81ce5c5596eb2 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -936,6 +936,7 @@ class TestDistBase(unittest.TestCase):
                          check_error_log=False,
                          need_envs={},
                          log_name=""):
+
         required_envs = self._get_required_envs(check_error_log, need_envs)
 
         local_losses \
@@ -975,6 +976,7 @@ class TestDistBase(unittest.TestCase):
                                      check_error_log=False,
                                      need_envs={},
                                      log_name=""):
+
         # need open p2p or shm otherwise multi cards mode will hang
         need_envs.update({"NCCL_P2P_DISABLE": "0", "NCCL_SHM_DISABLE": "0"})
 
diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_dgc_nccl.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_dgc_nccl.py
index 0b9b85d5d52c38f748679a92a99ec61c3dec7903..07746dd9f6cff297feacfa2dac24d89b2af876ab 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist_dgc_nccl.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_dgc_nccl.py
@@ -39,6 +39,7 @@ class TestDistMnistNCCL2DGC(TestDistBase):
         self._nccl2_mode = True
         self._use_dgc = True
 
+    @unittest.skip(reason="Skip unstable ut")
     def test_dist_train(self):
         import paddle.fluid as fluid
         if fluid.core.is_compiled_with_cuda():
@@ -68,6 +69,7 @@ class TestDistMnistNCCL2DGCMultiCards(TestDistBase):
         self._nccl2_mode = True
         self._use_dgc = True
 
+    @unittest.skip(reason="Skip unstable ut")
     def test_dist_train(self):
         import paddle.fluid as fluid
         if fluid.core.is_compiled_with_cuda():
diff --git a/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py b/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py
index a872b5ce4db9867dad37aad299198c3cc268b3cb..3189f092413c1f6f1526a5ca66b27f91c95082b1 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py
@@ -46,7 +46,7 @@ class TestDistSimnetBow2x2DenseAsync(TestDistBase):
         self._sync_mode = False
         self._enforce_place = "CPU"
 
-    #FIXME(typhoonzero): fix async tests later
+    # FIXME(typhoonzero): fix async tests later
     def notest_simnet_bow(self):
         need_envs = {
             "IS_DISTRIBUTED": '0',
@@ -107,7 +107,7 @@ class TestDistSimnetBow2x2LookupTableSync(TestDistBase):
 
     def test_simnet_bow(self):
         need_envs = {
-            "IS_DISTRIBUTED": '1',
+            "IS_DISTRIBUTED": '0',
             "IS_SPARSE": '1',
             'IS_SELF_CONTAINED_LR': '1'
         }
@@ -126,7 +126,7 @@ class TestDistSimnetBow2x2LookupTableAsync(TestDistBase):
 
     def test_simnet_bow(self):
         need_envs = {
-            "IS_DISTRIBUTED": '1',
+            "IS_DISTRIBUTED": '0',
             "IS_SPARSE": '1',
             'IS_SELF_CONTAINED_LR': '1'
         }
@@ -145,7 +145,7 @@ class TestDistSimnetBow2x2LookupTableNotContainLRSync(TestDistBase):
 
     def test_simnet_bow(self):
         need_envs = {
-            "IS_DISTRIBUTED": '1',
+            "IS_DISTRIBUTED": '0',
             "IS_SPARSE": '1',
             'IS_SELF_CONTAINED_LR': '0'
         }
diff --git a/python/paddle/fluid/tests/unittests/test_executor_feed_scalar.py b/python/paddle/fluid/tests/unittests/test_executor_feed_non_tensor.py
similarity index 72%
rename from python/paddle/fluid/tests/unittests/test_executor_feed_scalar.py
rename to python/paddle/fluid/tests/unittests/test_executor_feed_non_tensor.py
index 562f3066efba78059d6d42de3f25b5e8a5f54c0d..23c4191f6cfd8cd19777fe4660acbee4f0c98a66 100644
--- a/python/paddle/fluid/tests/unittests/test_executor_feed_scalar.py
+++ b/python/paddle/fluid/tests/unittests/test_executor_feed_non_tensor.py
@@ -84,6 +84,28 @@ class TestExecutor(unittest.TestCase):
             self.assertEqual(_lr._dtype(), fluid.core.VarDesc.VarType.FP32)
             self.assertEqual(type(a), int)
 
+    def test_program_feed_list(self):
+        main_program = fluid.Program()
+        startup_program = fluid.Program()
+        scope = fluid.Scope()
+        with fluid.program_guard(main_program, startup_program):
+            with fluid.scope_guard(scope):
+                cpu = fluid.CPUPlace()
+                exe = fluid.Executor(cpu)
+                lr, cost = self.net()
+                exe.run(startup_program)
+                train_data = [[1.0], [2.0], [3.0], [4.0]]
+                y_true = [[2.0], [4.0], [6.0], [8.0]]
+                a = 0
+                _lr, _ = exe.run(feed={'x': train_data,
+                                       'y': y_true,
+                                       'lr': a},
+                                 fetch_list=[lr, cost],
+                                 return_numpy=False)
+            self.assertEqual(_lr._dtype(), lr.dtype)
+            self.assertEqual(_lr._dtype(), fluid.core.VarDesc.VarType.FP32)
+            self.assertEqual(type(y_true), list)
+
     def test_compiled_program_feed_scalar(self):
         main_program = fluid.Program()
         startup_program = fluid.Program()
@@ -125,10 +147,32 @@ class TestAsLodTensor(unittest.TestCase):
                                               fluid.core.VarDesc.VarType.FP64)
         self.assertEqual(tensor._dtype(), fluid.core.VarDesc.VarType.FP64)
 
-    def test_as_lodtensor_error(self):
+    def test_as_lodtensor_assertion_error(self):
         cpu = fluid.CPUPlace()
         self.assertRaises(AssertionError, fluid.executor._as_lodtensor, 1, cpu)
 
+    def test_as_lodtensor_type_error(self):
+        cpu = fluid.CPUPlace()
+        self.assertRaises(TypeError, fluid.executor._as_lodtensor, {"a": 1},
+                          cpu, fluid.core.VarDesc.VarType.INT32)
+
+    def test_as_lodtensor_list(self):
+        cpu = fluid.CPUPlace()
+        tensor = fluid.executor._as_lodtensor([1, 2], cpu,
+                                              fluid.core.VarDesc.VarType.FP64)
+        self.assertEqual(tensor._dtype(), fluid.core.VarDesc.VarType.FP64)
+
+    def test_as_lodtensor_tuple(self):
+        cpu = fluid.CPUPlace()
+        tensor = fluid.executor._as_lodtensor((1, 2), cpu,
+                                              fluid.core.VarDesc.VarType.FP64)
+        self.assertEqual(tensor._dtype(), fluid.core.VarDesc.VarType.FP64)
+
+    def test_as_lodtensor_nested_list(self):
+        cpu = fluid.CPUPlace()
+        self.assertRaises(TypeError, fluid.executor._as_lodtensor,
+                          [[1], [1, 2]], cpu, fluid.core.VarDesc.VarType.INT32)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
index 6943f3d0ff417798099996bd8f63300775a421df..4314faaf397a2a53a65368ef6625952bc22c9616 100644
--- a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
@@ -242,6 +242,36 @@ class TestFakeQuantDequantMovingOp(TestMovingOpBase):
         return np.round(self.inputs['X'] / out_scale *
                         range_v) * out_scale / range_v
 
+    def test_check_grad(self):
+        x = self.inputs["X"]
+        gradient = [np.ones(x.shape) / np.product(x.shape)]
+        self.check_grad(["X"], "Out", user_defined_grads=gradient)
+
+
+class TestFakeQuantDequantAbsOp(OpTest):
+    def setUp(self):
+        self.op_type = "fake_quantize_dequantize_abs_max"
+        self.attrs = {'bit_length': 8}
+        self.inputs = {'X': np.random.random((124, 240)).astype("float32"), }
+        scale = np.max(np.abs(self.inputs['X'])).astype("float32")
+        out_data = self.calc_output(scale)
+        self.outputs = {
+            'Out': out_data,
+            'OutScale': np.array(scale).astype("float32"),
+        }
+
+    def calc_output(self, scale):
+        range_v = (1 << (self.attrs['bit_length'] - 1)) - 1
+        return np.round(self.inputs['X'] / scale * range_v) * scale / range_v
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        x = self.inputs["X"]
+        gradient = [np.ones(x.shape) / np.product(x.shape)]
+        self.check_grad(["X"], "Out", user_defined_grads=gradient)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fill_any_like_op.py b/python/paddle/fluid/tests/unittests/test_fill_any_like_op.py
index 52fdf94f4a8728336d6f76ddc013b984305de1d7..b18b5456c12aa23beae6b516c418c60783a85551 100644
--- a/python/paddle/fluid/tests/unittests/test_fill_any_like_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fill_any_like_op.py
@@ -97,185 +97,5 @@ class TestFillAnyLikeOpFloat16(TestFillAnyLikeOp):
         self.dtype = np.float16
 
 
-class TestFillAnyLikeOp_attr_out(unittest.TestCase):
-    """ Test fill_any_like op(whose API is full_like) for attr out. """
-
-    def test_attr_tensor_API(self):
-        startup_program = fluid.Program()
-        train_program = fluid.Program()
-        with fluid.program_guard(train_program, startup_program):
-            fill_value = 2.0
-            input = fluid.data(name='input', dtype='float32', shape=[2, 3])
-            output = paddle.full_like(input, fill_value)
-            output_dtype = paddle.full_like(input, fill_value, dtype='float32')
-
-            place = fluid.CPUPlace()
-            if fluid.core.is_compiled_with_cuda():
-                place = fluid.CUDAPlace(0)
-            exe = fluid.Executor(place)
-            exe.run(startup_program)
-
-            img = np.array([[1, 2, 3], [4, 5, 6]]).astype(np.float32)
-
-            res = exe.run(train_program,
-                          feed={'input': img},
-                          fetch_list=[output])
-
-            out_np = np.array(res[0])
-            self.assertTrue(
-                not (out_np - np.full_like(img, fill_value)).any(),
-                msg="full_like output is wrong, out = " + str(out_np))
-
-
-class TestFillAnyLikeOpError(unittest.TestCase):
-    def test_errors(self):
-        with program_guard(Program(), Program()):
-            #for ci coverage
-
-            input_data = fluid.data(name='input', dtype='float32', shape=[2, 3])
-            output = paddle.full_like(input_data, 2.0)
-
-            def test_input_dtype():
-                paddle.full_like
-
-            self.assertRaises(
-                ValueError,
-                paddle.full_like,
-                input=input_data,
-                fill_value=2,
-                dtype='uint4')
-            self.assertRaises(
-                TypeError,
-                paddle.full_like,
-                input=input_data,
-                fill_value=2,
-                dtype='int16')
-
-
-class ApiOnesLikeTest(unittest.TestCase):
-    def test_out(self):
-        with fluid.program_guard(fluid.Program()):
-            data = fluid.data(shape=[10], dtype="float64", name="data")
-            ones = paddle.ones_like(data, device="cpu")
-            place = fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            result, = exe.run(feed={"data": np.random.rand(10)},
-                              fetch_list=[ones])
-            expected_result = np.ones(10, dtype="float64")
-        self.assertEqual((result == expected_result).all(), True)
-
-        with fluid.program_guard(fluid.Program()):
-            data = fluid.data(shape=[10], dtype="float64", name="data")
-            ones = paddle.ones_like(data, device="cpu", dtype="float32")
-            place = fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            result, = exe.run(feed={"data": np.random.rand(10)},
-                              fetch_list=[ones])
-            expected_result = np.ones(10, dtype="float32")
-        self.assertEqual((result == expected_result).all(), True)
-
-        with fluid.program_guard(fluid.Program()):
-            data = fluid.data(shape=[10], dtype="float64", name="data")
-            ones = paddle.ones_like(data)
-            place = fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            result, = exe.run(feed={"data": np.random.rand(10)},
-                              fetch_list=[ones])
-            expected_result = np.ones(10, dtype="float32")
-        self.assertEqual((result == expected_result).all(), True)
-
-
-class ApiZerosLikeTest(unittest.TestCase):
-    def test_out(self):
-        with fluid.program_guard(fluid.Program()):
-            data = fluid.data(shape=[10], dtype="float64", name="data")
-            zeros = paddle.zeros_like(data, device="cpu")
-            place = fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            result, = exe.run(feed={"data": np.random.rand(10)},
-                              fetch_list=[zeros])
-            expected_result = np.zeros(10, dtype="float64")
-        self.assertEqual((result == expected_result).all(), True)
-
-        with fluid.program_guard(fluid.Program()):
-            data = fluid.data(shape=[10], dtype="float64", name="data")
-            zeros = paddle.zeros_like(data, device="cpu", dtype="float32")
-            place = fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            result, = exe.run(feed={"data": np.random.rand(10)},
-                              fetch_list=[zeros])
-            expected_result = np.zeros(10, dtype="float32")
-        self.assertEqual((result == expected_result).all(), True)
-
-        with fluid.program_guard(fluid.Program()):
-            data = fluid.data(shape=[10], dtype="float64", name="data")
-            zeros = paddle.zeros_like(data)
-            place = fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            result, = exe.run(feed={"data": np.random.rand(10)},
-                              fetch_list=[zeros])
-            expected_result = np.zeros(10, dtype="float32")
-        self.assertEqual((result == expected_result).all(), True)
-
-
-class TestOnesZerosError(unittest.TestCase):
-    def test_errors(self):
-        def test_device_error1():
-            with fluid.program_guard(fluid.Program(), fluid.Program()):
-                data = fluid.data(name="data", shape=[10], dtype="float32")
-                paddle.ones_like(data, device="opu")
-
-        self.assertRaises(ValueError, test_device_error1)
-
-        def test_device_error2():
-            with fluid.program_guard(fluid.Program(), fluid.Program()):
-                data = fluid.data(name="data", shape=[10], dtype="float32")
-                paddle.ones_like(data, dtype="float")
-
-        self.assertRaises(ValueError, test_device_error2)
-
-        def test_device_error3():
-            with fluid.program_guard(fluid.Program(), fluid.Program()):
-                data = fluid.data(name="data", shape=[10], dtype="float32")
-                paddle.zeros_like(data, device="opu")
-
-        self.assertRaises(ValueError, test_device_error3)
-
-        def test_device_error4():
-            with fluid.program_guard(fluid.Program(), fluid.Program()):
-                data = fluid.data(name="data", shape=[10], dtype="float32")
-                paddle.zeros_like(data, dtype="float")
-
-        self.assertRaises(ValueError, test_device_error4)
-
-        def test_ones_like_type_error():
-            with fluid.program_guard(fluid.Program(), fluid.Program()):
-                fluid.layers.ones_like([10], dtype="float")
-
-        self.assertRaises(TypeError, test_ones_like_type_error)
-
-        def test_ones_like_dtype_error():
-            with fluid.program_guard(fluid.Program(), fluid.Program()):
-                data = fluid.data(name="data", shape=[10], dtype="float16")
-                fluid.layers.ones_like(data, dtype="float32")
-
-        self.assertRaises(TypeError, test_ones_like_dtype_error)
-
-        def test_ones_like_out_type_error():
-            with fluid.program_guard(fluid.Program(), fluid.Program()):
-                data = fluid.data(name="data", shape=[10], dtype="float32")
-                fluid.layers.ones_like(data, dtype="float32", out=[10])
-
-        self.assertRaises(TypeError, test_ones_like_out_type_error)
-
-        def test_ones_like_out_dtype_error():
-            with fluid.program_guard(fluid.Program(), fluid.Program()):
-                data = fluid.data(name="data", shape=[10], dtype="float32")
-                out = fluid.data(name="out", shape=[10], dtype="float16")
-                fluid.layers.ones_like(data, dtype="float32", out=out)
-
-        self.assertRaises(TypeError, test_ones_like_out_dtype_error)
-
-
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py
index 37d4e3e7abb0d7b1dd878a5df20ede401a114606..0bd3516e48d2cf1db4a4f73678faa13b26c64f40 100644
--- a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py
@@ -22,8 +22,8 @@ import paddle
 import paddle.fluid.core as core
 from paddle.fluid.op import Operator
 import paddle.fluid as fluid
-from paddle.fluid import compiler, Program, program_guard
 import numpy as np
+from paddle.fluid import compiler, Program, program_guard
 
 
 # Situation 1: Attr(shape) is a list(without tensor)
@@ -85,16 +85,14 @@ class TestFillConstantOp4(OpTest):
 
 class TestFillConstantOp5(unittest.TestCase):
     def test_errors(self):
-        with fluid.program_guard(fluid.Program()):
-            data = fluid.data(name="X", shape=[1], dtype="float32")
-            out = paddle.zeros(shape=[1], out=data, dtype="float32")
+        with program_guard(Program()):
+            out_np = np.zeros(shape=(1), dtype='float32')
+            out = paddle.zeros(shape=[1], dtype="float32")
             place = fluid.CPUPlace()
             exe = fluid.Executor(place)
-            result = exe.run(feed={"X": np.array(
-                [0.1], dtype="float32")},
-                             fetch_list=[data, out])
-            self.assertEqual(result[0], result[1])
-        with fluid.program_guard(fluid.Program()):
+            result = exe.run(fetch_list=[out])
+            self.assertEqual((result == out_np).all(), True)
+        with program_guard(Program()):
             data = fluid.data(name="X", shape=[1], dtype="float32")
             out = paddle.ones(shape=[1], out=data, dtype="float32")
             place = fluid.CPUPlace()
@@ -389,98 +387,5 @@ class TestFillConstantOpError(unittest.TestCase):
             self.assertRaises(TypeError, test_shape_tensor_list_dtype)
 
 
-class ApiZerosTest(unittest.TestCase):
-    def test_out(self):
-        with fluid.program_guard(fluid.Program()):
-            zeros = paddle.zeros(shape=[10], dtype="float64")
-            place = fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            result, = exe.run(fetch_list=[zeros])
-            expected_result = np.zeros(10, dtype="float64")
-        self.assertEqual((result == expected_result).all(), True)
-
-        with fluid.program_guard(fluid.Program()):
-            zeros = paddle.zeros(shape=[10], dtype="int64")
-            place = fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            result, = exe.run(fetch_list=[zeros])
-            expected_result = np.zeros(10, dtype="int64")
-        self.assertEqual((result == expected_result).all(), True)
-
-        with fluid.program_guard(fluid.Program()):
-            zeros = paddle.zeros(shape=[10], dtype="int64", device="cpu")
-            place = fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            result, = exe.run(fetch_list=[zeros])
-            expected_result = np.zeros(10, dtype="int64")
-        self.assertEqual((result == expected_result).all(), True)
-
-
-class ApiOnesTest(unittest.TestCase):
-    def test_out(self):
-        with fluid.program_guard(fluid.Program()):
-            ones = paddle.ones(shape=[10], dtype="float64")
-            place = fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            result, = exe.run(fetch_list=[ones])
-            expected_result = np.ones(10, dtype="float64")
-        self.assertEqual((result == expected_result).all(), True)
-
-        with fluid.program_guard(fluid.Program()):
-            ones = paddle.ones(shape=[10], dtype="int64")
-            place = fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            result, = exe.run(fetch_list=[ones])
-            expected_result = np.ones(10, dtype="int64")
-        self.assertEqual((result == expected_result).all(), True)
-
-        with fluid.program_guard(fluid.Program()):
-            ones = paddle.ones(shape=[10], dtype="int64", device="cpu")
-            place = fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            result, = exe.run(fetch_list=[ones])
-            expected_result = np.ones(10, dtype="int64")
-        self.assertEqual((result == expected_result).all(), True)
-
-
-class ApiOnesZerosError(unittest.TestCase):
-    def test_errors(self):
-        def test_error1():
-            with fluid.program_guard(fluid.Program()):
-                ones = paddle.ones(shape=10, dtype="int64", device="opu")
-
-        self.assertRaises(ValueError, test_error1)
-
-        def test_error2():
-            with fluid.program_guard(fluid.Program()):
-                ones = paddle.ones(shape=10, dtype="int64", device="opu")
-
-        self.assertRaises(ValueError, test_error2)
-
-        def test_error3():
-            with fluid.program_guard(fluid.Program()):
-                ones = fluid.layers.ones(shape=10, dtype="int64")
-
-        self.assertRaises(TypeError, test_error3)
-
-        def test_error4():
-            with fluid.program_guard(fluid.Program()):
-                ones = fluid.layers.ones(shape=[10], dtype="int8")
-
-        self.assertRaises(TypeError, test_error4)
-
-        def test_error5():
-            with fluid.program_guard(fluid.Program()):
-                ones = fluid.layers.zeros(shape=10, dtype="int64")
-
-        self.assertRaises(TypeError, test_error5)
-
-        def test_error6():
-            with fluid.program_guard(fluid.Program()):
-                ones = fluid.layers.zeros(shape=[10], dtype="int8")
-
-        self.assertRaises(TypeError, test_error6)
-
-
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fl_listen_and_serv_op.py b/python/paddle/fluid/tests/unittests/test_fl_listen_and_serv_op.py
deleted file mode 100644
index de6b48e2cec602f8f73d7cf5f3f9b1fc66d55be6..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_fl_listen_and_serv_op.py
+++ /dev/null
@@ -1,219 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""test f1 listen and serv_op."""
-
-from __future__ import print_function
-
-import paddle
-import paddle.fluid as fluid
-from paddle.fluid import Program
-import os
-import signal
-import subprocess
-import time
-import unittest
-from multiprocessing import Process
-from op_test import OpTest
-import numpy
-import urllib
-import sys
-from dist_test_utils import *
-
-cache_path = os.path.expanduser('~/.cache/paddle/dataset')
-
-
-def run_trainer(use_cuda, sync_mode, ip, port, trainers, trainer_id):
-    ''' 
-    This function is run trainer.
-    Args:
-        use_cuda (bool): whether use cuda.
-        sync_mode (nouse): specify sync mode.
-        ip (string): the ip address.
-        port (string): the port for listening.
-        trainers (int): the count of trainer.
-        trainer_id (int): the id of trainer.
-
-    Returns:
-        None
-    '''
-    x = fluid.layers.data(name='x', shape=[1], dtype='float32')
-    y_predict = fluid.layers.fc(input=x, size=1, act=None)
-    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-    # loss function
-    cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-    avg_cost = fluid.layers.mean(cost)
-    # optimizer
-    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
-    sgd_optimizer.minimize(avg_cost)
-    with open("{}/trainer_recv_program.dms".format(cache_path), "rb") as f:
-        trainer_recv_program_desc_str = f.read()
-    with open("{}/trainer_main_program.dms".format(cache_path), "rb") as f:
-        trainer_main_program_desc_str = f.read()
-    with open("{}/trainer_send_program.dms".format(cache_path), "rb") as f:
-        trainer_send_program_desc_str = f.read()
-    recv_program = Program.parse_from_string(trainer_recv_program_desc_str)
-    main_program = Program.parse_from_string(trainer_main_program_desc_str)
-    send_program = Program.parse_from_string(trainer_send_program_desc_str)
-
-    trainer_startup_program = fluid.default_startup_program()
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    exe = fluid.Executor(place)
-
-    exe.run(trainer_startup_program)
-    for i in range(5):
-        exe.run(recv_program)
-        exe.run(fluid.default_main_program(),
-                feed={
-                    "x": numpy.array([1, 2]).astype('float32').reshape(2, 1),
-                    "y": numpy.array([2, 3]).astype('float32').reshape(2, 1)
-                })
-        exe.run(send_program)
-
-
-def run_pserver(use_cuda, sync_mode, ip, port, trainers, trainer_id):
-    ''' 
-    This function is run trainer.
-    Args:
-        use_cuda (bool): whether use cuda.
-        sync_mode (nouse): specify sync mode.
-        ip (string): the ip address.
-        port (string): the port for listening.
-        trainers (int): the count of trainer.
-        trainer_id (int): the id of trainer.
-
-    Returns:
-        None
-    '''
-    remove_ps_flag(os.getpid())
-    x = fluid.layers.data(name='x', shape=[1], dtype='float32')
-    y_predict = fluid.layers.fc(input=x, size=1, act=None)
-    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
-    # loss function
-    cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-    avg_cost = fluid.layers.mean(cost)
-    # optimizer
-    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
-    sgd_optimizer.minimize(avg_cost)
-    with open("{}/pserver_startup_program.dms".format(cache_path), "rb") as f:
-        pserver_startup_program_desc_str = f.read()
-    with open("{}/pserver_main_program.dms".format(cache_path), "rb") as f:
-        pserver_main_program_desc_str = f.read()
-
-    startup_program = Program.parse_from_string(
-        pserver_startup_program_desc_str)
-    main_program = Program.parse_from_string(pserver_main_program_desc_str)
-
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    exe = fluid.Executor(place)
-    exe.run(startup_program)
-    exe.run(main_program)
-
-
-class TestFlListenAndServOp(unittest.TestCase):
-    """This class is Test Fl Listen And ServOp."""
-
-    def setUp(self):
-        """This function si set Up."""
-        self.ps_timeout = 5
-        self.ip = "127.0.0.1"
-        self.port = "6000"
-        self.trainers = 2
-        self.trainer_id = 0
-
-    def _start_pserver(self, use_cuda, sync_mode, pserver_func):
-        """This function is start pserver."""
-        p = Process(
-            target=pserver_func,
-            args=(use_cuda, sync_mode, self.ip, self.port, self.trainers,
-                  self.trainer_id))
-        p.daemon = True
-        p.start()
-        return p
-
-    def _start_trainer0(self, use_cuda, sync_mode, pserver_func):
-        """This function is start trainer0."""
-        p = Process(
-            target=pserver_func,
-            args=(use_cuda, sync_mode, self.ip, self.port, self.trainers, 0))
-        p.daemon = True
-        p.start()
-        return p
-
-    def _start_trainer1(self, use_cuda, sync_mode, pserver_func):
-        """This function is start trainer1."""
-        p = Process(
-            target=pserver_func,
-            args=(use_cuda, sync_mode, self.ip, self.port, self.trainers, 1))
-        p.daemon = True
-        p.start()
-        return p
-
-    def _wait_ps_ready(self, pid):
-        """This function is wait ps ready."""
-        start_left_time = self.ps_timeout
-        sleep_time = 0.5
-        while True:
-            assert start_left_time >= 0, "wait ps ready failed"
-            time.sleep(sleep_time)
-            try:
-                os.stat("/tmp/paddle.%d.port" % pid)
-                return
-            except os.error:
-                start_left_time -= sleep_time
-
-    def test_rpc_interfaces(self):
-        """TODO(Yancey1989): need to make sure the rpc interface correctly."""
-        # TODO(Yancey1989): need to make sure the rpc interface correctly.
-        pass
-
-    def test_handle_signal_in_serv_op(self):
-        """run pserver on CPU in sync mode."""
-        # run pserver on CPU in sync mode
-        if sys.platform == 'win32' or sys.platform == 'sys.platform':
-            pass
-        else:
-            print(sys.platform)
-            file_list = [
-                'pserver_startup_program.dms', 'pserver_main_program.dms',
-                'trainer_recv_program.dms', 'trainer_main_program.dms',
-                'trainer_send_program.dms'
-            ]
-            if not os.path.exists(cache_path):
-                os.makedirs(cache_path)
-            prefix = 'wget --no-check-certificate https://paddlefl.bj.bcebos.com/test_fl_listen_and_serv/'
-            for f in file_list:
-                if not os.path.exists('{}/{}'.format(cache_path, f)):
-                    cmd = "wget --no-check-certificate https://paddlefl.bj.bcebos.com/test_fl_listen_and_serv/{} -P {}/".format(
-                        f, cache_path)
-                    os.system(cmd)
-            p1 = self._start_pserver(False, True, run_pserver)
-            self._wait_ps_ready(p1.pid)
-            time.sleep(5)
-            t1 = self._start_trainer0(False, True, run_trainer)
-            time.sleep(2)
-            t2 = self._start_trainer1(False, True, run_trainer)
-            # raise SIGTERM to pserver
-            time.sleep(2)
-            cmd_del = "rm trainer*dms* pserver*dms*"
-            os.system(cmd_del)
-            os.kill(p1.pid, signal.SIGINT)
-            p1.join()
-            os.kill(t1.pid, signal.SIGINT)
-            t1.join()
-            os.kill(t2.pid, signal.SIGINT)
-            t2.join()
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_checkpoint.py b/python/paddle/fluid/tests/unittests/test_fleet_checkpoint.py
index 35c2801bde31cc1b69a4cf9baf53fccc23f765c4..3318b67cadc74080468c0eab7ae1df1e834b5952 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_checkpoint.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_checkpoint.py
@@ -17,11 +17,14 @@ import paddle.fluid as fluid
 import paddle.fluid.incubate.fleet.base.role_maker as role_maker
 from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet, TrainStatus
 import os
-from paddle.distributed.fs_wrapper import LocalFS, BDFS
+import sys
+
+from paddle.fluid.incubate.fleet.utils.fs import LocalFS
+from paddle.fluid.incubate.fleet.utils.hdfs import HDFSClient
 
 
 class FleetTest(unittest.TestCase):
-    def _test_check_point(self, fs, dir_path):
+    def _test_checkpoint(self, fs, dir_path):
         file_name = "persistables"
 
         os.environ["TRAINING_ROLE"] = "TRAINER"
@@ -47,30 +50,60 @@ class FleetTest(unittest.TestCase):
         exe.run(fluid.default_startup_program())
 
         status = TrainStatus(2)
-        fleet.save_check_point(exe, dir_path, train_status=status, fs=fs)
+        fleet.save_checkpoint(exe, dir_path, train_status=status, fs=fs)
         n1 = fleet._get_last_checkpoint_no(dir_path, fs=fs)
 
-        status2 = fleet.load_check_point(exe, dir_path, trainer_id=0, fs=fs)
+        status2 = fleet.load_checkpoint(exe, dir_path, trainer_id=0, fs=fs)
         self.assertEqual(status2, status)
 
-        fleet.save_check_point(exe, dir_path, train_status=status, fs=fs)
+        fleet.save_checkpoint(exe, dir_path, train_status=status, fs=fs)
         n2 = fleet._get_last_checkpoint_no(dir_path, fs=fs)
         self.assertEqual(n2, n1 + 1)
 
-        fleet.clean_redundant_check_points(dir_path, fs=fs)
+        fleet.clean_redundant_checkpoints(dir_path, fs=fs)
 
-    def test_hdfs_check_point(self):
-        try:
-            fs = BDFS("xxxx", "xxxx", 1 * 1000, 1 * 1000)
-            dir_path = "/user/Paddle_Data/gongweibao/edl_test/my_paddle_model"
-            self._test_check_point(fs, dir_path)
-        except Exception as e:
-            print(e)
+        # unnormal
+        # test remain_all_checkpoint 
+        fleet.save_checkpoint(
+            exe,
+            dir_path,
+            train_status=status,
+            fs=fs,
+            remain_all_checkpoint=False)
 
-    def test_local_check_point(self):
+        # can't save under a file
+        fs = LocalFS()
+        cache_path = "./.load_cache"
+        fs.touch(cache_path)
+        try:
+            fleet.save_checkpoint(
+                exe,
+                dir_path,
+                train_status=status,
+                fs=fs,
+                cache_path=cache_path)
+            self.assertFalse(True)
+        except:
+            pass
+
+        # can't load under a file
+        try:
+            status2 = fleet.load_checkpoint(
+                exe, dir_path, trainer_id=0, fs=fs, cache_path=cache_path)
+            self.assertFalse(True)
+        except:
+            pass
+        fs.delete(cache_path)
+
+    def test_hdfs_checkpoint(self):
+        fs = HDFSClient("/usr/local/hadoop-2.7.7", None)
+        dir_path = "./checkpoint_test_hdfs"
+        self._test_checkpoint(fs, os.path.abspath(dir_path))
+
+    def test_local_checkpoint(self):
         fs = LocalFS()
-        dir_path = "./my_paddle_model"
-        self._test_check_point(fs, dir_path)
+        dir_path = "./checkpoint_test_local"
+        self._test_checkpoint(fs, dir_path)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py
new file mode 100644
index 0000000000000000000000000000000000000000..0668546a703bc00369d55e12d4b03c934c9315c2
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py
@@ -0,0 +1,341 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle
+import os
+
+
+class TestStrategyConfig(unittest.TestCase):
+    def test_amp(self):
+        strategy = paddle.fleet.DistributedStrategy()
+        strategy.amp = True
+        self.assertEqual(strategy.amp, True)
+        strategy.amp = False
+        self.assertEqual(strategy.amp, False)
+        strategy.amp = "True"
+        self.assertEqual(strategy.amp, False)
+
+    def test_amp_loss_scaling(self):
+        strategy = paddle.fleet.DistributedStrategy()
+        strategy.amp_loss_scaling = 32768
+        self.assertEqual(strategy.amp_loss_scaling, 32768)
+        strategy.amp_loss_scaling = 0.1
+        self.assertEqual(strategy.amp_loss_scaling, 32768)
+
+    def test_recompute(self):
+        strategy = paddle.fleet.DistributedStrategy()
+        strategy.recompute = True
+        self.assertEqual(strategy.recompute, True)
+        strategy.recompute = False
+        self.assertEqual(strategy.recompute, False)
+        strategy.recompute = "True"
+        self.assertEqual(strategy.recompute, False)
+
+    def test_recompute_checkpoints(self):
+        strategy = paddle.fleet.DistributedStrategy()
+        strategy.recompute_checkpoints = ["var1", "var2", "var3"]
+        self.assertEqual(len(strategy.recompute_checkpoints), 3)
+        import paddle.fluid as fluid
+        program = fluid.Program()
+        cur_block = program.current_block()
+        var1 = cur_block.create_var(name="var4", shape=[1, 1], dtype="int32")
+        var2 = cur_block.create_var(name="var5", shape=[1, 1], dtype="int32")
+        var3 = cur_block.create_var(name="var6", shape=[1, 1], dtype="int32")
+        strategy.recompute_checkpoints = [var1, var2, var3]
+        self.assertEqual(len(strategy.recompute_checkpoints), 3)
+        self.assertEqual(strategy.recompute_checkpoints[0], "var4")
+        strategy.recompute_checkpoints = [var1, "var2", var3]
+        self.assertEqual(strategy.recompute_checkpoints[1], "var5")
+
+    def test_pipeline(self):
+        strategy = paddle.fleet.DistributedStrategy()
+        strategy.pipeline = True
+        self.assertEqual(strategy.pipeline, True)
+        strategy.pipeline = False
+        self.assertEqual(strategy.pipeline, False)
+        strategy.pipeline = "True"
+        self.assertEqual(strategy.pipeline, False)
+
+    def test_pipeline_micro_batch(self):
+        strategy = paddle.fleet.DistributedStrategy()
+        strategy.pipeline_micro_batch = 1
+        self.assertEqual(strategy.pipeline_micro_batch, 1)
+        strategy.pipeline_micro_batch = 0.1
+        self.assertEqual(strategy.pipeline_micro_batch, 1)
+
+    def test_localsgd(self):
+        strategy = paddle.fleet.DistributedStrategy()
+        strategy.localsgd = True
+        self.assertEqual(strategy.localsgd, True)
+        strategy.localsgd = False
+        self.assertEqual(strategy.localsgd, False)
+        strategy.localsgd = "True"
+        self.assertEqual(strategy.localsgd, False)
+
+    def test_localsgd_k_step(self):
+        strategy = paddle.fleet.DistributedStrategy()
+        strategy.localsgd_k_step = 1
+        self.assertEqual(strategy.localsgd_k_step, 1)
+        strategy.localsgd_k_step = "2"
+        self.assertEqual(strategy.localsgd_k_step, 1)
+
+    def test_dgc(self):
+        strategy = paddle.fleet.DistributedStrategy()
+        strategy.dgc = True
+        self.assertEqual(strategy.dgc, True)
+        strategy.dgc = False
+        self.assertEqual(strategy.dgc, False)
+        strategy.dgc = "True"
+        self.assertEqual(strategy.dgc, False)
+
+    def test_hierachical_allreduce(self):
+        strategy = paddle.fleet.DistributedStrategy()
+        strategy.hierachical_allreduce = True
+        self.assertEqual(strategy.hierachical_allreduce, True)
+        strategy.hierachical_allreduce = False
+        self.assertEqual(strategy.hierachical_allreduce, False)
+        strategy.hierachical_allreduce = "True"
+        self.assertEqual(strategy.hierachical_allreduce, False)
+
+    def test_nccl_comm_num(self):
+        strategy = paddle.fleet.DistributedStrategy()
+        strategy.nccl_comm_num = 1
+        self.assertEqual(strategy.nccl_comm_num, 1)
+        strategy.nccl_comm_num = "2"
+        self.assertEqual(strategy.nccl_comm_num, 1)
+
+    def test_gradient_merge(self):
+        strategy = paddle.fleet.DistributedStrategy()
+        strategy.gradient_merge = True
+        self.assertEqual(strategy.gradient_merge, True)
+        strategy.gradient_merge = False
+        self.assertEqual(strategy.gradient_merge, False)
+        strategy.gradient_merge = "True"
+        self.assertEqual(strategy.gradient_merge, False)
+
+    def test_gradient_merge_k_step(self):
+        strategy = paddle.fleet.DistributedStrategy()
+        strategy.gradient_merge_k_step = 1
+        self.assertEqual(strategy.gradient_merge_k_step, 1)
+        strategy.gradient_merge_k_step = "2"
+        self.assertEqual(strategy.gradient_merge_k_step, 1)
+
+    def test_sequential_execution(self):
+        strategy = paddle.fleet.DistributedStrategy()
+        strategy.sequential_execution = True
+        self.assertEqual(strategy.sequential_execution, True)
+        strategy.sequential_execution = False
+        self.assertEqual(strategy.sequential_execution, False)
+        strategy.sequential_execution = "True"
+        self.assertEqual(strategy.sequential_execution, False)
+
+    def test_lars(self):
+        strategy = paddle.fleet.DistributedStrategy()
+        strategy.lars = True
+        self.assertEqual(strategy.lars, True)
+        strategy.lars = False
+        self.assertEqual(strategy.lars, False)
+        strategy.lars = "True"
+        self.assertEqual(strategy.lars, False)
+
+    def test_lamb(self):
+        strategy = paddle.fleet.DistributedStrategy()
+        strategy.lamb = True
+        self.assertEqual(strategy.lamb, True)
+        strategy.lamb = False
+        self.assertEqual(strategy.lamb, False)
+        strategy.lamb = "True"
+        self.assertEqual(strategy.lamb, False)
+
+    def test_fuse_elewise_add_act_ops(self):
+        strategy = paddle.fleet.DistributedStrategy()
+        strategy.fuse_elewise_add_act_ops = True
+        self.assertEqual(strategy.fuse_elewise_add_act_ops, True)
+        strategy.fuse_elewise_add_act_ops = False
+        self.assertEqual(strategy.fuse_elewise_add_act_ops, False)
+        strategy.fuse_elewise_add_act_ops = "True"
+        self.assertEqual(strategy.fuse_elewise_add_act_ops, False)
+
+    def test_fuse_bn_act_ops(self):
+        strategy = paddle.fleet.DistributedStrategy()
+        strategy.fuse_bn_act_ops = True
+        self.assertEqual(strategy.fuse_bn_act_ops, True)
+        strategy.fuse_bn_act_ops = False
+        self.assertEqual(strategy.fuse_bn_act_ops, False)
+        strategy.fuse_bn_act_ops = "True"
+        self.assertEqual(strategy.fuse_bn_act_ops, False)
+
+    def test_enable_auto_fusion(self):
+        strategy = paddle.fleet.DistributedStrategy()
+        strategy.enable_auto_fusion = True
+        self.assertEqual(strategy.enable_auto_fusion, True)
+        strategy.enable_auto_fusion = False
+        self.assertEqual(strategy.enable_auto_fusion, False)
+        strategy.enable_auto_fusion = "True"
+        self.assertEqual(strategy.enable_auto_fusion, False)
+
+    def test_fuse_relu_depthwise_conv(self):
+        strategy = paddle.fleet.DistributedStrategy()
+        strategy.fuse_relu_depthwise_conv = True
+        self.assertEqual(strategy.fuse_relu_depthwise_conv, True)
+        strategy.fuse_relu_depthwise_conv = False
+        self.assertEqual(strategy.fuse_relu_depthwise_conv, False)
+        strategy.fuse_relu_depthwise_conv = "True"
+        self.assertEqual(strategy.fuse_relu_depthwise_conv, False)
+
+    def test_enable_inplace(self):
+        strategy = paddle.fleet.DistributedStrategy()
+        strategy.enable_inplace = True
+        self.assertEqual(strategy.enable_inplace, True)
+        strategy.enable_inplace = False
+        self.assertEqual(strategy.enable_inplace, False)
+        strategy.enable_inplace = "True"
+        self.assertEqual(strategy.enable_inplace, False)
+
+    def test_fuse_all_reduce_ops(self):
+        strategy = paddle.fleet.DistributedStrategy()
+        strategy.fuse_all_reduce_ops = True
+        self.assertEqual(strategy.fuse_all_reduce_ops, True)
+        strategy.fuse_all_reduce_ops = False
+        self.assertEqual(strategy.fuse_all_reduce_ops, False)
+        strategy.fuse_all_reduce_ops = "True"
+        self.assertEqual(strategy.fuse_all_reduce_ops, False)
+
+    def test_num_iteration_per_drop_scope(self):
+        strategy = paddle.fleet.DistributedStrategy()
+        strategy.num_iteration_per_drop_scope = 1
+        self.assertEqual(strategy.num_iteration_per_drop_scope, 1)
+        strategy.num_iteration_per_drop_scope = 0.1
+        self.assertEqual(strategy.num_iteration_per_drop_scope, 1)
+
+    def test_sync_batch_norm(self):
+        strategy = paddle.fleet.DistributedStrategy()
+        strategy.sync_batch_norm = True
+        self.assertEqual(strategy.sync_batch_norm, True)
+        strategy.sync_batch_norm = False
+        self.assertEqual(strategy.sync_batch_norm, False)
+        strategy.sync_batch_norm = "True"
+        self.assertEqual(strategy.sync_batch_norm, False)
+
+    def test_fuse_all_optimizer_ops(self):
+        strategy = paddle.fleet.DistributedStrategy()
+        strategy.fuse_all_optimizer_ops = True
+        self.assertEqual(strategy.fuse_all_optimizer_ops, True)
+        strategy.fuse_all_optimizer_ops = False
+        self.assertEqual(strategy.fuse_all_optimizer_ops, False)
+        strategy.fuse_all_optimizer_ops = "True"
+        self.assertEqual(strategy.fuse_all_optimizer_ops, False)
+
+    def test_sync(self):
+        strategy = paddle.fleet.DistributedStrategy()
+        strategy.sync = True
+        self.assertEqual(strategy.sync, True)
+        strategy.sync = False
+        self.assertEqual(strategy.sync, False)
+        strategy.sync = "True"
+        self.assertEqual(strategy.sync, False)
+
+    def test_async_k_step(self):
+        strategy = paddle.fleet.DistributedStrategy()
+        strategy.async_k_step = 10000
+        self.assertEqual(strategy.async_k_step, 10000)
+        strategy.async_k_step = 0.1
+        self.assertEqual(strategy.async_k_step, 10000)
+
+    def test_send_queue_size(self):
+        strategy = paddle.fleet.DistributedStrategy()
+        strategy.send_queue_size = 10000
+        self.assertEqual(strategy.send_queue_size, 10000)
+        strategy.send_queue_size = 0.1
+        self.assertEqual(strategy.send_queue_size, 10000)
+
+    def test_independent_recv_thread(self):
+        strategy = paddle.fleet.DistributedStrategy()
+        strategy.independent_recv_thread = True
+        self.assertEqual(strategy.independent_recv_thread, True)
+        strategy.independent_recv_thread = False
+        self.assertEqual(strategy.independent_recv_thread, False)
+        strategy.independent_recv_thread = "True"
+        self.assertEqual(strategy.independent_recv_thread, False)
+
+    def test_min_send_grad_num_before_recv(self):
+        strategy = paddle.fleet.DistributedStrategy()
+        strategy.min_send_grad_num_before_recv = 10000
+        self.assertEqual(strategy.min_send_grad_num_before_recv, 10000)
+        strategy.min_send_grad_num_before_recv = 0.1
+        self.assertEqual(strategy.min_send_grad_num_before_recv, 10000)
+
+    def test_thread_pool_size(self):
+        strategy = paddle.fleet.DistributedStrategy()
+        strategy.thread_pool_size = 10000
+        self.assertEqual(strategy.thread_pool_size, 10000)
+        strategy.thread_pool_size = 0.1
+        self.assertEqual(strategy.thread_pool_size, 10000)
+
+    def test_send_wait_times(self):
+        strategy = paddle.fleet.DistributedStrategy()
+        strategy.send_wait_times = 10000
+        self.assertEqual(strategy.send_wait_times, 10000)
+        strategy.send_wait_times = 0.1
+        self.assertEqual(strategy.send_wait_times, 10000)
+
+    def test_runtime_split_send_recv(self):
+        strategy = paddle.fleet.DistributedStrategy()
+        strategy.runtime_split_send_recv = True
+        self.assertEqual(strategy.runtime_split_send_recv, True)
+        strategy.runtime_split_send_recv = False
+        self.assertEqual(strategy.runtime_split_send_recv, False)
+        strategy.runtime_split_send_recv = "True"
+        self.assertEqual(strategy.runtime_split_send_recv, False)
+
+    def use_thread_barrier(self):
+        strategy = paddle.fleet.DistributedStrategy()
+        strategy.thread_barrier = True
+        self.assertEqual(strategy.thread_barrier, True)
+        strategy.thread_barrier = False
+        self.assertEqual(strategy.thread_barrier, False)
+        strategy.thread_barrier = "True"
+        self.assertEqual(strategy.thread_barrier, False)
+
+    def test_enable_backward_optimizer_op_deps(self):
+        strategy = paddle.fleet.DistributedStrategy()
+        strategy.enable_backward_optimizer_op_deps = True
+        self.assertEqual(strategy.enable_backward_optimizer_op_deps, True)
+        strategy.enable_backward_optimizer_op_deps = False
+        self.assertEqual(strategy.enable_backward_optimizer_op_deps, False)
+        strategy.enable_backward_optimizer_op_deps = "True"
+        self.assertEqual(strategy.enable_backward_optimizer_op_deps, False)
+
+    def test_elastic(self):
+        strategy = paddle.fleet.DistributedStrategy()
+        strategy.elastic = True
+        self.assertEqual(strategy.elastic, True)
+        strategy.elastic = False
+        self.assertEqual(strategy.elastic, False)
+        strategy.elastic = "True"
+        self.assertEqual(strategy.elastic, False)
+
+    def test_auto(self):
+        strategy = paddle.fleet.DistributedStrategy()
+        strategy.auto = True
+        self.assertEqual(strategy.auto, True)
+        strategy.auto = False
+        self.assertEqual(strategy.auto, False)
+        strategy.auto = "True"
+        self.assertEqual(strategy.auto, False)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_flip.py b/python/paddle/fluid/tests/unittests/test_flip.py
index 77e416e5e6a73c63e775e9a1bdb8b56776bee9d2..6feee9ce57306a49b37223fa527dffe1387c2e40 100644
--- a/python/paddle/fluid/tests/unittests/test_flip.py
+++ b/python/paddle/fluid/tests/unittests/test_flip.py
@@ -30,9 +30,9 @@ class TestFlipOp_API(unittest.TestCase):
         startup_program = fluid.Program()
         train_program = fluid.Program()
         with fluid.program_guard(train_program, startup_program):
-            dims = [0]
+            axis = [0]
             input = fluid.data(name='input', dtype='float32', shape=[2, 3])
-            output = paddle.flip(input, dims)
+            output = paddle.flip(input, axis)
             place = fluid.CPUPlace()
             if fluid.core.is_compiled_with_cuda():
                 place = fluid.CUDAPlace(0)
@@ -68,7 +68,7 @@ class TestFlipOp(OpTest):
         self.outputs = {'Out': self.calc_ref_res()}
 
     def init_attrs(self):
-        self.attrs = {"dims": self.dims}
+        self.attrs = {"axis": self.axis}
 
     def test_check_output(self):
         self.check_output()
@@ -78,11 +78,11 @@ class TestFlipOp(OpTest):
 
     def init_test_case(self):
         self.in_shape = (6, 4, 2, 3)
-        self.dims = [0, 1]
+        self.axis = [0, 1]
 
     def calc_ref_res(self):
         res = self.inputs['X']
-        for axis in self.dims:
+        for axis in self.axis:
             res = np.flip(res, axis)
         return res
 
@@ -90,25 +90,37 @@ class TestFlipOp(OpTest):
 class TestFlipOpAxis1(TestFlipOp):
     def init_test_case(self):
         self.in_shape = (2, 4, 4)
-        self.dims = [0]
+        self.axis = [0]
 
 
 class TestFlipOpAxis2(TestFlipOp):
     def init_test_case(self):
         self.in_shape = (4, 4, 6, 3)
-        self.dims = [0, 2]
+        self.axis = [0, 2]
 
 
 class TestFlipOpAxis3(TestFlipOp):
     def init_test_case(self):
         self.in_shape = (4, 3, 1)
-        self.dims = [0, 1, 2]
+        self.axis = [0, 1, 2]
 
 
 class TestFlipOpAxis4(TestFlipOp):
     def init_test_case(self):
         self.in_shape = (6, 4, 2, 2)
-        self.dims = [0, 1, 2, 3]
+        self.axis = [0, 1, 2, 3]
+
+
+class TestFlipOpEmptyAxis(TestFlipOp):
+    def init_test_case(self):
+        self.in_shape = (6, 4, 2, 2)
+        self.axis = []
+
+
+class TestFlipOpNegAxis(TestFlipOp):
+    def init_test_case(self):
+        self.in_shape = (6, 4, 2, 2)
+        self.axis = [-1]
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_fs_interface.py b/python/paddle/fluid/tests/unittests/test_fs_interface.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d87b94538f05d734cb3e621fc0dfc7c48e8fea2
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fs_interface.py
@@ -0,0 +1,55 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle.fluid as fluid
+import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet, TrainStatus
+import os
+import sys
+import inspect
+
+from paddle.fluid.incubate.fleet.utils.fs import LocalFS, FS
+from paddle.fluid.incubate.fleet.utils.hdfs import HDFSClient
+from paddle.fluid.incubate.fleet.utils.hdfs import FSTimeOut, FSFileExistsError, FSFileNotExistsError
+
+
+class FSTest(unittest.TestCase):
+    def _test_method(self, func):
+        if sys.version_info[0] <= 2:
+            args = inspect.getargspec(func).args
+        else:
+            args = inspect.getfullargspec(func).args
+
+        a = None
+        try:
+            if len(args) == 1:
+                func()
+            elif len(args) == 2:
+                func(a)
+            elif len(args) == 3:
+                func(a, a)
+            print("args:", args, len(args), "func:", func)
+            self.assertFalse(True)
+        except NotImplementedError as e:
+            pass
+
+    def test(self):
+        fs = FS()
+        for name, func in inspect.getmembers(fs, predicate=inspect.ismethod):
+            self._test_method(func)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_full_like_op.py b/python/paddle/fluid/tests/unittests/test_full_like_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..21cbab193419be9413c487c8631671097016d959
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_full_like_op.py
@@ -0,0 +1,91 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import paddle
+import paddle.fluid.core as core
+from paddle import Program, program_guard
+import paddle.compat as cpt
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestFullOp(unittest.TestCase):
+    """ Test fill_any_like op(whose API is full_like) for attr out. """
+
+    def test_attr_tensor_API(self):
+        startup_program = Program()
+        train_program = Program()
+        with program_guard(train_program, startup_program):
+            fill_value = 2.0
+            input = paddle.data(name='input', dtype='float32', shape=[2, 3])
+            output = paddle.full_like(input, fill_value)
+            output_dtype = paddle.full_like(input, fill_value, dtype='float32')
+
+            place = paddle.CPUPlace()
+            if core.is_compiled_with_cuda():
+                place = paddle.CUDAPlace(0)
+            exe = paddle.Executor(place)
+            exe.run(startup_program)
+
+            img = np.array([[1, 2, 3], [4, 5, 6]]).astype(np.float32)
+
+            res = exe.run(train_program,
+                          feed={'input': img},
+                          fetch_list=[output])
+
+            out_np = np.array(res[0])
+            self.assertTrue(
+                not (out_np - np.full_like(img, fill_value)).any(),
+                msg="full_like output is wrong, out = " + str(out_np))
+
+    def test_full_like_imperative(self):
+        with paddle.imperative.guard():
+            input = paddle.arange(6, 10, dtype='float32')
+            out = paddle.full_like(input, fill_value=888.88, dtype='float32')
+            out_numpy = np.random.random((4)).astype("float32")
+            out_numpy.fill(888.88)
+            self.assertTrue((out.numpy() == out_numpy).all(), True)
+
+
+class TestFullOpError(unittest.TestCase):
+    def test_errors(self):
+        with program_guard(Program(), Program()):
+            #for ci coverage
+
+            input_data = paddle.data(
+                name='input', dtype='float32', shape=[2, 3])
+            output = paddle.full_like(input_data, 2.0)
+
+            def test_input_dtype():
+                paddle.full_like
+
+            self.assertRaises(
+                TypeError,
+                paddle.full_like,
+                x=input_data,
+                fill_value=2,
+                dtype='uint4')
+            self.assertRaises(
+                TypeError,
+                paddle.full_like,
+                x=input_data,
+                fill_value=2,
+                dtype='int16')
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_full_op.py b/python/paddle/fluid/tests/unittests/test_full_op.py
index 29d5be1ea428cc8608192ef682a5b6f90f43938d..01761b661e47a94bab7c67c376b39c308d635391 100644
--- a/python/paddle/fluid/tests/unittests/test_full_op.py
+++ b/python/paddle/fluid/tests/unittests/test_full_op.py
@@ -37,33 +37,19 @@ class TestFullAPI(unittest.TestCase):
         shape_tensor_int64 = fluid.data(
             name="shape_tensor_int64", shape=[2], dtype="int64")
 
-        out_1 = paddle.full(
-            shape=[1, 2], dtype="float32", fill_value=1.1, device='gpu')
+        out_1 = paddle.full(shape=[1, 2], dtype="float32", fill_value=1.1)
 
         out_2 = paddle.full(
-            shape=[1, positive_2_int32],
-            dtype="float32",
-            fill_value=1.1,
-            device='cpu')
+            shape=[1, positive_2_int32], dtype="float32", fill_value=1.1)
 
         out_3 = paddle.full(
-            shape=[1, positive_2_int64],
-            dtype="float32",
-            fill_value=1.1,
-            device='gpu')
+            shape=[1, positive_2_int64], dtype="float32", fill_value=1.1)
 
         out_4 = paddle.full(
-            shape=shape_tensor_int32,
-            dtype="float32",
-            fill_value=1.2,
-            out=out_3)
+            shape=shape_tensor_int32, dtype="float32", fill_value=1.2)
 
         out_5 = paddle.full(
-            shape=shape_tensor_int64,
-            dtype="float32",
-            fill_value=1.1,
-            device='gpu',
-            stop_gradient=False)
+            shape=shape_tensor_int64, dtype="float32", fill_value=1.1)
 
         out_6 = paddle.full(
             shape=shape_tensor_int64, dtype=np.float32, fill_value=1.1)
@@ -83,7 +69,7 @@ class TestFullAPI(unittest.TestCase):
 
         assert np.array_equal(res_1, np.full([1, 2], 1.1, dtype="float32"))
         assert np.array_equal(res_2, np.full([1, 2], 1.1, dtype="float32"))
-        assert np.array_equal(res_3, np.full([1, 2], 1.2, dtype="float32"))
+        assert np.array_equal(res_3, np.full([1, 2], 1.1, dtype="float32"))
         assert np.array_equal(res_4, np.full([1, 2], 1.2, dtype="float32"))
         assert np.array_equal(res_5, np.full([1, 2], 1.1, dtype="float32"))
         assert np.array_equal(res_6, np.full([1, 2], 1.1, dtype="float32"))
@@ -94,28 +80,11 @@ class TestFullOpError(unittest.TestCase):
     def test_errors(self):
         with program_guard(Program(), Program()):
             #for ci coverage
-            x1 = fluid.layers.data(name='x1', shape=[1], dtype="int16")
-            x2 = np.random.randn(1, 2).astype('int32')
             self.assertRaises(
                 ValueError, paddle.full, shape=[1], fill_value=5, dtype='uint4')
-            self.assertRaises(
-                TypeError,
-                paddle.full,
-                shape=[1],
-                fill_value=5,
-                dtype='int32',
-                out=x2)
-            self.assertRaises(
-                TypeError,
-                paddle.full,
-                shape=[1],
-                fill_value=5,
-                dtype='int16',
-                out=x1)
 
             # The argument dtype of full must be one of bool, float16,
             #float32, float64, int32 or int64
-            x2 = fluid.layers.data(name='x2', shape=[1], dtype="int32")
 
             self.assertRaises(
                 TypeError, paddle.full, shape=[1], fill_value=5, dtype='uint8')
diff --git a/python/paddle/fluid/tests/unittests/test_hdfs.py b/python/paddle/fluid/tests/unittests/test_hdfs.py
new file mode 100644
index 0000000000000000000000000000000000000000..9826542cee3732a48e1c6b6959afb74063bb09d7
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_hdfs.py
@@ -0,0 +1,225 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle.fluid as fluid
+import paddle.fluid.incubate.fleet.base.role_maker as role_maker
+from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet, TrainStatus
+import os
+import sys
+
+from paddle.fluid.incubate.fleet.utils.fs import LocalFS
+from paddle.fluid.incubate.fleet.utils.hdfs import HDFSClient
+from paddle.fluid.incubate.fleet.utils.hdfs import FSTimeOut, FSFileExistsError, FSFileNotExistsError
+
+java_home = os.environ["JAVA_HOME"]
+
+
+class FSTest(unittest.TestCase):
+    def _test_dirs(self, fs):
+        dir_path = os.path.abspath("./test_dir")
+        fs.delete(dir_path)
+        self.assertTrue(not fs.is_exist(dir_path))
+
+        fs.mkdirs(dir_path)
+        self.assertTrue(fs.is_exist(dir_path))
+        self.assertTrue(not fs.is_file(dir_path))
+        self.assertTrue(fs.is_dir(dir_path))
+
+        new_dir_path = os.path.abspath("./new_test_dir")
+        fs.delete(new_dir_path)
+        try:
+            fs.mv(new_dir_path, dir_path)
+            self.assertFalse(True)
+        except FSFileNotExistsError as e:
+            pass
+
+        fs.mv(dir_path, new_dir_path)
+        self.assertTrue(fs.is_exist(new_dir_path))
+
+        fs.mv(new_dir_path, dir_path)
+        self.assertTrue(fs.is_exist(dir_path))
+        try:
+            fs.mv(dir_path, dir_path)
+            self.assertFalse(True)
+        except FSFileExistsError as e:
+            pass
+
+        fs.delete(dir_path)
+        self.assertTrue(not fs.is_exist(dir_path))
+
+    def _test_touch_file(self, fs):
+        file_path = os.path.abspath("./test_file")
+
+        fs.delete(file_path)
+        self.assertTrue(not fs.is_exist(file_path))
+
+        fs.touch(file_path)
+        self.assertTrue(fs.is_exist(file_path))
+        self.assertTrue(not fs.is_dir(file_path) and fs.is_file(file_path))
+
+        new_file_path = os.path.abspath("./new_test_file")
+        fs.mv(file_path, new_file_path)
+        self.assertTrue(fs.is_exist(new_file_path))
+
+        fs.mv(new_file_path, file_path)
+        self.assertTrue(fs.is_exist(file_path))
+
+        fs.delete(file_path)
+        self.assertTrue(not fs.is_exist(file_path))
+
+    def _test_upload(self, fs):
+        src_file = os.path.abspath("./test_upload.src")
+        dst_file = os.path.abspath("./test_uolpad.dst")
+
+        try:
+            fs.upload(src_file, dst_file)
+            self.assertFalse(True)
+        except FSFileNotExistsError as e:
+            pass
+
+        local = LocalFS()
+        local.touch(src_file)
+        fs.delete(dst_file)
+
+        assert fs.need_upload_download()
+
+        fs.upload(src_file, dst_file)
+        try:
+            fs.upload(src_file, dst_file)
+            self.assertFalse(True)
+        except FSFileExistsError as e:
+            pass
+
+        self.assertTrue(fs.is_exist(dst_file))
+        fs.delete(dst_file)
+        fs.delete(src_file)
+
+    def _test_download(self, fs):
+        src_file = os.path.abspath("./test_download.src")
+        dst_file = os.path.abspath("./test_download.dst")
+        fs.delete(dst_file)
+        fs.delete(src_file)
+
+        try:
+            fs.download(src_file, dst_file)
+            self.assertFalse(True)
+        except FSFileNotExistsError as e:
+            pass
+
+        local = LocalFS()
+        local.touch(src_file)
+        fs.delete(dst_file)
+
+        assert fs.need_upload_download()
+
+        fs.download(src_file, dst_file)
+        try:
+            fs.download(src_file, dst_file)
+            self.assertFalse(True)
+        except FSFileExistsError as e:
+            pass
+
+        self.assertTrue(fs.is_exist(dst_file))
+        fs.delete(dst_file)
+        fs.delete(src_file)
+
+    def _test_mkdirs(self, fs):
+        dir_name = "./test_mkdir"
+        fs.mkdirs(dir_name)
+        fs.mkdirs(dir_name)
+
+    def test_exists(self):
+        fs = HDFSClient("/usr/local/hadoop-2.7.7/", None, time_out=15 * 1000)
+        self.assertFalse(fs.is_exist(os.path.abspath("./xxxx")))
+        self.assertFalse(fs.is_dir(os.path.abspath("./xxxx")))
+        self.assertTrue(fs.is_dir(os.path.abspath("./xxx/..")))
+        dirs, files = fs.ls_dir(os.path.abspath("./test_hdfs.py"))
+        self.assertTrue(dirs == [])
+        self.assertTrue(len(files) == 1)
+        dirs, files = fs.ls_dir(os.path.abspath("./xxx/.."))
+
+    def test_hdfs(self):
+        fs = HDFSClient("/usr/local/hadoop-2.7.7/", None, time_out=15 * 1000)
+        self._test_dirs(fs)
+        self._test_upload(fs)
+
+        self._test_download(fs)
+        self._test_mkdirs(fs)
+        self._test_list_dir(fs)
+
+    def test_local(self):
+        fs = LocalFS()
+        self._test_dirs(fs)
+        self._test_touch_file(fs)
+        self._test_mkdirs(fs)
+        self._test_list_dir(fs)
+
+    def test_timeout(self):
+        fs = HDFSClient(
+            "/usr/local/hadoop-2.7.7/",
+            None,
+            time_out=6 * 1000,
+            sleep_inter=2000)
+        src = "hdfs_test_timeout"
+        dst = "new_hdfs_test_timeout"
+        fs.delete(dst)
+        fs.mkdirs(src)
+        fs.mkdirs(dst)
+        fs.mkdirs(dst + "/" + src)
+        output = ""
+        try:
+            fs.mv(src, dst, test_exists=False)
+            self.assertFalse(1, "can't execute cmd:{} output:{}".format(cmd,
+                                                                        output))
+        except FSTimeOut as e:
+            print("execute mv {} to {} timeout".format(src, dst))
+
+        cmd = "{} -mv {} {}".format(fs._base_cmd, src, dst)
+        ret, output = fluid.core.shell_execute_cmd(cmd, 6 * 1000, 2 * 1000)
+        self.assertNotEqual(ret, 0)
+        print("second mv ret:{} output:{}".format(ret, output))
+
+    def test_is_dir(self):
+        fs = HDFSClient("/usr/local/hadoop-2.7.7/", None, time_out=15 * 1000)
+        self.assertFalse(fs.is_dir("./test_hdfs.py"))
+        s = """
+java.io.IOException: Input/output error
+ responseErrorMsg : failed to getFileStatus, errorCode: 3, path: /user/PUBLIC_KM_Data/wangxi16/data/serving_model, lparam: d868f6bb6822c621, errorMessage: inner error
+	at org.apache.hadoop.util.FileSystemUtil.throwException(FileSystemUtil.java:164)
+	at org.apache.hadoop.util.FileSystemUtil.dealWithResponse(FileSystemUtil.java:118)
+	at org.apache.hadoop.lite.client.LiteClientImpl.getFileStatus(LiteClientImpl.java:696)
+	at org.apache.hadoop.fs.LibDFileSystemImpl.getFileStatus(LibDFileSystemImpl.java:297)
+	at org.apache.hadoop.fs.LiteFileSystem.getFileStatus(LiteFileSystem.java:514)
+	at org.apache.hadoop.fs.FsShell.test(FsShell.java:1092)
+	at org.apache.hadoop.fs.FsShell.run(FsShell.java:2285)
+	at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:65)
+	at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:79)
+	at org.apache.hadoop.fs.FsShell.main(FsShell.java:2353)
+        """
+
+        print("split lines:", s.splitlines())
+        self.assertTrue(fs._test_match(s.splitlines()) != None)
+
+    def test_config(self):
+        config = {"fs.default.name": "hdfs://xxx", "hadoop.job.ugi": "ugi"}
+        fs = HDFSClient("/usr/local/hadoop-2.7.7/", config, time_out=15 * 1000)
+
+    def _test_list_dir(self, fs):
+        fs = HDFSClient("/usr/local/hadoop-2.7.7/", None, time_out=15 * 1000)
+        fs.ls_dir("test_not_exists")
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
index 421a6c695364dc467d9faa7355c4ba51e8d61d7b..5777bb3c6f5e34f035c32ed963906b5ccc03ba85 100644
--- a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
+++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
@@ -70,9 +70,9 @@ def hsigmoid(x, w, label, bias, num_classes):
     batch_size = x.shape[0]
     code_length = find_latest_set(num_classes - 1)
     code_table = [0 for _ in range(code_length)]
-    pre_output = np.zeros((batch_size, code_length))
-    pre_sum = np.zeros((batch_size, 1))
-    out = np.zeros((batch_size, 1))
+    pre_output = np.zeros((batch_size, code_length)).astype('float64')
+    pre_sum = np.zeros((batch_size, 1)).astype('float64')
+    out = np.zeros((batch_size, 1)).astype('float64')
     for i in range(batch_size):
         code_table = CodeTable(num_classes, label[i])
         length = code_table.get_length()
@@ -105,9 +105,9 @@ def hsigmoid(x, w, label, bias, num_classes):
 
 def hsigmoid_grad(x, w, label, bias, num_classes):
     batch_size = x.shape[0]
-    dx = np.zeros(x.shape)
-    dw = np.zeros(w.shape)
-    db = np.zeros(bias.shape)
+    dx = np.zeros(x.shape).astype('float64')
+    dw = np.zeros(w.shape).astype('float64')
+    db = np.zeros(bias.shape).astype('float64')
     for i in range(batch_size):
         code_table = CodeTable(num_classes, label[i])
         length = code_table.get_length()
@@ -133,9 +133,9 @@ def hsigmoidWithCustomTree(x, w, path_table, path_code, label, bias,
     code_length = len(path_table[0])
     code_table = [0 for _ in range(code_length)]
     # init pre_out with shape [N, code_length]
-    pre_output = np.zeros((batch_size, code_length))
-    pre_sum = np.zeros((batch_size, 1))
-    out = np.zeros((batch_size, 1))
+    pre_output = np.zeros((batch_size, code_length)).astype('float64')
+    pre_sum = np.zeros((batch_size, 1)).astype('float64')
+    out = np.zeros((batch_size, 1)).astype('float64')
     if isinstance(bias, np.ndarray):
         for i in range(batch_size):
             code_table = CodeTableWithCustomTree(path_table, path_code, i)
@@ -173,10 +173,13 @@ class TestHSigmoidOp(OpTest):
         num_classes = 101
         feature_size = 5
         batch_size = 20
-        x = np.random.uniform(-1, 1, (batch_size, feature_size))
-        w = np.random.uniform(-1, 1, (num_classes - 1, feature_size))
-        label = np.random.randint(0, num_classes, (batch_size, 1))
-        bias = np.random.uniform(-1, 1, (num_classes - 1, 1))
+        x = np.random.uniform(-1, 1,
+                              (batch_size, feature_size)).astype('float64')
+        w = np.random.uniform(-1, 1,
+                              (num_classes - 1, feature_size)).astype('float64')
+        label = np.random.randint(0, num_classes,
+                                  (batch_size, 1)).astype('int64')
+        bias = np.random.uniform(-1, 1, (num_classes - 1, 1)).astype('float64')
         self.attrs = {'num_classes': num_classes, 'is_sparse': False}
         self.inputs = {'X': x, 'W': w, 'Label': label, 'Bias': bias}
         pre_output, out = hsigmoid(x, w, label, bias, num_classes)
@@ -189,7 +192,6 @@ class TestHSigmoidOp(OpTest):
     def test_check_grad(self):
         self.check_grad(
             ['X', 'W', 'Bias'], ['Out'], user_defined_grads=self.user_grads)
-        #self.check_grad(['X', 'W', 'Bias'], ['Out'])
 
 
 @skip_check_grad_ci(
@@ -203,13 +205,15 @@ class TestHSigmoidOpSparse(OpTest):
         batch_size = 4
         x = np.random.random((batch_size, feature_size))
         w = np.random.random((num_classes - 1, feature_size))
-        label = np.array([0, 1, 4, 5])
-        path_table = np.array(
-            [(0, 2, -1, -1, -1), (0, 1, 3, -1, -1), (0, 1, 4, -1, -1),
-             (0, 2, -1, -1,
-              -1)])  #np.array to store 1,2,5,6s' non-leaf path(root -> leaf)
-        path_code = np.array([(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (
-            1, 0, 0, -1, -1), (0, 1, -1, -1, -1)])  #np.array to store 
+        label = np.array([0, 1, 4, 5]).astype('int64')
+        path_table = np.array([
+            (0, 2, -1, -1, -1), (0, 1, 3, -1, -1), (0, 1, 4, -1, -1), (0, 2, -1,
+                                                                       -1, -1)
+        ]).astype(
+            'int64')  #np.array to store 1,2,5,6s' non-leaf path(root -> leaf)
+        path_code = np.array(
+            [(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (1, 0, 0, -1, -1),
+             (0, 1, -1, -1, -1)]).astype('int64')  #np.array to store 
         bias = np.random.random((num_classes - 1, 1))
         self.attrs = {'num_classes': num_classes, 'is_sparse': True}
         self.inputs = {
@@ -265,9 +269,9 @@ class TestHSigmoidOpWithSparseGrad(unittest.TestCase):
             start_up = fluid.default_startup_program()
             start_up.random_seed = 1  # Fix random seed
             x = np.arange(6).reshape(6)
-            path_table = np.array([(1, 2, -1), (1, 2, -1)])
-            path_code = np.array([(1, 0, -1), (0, 0, -1)])
-            label = np.array([1, 4])
+            path_table = np.array([(1, 2, -1), (1, 2, -1)]).astype('int64')
+            path_code = np.array([(1, 0, -1), (0, 0, -1)]).astype('int64')
+            label = np.array([1, 4]).astype('int64')
 
             loss, data_list = self.hs_net_conf(is_sparse)
             optimizer = fluid.optimizer.SGD(learning_rate=1e-3)
@@ -307,13 +311,15 @@ class TestHSigmoidOpWithCostumTree(OpTest):
         batch_size = 4
         x = np.random.uniform(-1, 1, (batch_size, feature_size))
         w = np.random.uniform(-1, 1, (num_classes - 1, feature_size))
-        label = np.array([0, 1, 4, 5])
-        path_table = np.array(
-            [(0, 2, -1, -1, -1), (0, 1, 3, -1, -1), (0, 1, 4, -1, -1),
-             (0, 2, -1, -1,
-              -1)])  #np.array to store 1,2,5,6s' non-leaf path(root -> leaf)
-        path_code = np.array([(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (
-            1, 0, 0, -1, -1), (0, 1, -1, -1, -1)])  #np.array to store 
+        label = np.array([0, 1, 4, 5]).astype('int64')
+        path_table = np.array([
+            (0, 2, -1, -1, -1), (0, 1, 3, -1, -1), (0, 1, 4, -1, -1), (0, 2, -1,
+                                                                       -1, -1)
+        ]).astype(
+            'int64')  #np.array to store 1,2,5,6s' non-leaf path(root -> leaf)
+        path_code = np.array(
+            [(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (1, 0, 0, -1, -1),
+             (0, 1, -1, -1, -1)]).astype('int64')  #np.array to store 
         bias = np.random.random((num_classes - 1, 1))
         self.attrs = {'num_classes': num_classes, 'is_sparse': False}
         self.inputs = {
@@ -346,13 +352,15 @@ class TestHSigmoidOpWithCostumTreeWithoutBias(OpTest):
         batch_size = 4
         x = np.random.uniform(-1, 1, (batch_size, feature_size))
         w = np.random.uniform(-1, 1, (num_classes - 1, feature_size))
-        label = np.array([0, 1, 4, 5])
-        path_table = np.array(
-            [(0, 2, -1, -1, -1), (0, 1, 3, -1, -1), (0, 1, 4, -1, -1),
-             (0, 2, -1, -1,
-              -1)])  #np.array to store 1,2,5,6s' non-leaf path(root -> leaf)
-        path_code = np.array([(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (
-            1, 0, 0, -1, -1), (0, 1, -1, -1, -1)])  #np.array to store 
+        label = np.array([0, 1, 4, 5]).astype('int64')
+        path_table = np.array([
+            (0, 2, -1, -1, -1), (0, 1, 3, -1, -1), (0, 1, 4, -1, -1), (0, 2, -1,
+                                                                       -1, -1)
+        ]).astype(
+            'int64')  #np.array to store 1,2,5,6s' non-leaf path(root -> leaf)
+        path_code = np.array(
+            [(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (1, 0, 0, -1, -1),
+             (0, 1, -1, -1, -1)]).astype('int64')  #np.array to store 
         # bias = np.random.random((num_classes - 1, 1)).astype("float32")
         self.attrs = {'num_classes': num_classes, 'is_sparse': False}
         self.inputs = {
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_framework.py b/python/paddle/fluid/tests/unittests/test_imperative_framework.py
index 78ad00fb9a76d2ccec8b2a42fb76aac0ac56365a..68628918391cb3a152cf52b0b49dab8235e46756 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_framework.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_framework.py
@@ -62,5 +62,5 @@ class TestDygraphFramework(unittest.TestCase):
     def test_dygraph_to_string(self):
         np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
         with fluid.dygraph.guard():
-            var_inp = fluid.dygraph.base.to_variable(np_inp)
-            var_inp.to_string(throw_on_error=True)
+            var_inp = fluid.dygraph.to_variable(np_inp)
+            print(str(var_inp))
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_layer_trainable.py b/python/paddle/fluid/tests/unittests/test_imperative_layer_trainable.py
new file mode 100644
index 0000000000000000000000000000000000000000..909b1be0f7f27e56bd1284b8b6af60fff8c6720c
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_imperative_layer_trainable.py
@@ -0,0 +1,47 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle.fluid as fluid
+import numpy as np
+
+import paddle.fluid.dygraph as dygraph
+
+
+class TestImperativeLayerTrainable(unittest.TestCase):
+    def test_set_trainable(self):
+        with fluid.dygraph.guard():
+            label = np.random.uniform(-1, 1, [10, 10]).astype(np.float32)
+
+            label = dygraph.to_variable(label)
+
+            linear = dygraph.Linear(10, 10)
+            y = linear(label)
+            self.assertTrue(y.stop_gradient == False)
+
+            linear.weight.trainable = False
+            linear.bias.trainable = False
+
+            self.assertTrue(linear.weight.trainable == False)
+            self.assertTrue(linear.weight.stop_gradient == True)
+
+            y = linear(label)
+            self.assertTrue(y.stop_gradient == True)
+
+            with self.assertRaises(ValueError):
+                linear.weight.trainable = "1"
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py b/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py
index 477b3be76f35ff98c36db0a01d00a518c90905bc..69fd7d80327f1a666870dc76e041449366565b01 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py
@@ -186,7 +186,8 @@ class TestDygraphSimpleNet(unittest.TestCase):
                                     k - 1]] = out[k]
 
                 self.assertTrue(
-                    np.array_equal(static_loss_value, dy_loss_value))
+                    np.allclose(
+                        static_loss_value, dy_loss_value, rtol=1e-3))
                 for key, value in six.iteritems(static_param_init):
                     self.assertTrue(np.array_equal(value, dy_param_init[key]))
                 for key, value in six.iteritems(static_param_updated):
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_named_members.py b/python/paddle/fluid/tests/unittests/test_imperative_named_members.py
index c7151463d15f3aefab60d100ae9c5e37845f46ea..721453c51242198230edf306b1fdf14ce857ff93 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_named_members.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_named_members.py
@@ -79,6 +79,41 @@ class TestImperativeNamedParameters(unittest.TestCase):
 
             self.assertListEqual(expected_named_parameters, named_parameters)
 
+    def test_dir_layer(self):
+        with fluid.dygraph.guard():
+
+            class Mymodel(fluid.dygraph.Layer):
+                def __init__(self):
+                    super(Mymodel, self).__init__()
+                    self.linear1 = fluid.dygraph.Linear(10, 10)
+                    self.linear2 = fluid.dygraph.Linear(5, 5)
+                    self.conv2d = fluid.dygraph.Conv2D(3, 2, 3)
+                    self.embedding = fluid.dygraph.Embedding(size=[128, 16])
+                    self.h_0 = fluid.dygraph.to_variable(
+                        np.zeros([10, 10]).astype('float32'))
+                    self.weight = self.create_parameter(
+                        shape=[2, 3],
+                        attr=fluid.ParamAttr(),
+                        dtype="float32",
+                        is_bias=False)
+
+            model = Mymodel()
+
+            expected_members = dir(model)
+
+            self.assertTrue("linear1" in expected_members,
+                            "model should contain Layer: linear1")
+            self.assertTrue("linear2" in expected_members,
+                            "model should contain Layer: linear2")
+            self.assertTrue("conv2d" in expected_members,
+                            "model should contain Layer: conv2d")
+            self.assertTrue("embedding" in expected_members,
+                            "model should contain Layer: embedding")
+            self.assertTrue("h_0" in expected_members,
+                            "model should contain buffer: h_0")
+            self.assertTrue("weight" in expected_members,
+                            "model should contain parameter: weight")
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py
index 71d6c101d0051905112693abd61baec9e0bcfecc..a7783afc5cff3da97b623aec3297881013724a78 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py
@@ -18,6 +18,7 @@ import contextlib
 import unittest
 import numpy as np
 import six
+import itertools
 
 import paddle
 import paddle.fluid as fluid
@@ -428,6 +429,46 @@ class TestOptimizerLearningRate(unittest.TestCase):
 
                 self.assertTrue(np.allclose(lr, ret[i], rtol=1e-06, atol=0.0))
 
+    def test_set_lr(self):
+        with fluid.dygraph.guard():
+            a = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
+
+            linear = fluid.dygraph.nn.Linear(10, 10)
+
+            a = fluid.dygraph.to_variable(a)
+
+            b = linear(a)
+
+            loss = fluid.layers.reduce_mean(b)
+
+            adam = fluid.optimizer.Adam(0.1, parameter_list=linear.parameters())
+
+            lr_list = [0.2, 0.3, 0.4, 0.5, 0.6]
+            for i in range(5):
+                adam.set_lr(lr_list[i])
+                adam.minimize(loss)
+                lr = adam.current_step_lr()
+                self.assertTrue(
+                    np.allclose(
+                        lr, lr_list[i], rtol=1e-06, atol=0.0))
+
+            lr_var = fluid.layers.create_global_var(
+                shape=[1], value=0.7, dtype='float32')
+            adam.set_lr(lr_var)
+            adam.minimize(loss)
+            lr = adam.current_step_lr()
+            self.assertTrue(np.allclose(lr, 0.7, rtol=1e-06, atol=0.0))
+
+            with self.assertRaises(RuntimeError):
+                adam = fluid.optimizer.Adam(
+                    fluid.dygraph.NaturalExpDecay(
+                        learning_rate=0.1,
+                        decay_steps=3,
+                        decay_rate=0.5,
+                        staircase=True),
+                    parameter_list=linear.parameters())
+                adam.set_lr(0.01)
+
 
 class TestImperativeMomentumOptimizer(TestImperativeOptimizerBase):
     def get_optimizer_dygraph(self, parameter_list):
@@ -659,5 +700,30 @@ class TestImperativeRecomputeOptimizer(TestImperativeOptimizerBase):
         self._check_exception(exception_message)
 
 
+class TestImperativeOptimizerList(unittest.TestCase):
+    def test_parameter_list(self):
+        with fluid.dygraph.guard():
+            linear_1 = Linear(10, 10)
+            linear_2 = Linear(10, 10)
+
+            sgd = SGDOptimizer(
+                1.0,
+                parameter_list=itertools.chain(linear_1.parameters(),
+                                               linear_2.parameters()))
+
+            in_np = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
+            in_data = fluid.dygraph.to_variable(in_np)
+
+            y = linear_1(in_data)
+            y = linear_2(y)
+            loss = fluid.layers.reduce_mean(y)
+            loss.backward()
+            sgd.minimize(loss)
+
+            self.assertTrue(
+                len(sgd._parameter_list) ==
+                len(linear_1.parameters() + linear_2.parameters()))
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py
index 3dac9324e4eeb2073a4a61fddb02217969dafe50..927e51b56d727f92b75930eb0915fb5da8931f01 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py
@@ -277,8 +277,11 @@ class TestDygraphPtbRnn(unittest.TestCase):
             self.opti_dict = adam.state_dict()
             self.base_opti = {}
             for k, v in self.opti_dict.items():
-                self.base_opti[v.name] = v.numpy()
-                self.assertTrue(np.sum(np.abs(v.numpy())) != 0)
+                if isinstance(v, core.VarBase):
+                    self.base_opti[v.name] = v.numpy()
+                    self.assertTrue(np.sum(np.abs(v.numpy())) != 0)
+                else:
+                    self.base_opti[k] = v
 
             fluid.save_dygraph(self.opti_dict, "./test_dy")
 
@@ -360,11 +363,12 @@ class TestDygraphPtbRnn(unittest.TestCase):
             opti_dict = adam.state_dict()
             # set to zero
             for k, v in opti_dict.items():
-                np_t = v.numpy()
-                var = v.value().get_tensor()
-                var.set(np.zeros_like(np_t), place)
+                if isinstance(v, core.VarBase):
+                    np_t = v.numpy()
+                    var = v.value().get_tensor()
+                    var.set(np.zeros_like(np_t), place)
 
-                self.assertTrue(np.sum(np.abs(v.numpy())) == 0)
+                    self.assertTrue(np.sum(np.abs(v.numpy())) == 0)
 
             if isinstance(adam._learning_rate, LearningRateDecay):
                 adam._learning_rate.step_num = 0
@@ -375,8 +379,11 @@ class TestDygraphPtbRnn(unittest.TestCase):
 
             opti_dict = adam.state_dict()
             for k, v in opti_dict.items():
-                self.assertTrue(
-                    np.array_equal(v.numpy(), self.base_opti[v.name]))
+                if isinstance(v, core.VarBase):
+                    self.assertTrue(
+                        np.array_equal(v.numpy(), self.base_opti[v.name]))
+                else:
+                    self.assertEqual(v, self.base_opti[k])
 
             # check parameter
             state_dict = ptb_model.state_dict()
@@ -466,21 +473,24 @@ class TestDygraphPtbRnn(unittest.TestCase):
             opti_dict = adam.state_dict()
             # set to zero
             for k, v in opti_dict.items():
-                np_t = v.numpy()
-                var = v.value().get_tensor()
-                var.set(np.zeros_like(np_t), place)
+                if isinstance(v, core.VarBase):
+                    np_t = v.numpy()
+                    var = v.value().get_tensor()
+                    var.set(np.zeros_like(np_t), place)
 
-                self.assertTrue(np.sum(np.abs(v.numpy())) == 0)
+                    self.assertTrue(np.sum(np.abs(v.numpy())) == 0)
 
             if isinstance(adam._learning_rate, LearningRateDecay):
                 adam._learning_rate.step_num = 0
 
             adam.set_dict(self.opti_dict)
-
             opti_dict = adam.state_dict()
             for k, v in opti_dict.items():
-                self.assertTrue(
-                    np.array_equal(v.numpy(), self.base_opti[v.name]))
+                if isinstance(v, core.VarBase):
+                    self.assertTrue(
+                        np.array_equal(v.numpy(), self.base_opti[v.name]))
+                else:
+                    self.assertEqual(v, self.base_opti[k])
 
             # check parameter
             state_dict = ptb_model.state_dict()
@@ -571,12 +581,14 @@ class TestDygraphPtbRnn(unittest.TestCase):
             np_opti_dict = {}
             # set to zero
             for k, v in opti_dict.items():
-                np_t = v.numpy()
-                np_opti_dict[v.name] = np_t
-                var = v.value().get_tensor()
-                var.set(np.zeros_like(np_t), place)
-
-                self.assertTrue(np.sum(np.abs(v.numpy())) == 0)
+                if isinstance(v, core.VarBase):
+                    np_t = v.numpy()
+                    np_opti_dict[v.name] = np_t
+                    var = v.value().get_tensor()
+                    var.set(np.zeros_like(np_t), place)
+                    self.assertTrue(np.sum(np.abs(v.numpy())) == 0)
+                else:
+                    np_opti_dict[k] = v
 
             if isinstance(adam._learning_rate, LearningRateDecay):
                 adam._learning_rate.step_num = 0
@@ -585,8 +597,11 @@ class TestDygraphPtbRnn(unittest.TestCase):
 
             opti_dict = adam.state_dict()
             for k, v in opti_dict.items():
-                self.assertTrue(
-                    np.array_equal(v.numpy(), self.base_opti[v.name]))
+                if isinstance(v, core.VarBase):
+                    self.assertTrue(
+                        np.array_equal(v.numpy(), self.base_opti[v.name]))
+                else:
+                    self.assertEqual(v, self.base_opti[k])
 
             # check parameter
             state_dict = ptb_model.state_dict()
@@ -827,7 +842,10 @@ class TestDygraphPtbRnn(unittest.TestCase):
             np_state_dict = {}
 
             for k, v in self.opti_dict.items():
-                np_opti_dict[v.name] = v.numpy()
+                if isinstance(v, core.VarBase):
+                    np_opti_dict[v.name] = v.numpy()
+                else:
+                    np_opti_dict[k] = v
 
             for k, v in self.state_dict.items():
                 np_state_dict[k] = v.numpy()
diff --git a/python/paddle/fluid/tests/unittests/test_index_select_op.py b/python/paddle/fluid/tests/unittests/test_index_select_op.py
index 50d04c1a72378d4491b3570fd687a76f045b9551..e551989ed322db0a49e1e95338aeb5fb356a4951 100644
--- a/python/paddle/fluid/tests/unittests/test_index_select_op.py
+++ b/python/paddle/fluid/tests/unittests/test_index_select_op.py
@@ -83,7 +83,7 @@ class TestIndexSelectAPI(unittest.TestCase):
             x = fluid.layers.data(name='x', shape=[-1, 4])
             index = fluid.layers.data(
                 name='index', shape=[3], dtype='int32', append_batch_size=False)
-            z = paddle.index_select(x, index, dim=1)
+            z = paddle.index_select(x, index, axis=1)
             exe = fluid.Executor(fluid.CPUPlace())
             res, = exe.run(feed={'x': self.data_x,
                                  'index': self.data_index},
@@ -124,7 +124,7 @@ class TestIndexSelectAPI(unittest.TestCase):
         with fluid.dygraph.guard():
             x = fluid.dygraph.to_variable(self.data_x)
             index = fluid.dygraph.to_variable(self.data_index)
-            z = paddle.index_select(x, index, dim=1)
+            z = paddle.index_select(x, index, axis=1)
             np_z = z.numpy()
         expect_out = np.array([[1.0, 2.0, 2.0], [5.0, 6.0, 6.0],
                                [9.0, 10.0, 10.0]])
diff --git a/python/paddle/fluid/tests/unittests/test_instance_norm_op.py b/python/paddle/fluid/tests/unittests/test_instance_norm_op.py
index 39e994873dcbab941c35dae081d19e9cb24ee041..b7fcc63ca59b148807014132ca90becd02c5ae6d 100644
--- a/python/paddle/fluid/tests/unittests/test_instance_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_instance_norm_op.py
@@ -20,6 +20,7 @@ import paddle.fluid as fluid
 from paddle.fluid.op import Operator
 from op_test import OpTest
 from paddle.fluid import Program, program_guard
+from paddle.fluid.dygraph import to_variable
 
 
 def _reference_instance_norm_naive(x, scale, bias, epsilon, mean, var):
@@ -214,5 +215,63 @@ class TestInstanceNormOpError(unittest.TestCase):
             self.assertRaises(TypeError, fluid.layers.instance_norm, x2)
 
 
+class TestElasticNormOp(unittest.TestCase):
+    def init_test_case(self):
+        self.epsilon = 1e-5
+        self.places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda() and core.op_support_gpu(
+                "instance_norm"):
+            self.places.append(core.CUDAPlace(0))
+
+    def test_norm(self):
+        self.init_test_case()
+        inputs = np.random.random((2, 3, 5, 5)).astype(np.float32)
+        shape = inputs.shape
+        n, c, h, w = shape[0], shape[1], shape[2], shape[3]
+        scale_shape = [c]
+        mean_shape = [n * c]
+        scale = np.ones(scale_shape).astype(np.float32)
+        bias = np.zeros(scale_shape).astype(np.float32)
+        mean, variance = _cal_mean_variance(inputs, self.epsilon, mean_shape)
+        out_np, _, _ = _reference_instance_norm_naive(
+            inputs, scale, bias, self.epsilon, mean, variance)
+
+        for place in self.places:
+            with fluid.dygraph.guard(place):
+                instance_norm = fluid.dygraph.InstanceNorm(
+                    5, param_attr=False, bias_attr=False)
+                outputs = instance_norm(to_variable(inputs))
+                self.assertTrue(np.allclose(outputs.numpy(), out_np, atol=1e-6))
+
+
+class TestElasticNormOpCase2(unittest.TestCase):
+    def init_test_case(self):
+        self.epsilon = 1e-5
+        self.places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda() and core.op_support_gpu(
+                "instance_norm"):
+            self.places.append(core.CUDAPlace(0))
+
+    def test_norm(self):
+        self.init_test_case()
+        inputs = np.random.random((2, 3, 5, 5)).astype(np.float32)
+        shape = inputs.shape
+        n, c, h, w = shape[0], shape[1], shape[2], shape[3]
+        scale_shape = [c]
+        mean_shape = [n * c]
+        scale = np.ones(scale_shape).astype(np.float32)
+        bias = np.zeros(scale_shape).astype(np.float32)
+        mean, variance = _cal_mean_variance(inputs, self.epsilon, mean_shape)
+        out_np, _, _ = _reference_instance_norm_naive(
+            inputs, scale, bias, self.epsilon, mean, variance)
+
+        for place in self.places:
+            with fluid.dygraph.guard(place):
+                instance_norm = fluid.dygraph.InstanceNorm(
+                    3, param_attr=True, bias_attr=True)
+                outputs = instance_norm(to_variable(inputs))
+                self.assertTrue(np.allclose(outputs.numpy(), out_np, atol=1e-6))
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_jit_save_load.py b/python/paddle/fluid/tests/unittests/test_jit_save_load.py
new file mode 100644
index 0000000000000000000000000000000000000000..640e966354b44b733f67f71e11f79472c184a9ea
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_jit_save_load.py
@@ -0,0 +1,234 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.dygraph import Linear
+from paddle.fluid.dygraph import declarative
+
+BATCH_SIZE = 32
+BATCH_NUM = 20
+SEED = 10
+
+
+def random_batch_reader():
+    def _get_random_images_and_labels(image_shape, label_shape):
+        np.random.seed(SEED)
+        image = np.random.random(size=image_shape).astype('float32')
+        label = np.random.random(size=label_shape).astype('int64')
+        return image, label
+
+    def __reader__():
+        for _ in range(BATCH_NUM):
+            batch_image, batch_label = _get_random_images_and_labels(
+                [BATCH_SIZE, 784], [BATCH_SIZE, 1])
+            yield batch_image, batch_label
+
+    return __reader__
+
+
+class LinearNet(fluid.dygraph.Layer):
+    def __init__(self, in_size, out_size):
+        super(LinearNet, self).__init__()
+        self._linear = Linear(in_size, out_size)
+
+    @declarative
+    def forward(self, x):
+        return self._linear(x)
+
+
+class LinearNetNotDeclarative(fluid.dygraph.Layer):
+    def __init__(self, in_size, out_size):
+        super(LinearNetNotDeclarative, self).__init__()
+        self._linear = Linear(in_size, out_size)
+
+    def forward(self, x):
+        return self._linear(x)
+
+
+class LinearNetReturnLoss(fluid.dygraph.Layer):
+    def __init__(self, in_size, out_size):
+        super(LinearNetReturnLoss, self).__init__()
+        self._linear = Linear(in_size, out_size)
+
+    @declarative
+    def forward(self, x):
+        y = self._linear(x)
+        z = self._linear(y)
+        loss = fluid.layers.mean(z)
+        return z, loss
+
+
+def train(layer):
+    # create optimizer
+    adam = fluid.optimizer.AdamOptimizer(
+        learning_rate=0.1, parameter_list=layer.parameters())
+    # create data loader
+    train_loader = fluid.io.DataLoader.from_generator(capacity=5)
+    train_loader.set_batch_generator(random_batch_reader())
+    # train
+    for data in train_loader():
+        img, label = data
+        label.stop_gradient = True
+
+        cost = layer(img)
+
+        loss = fluid.layers.cross_entropy(cost, label)
+        avg_loss = fluid.layers.mean(loss)
+
+        avg_loss.backward()
+        adam.minimize(avg_loss)
+        layer.clear_gradients()
+    return [img], layer, avg_loss
+
+
+def infer(layer):
+    x = fluid.dygraph.to_variable(np.random.random((1, 784)).astype('float32'))
+    return layer(x)
+
+
+class TestJitSaveLoad(unittest.TestCase):
+    def setUp(self):
+        self.model_path = "model.test_jit_save_load"
+        # enable dygraph mode
+        fluid.enable_dygraph()
+        # config seed
+        fluid.default_main_program().random_seed = SEED
+
+    def train_and_save_model(self):
+        layer = LinearNet(784, 1)
+        example_inputs, layer, _ = train(layer)
+        fluid.dygraph.jit.save(
+            layer=layer, model_path=self.model_path, input_spec=example_inputs)
+        return layer
+
+    def test_save(self):
+        # train and save model
+        self.train_and_save_model()
+
+    def test_load_infernece(self):
+        # train and save model
+        train_layer = self.train_and_save_model()
+        # load model
+        infer_layer = fluid.dygraph.jit.load(self.model_path)
+        train_layer.eval()
+        # inference & compare
+        x = fluid.dygraph.to_variable(
+            np.random.random((1, 784)).astype('float32'))
+        self.assertTrue(
+            np.array_equal(train_layer(x).numpy(), infer_layer(x).numpy()))
+
+    def test_load_finetune(self):
+        # train and save model
+        train_layer = self.train_and_save_model()
+        # load model
+        load_train_layer = fluid.dygraph.jit.load(self.model_path)
+        load_train_layer.train()
+        # train & compare
+        _, _, train_loss = train(train_layer)
+        _, _, load_train_loss = train(load_train_layer)
+        self.assertTrue(
+            np.array_equal(train_loss.numpy(), load_train_loss.numpy()))
+
+    def test_save_get_program_failed(self):
+        layer = LinearNetNotDeclarative(784, 1)
+        example_inputs, layer, _ = train(layer)
+        with self.assertRaises(RuntimeError):
+            fluid.dygraph.jit.save(
+                layer=layer,
+                model_path=self.model_path,
+                input_spec=example_inputs)
+
+
+class TestJitSaveLoadConfig(unittest.TestCase):
+    def setUp(self):
+        # enable dygraph mode
+        fluid.enable_dygraph()
+        # config seed
+        fluid.default_main_program().random_seed = SEED
+
+    def basic_save_load(self, layer, model_path, configs):
+        # 1. train & save
+        example_inputs, train_layer, _ = train(layer)
+        fluid.dygraph.jit.save(
+            layer=train_layer,
+            model_path=model_path,
+            input_spec=example_inputs,
+            configs=configs)
+        # 2. load 
+        infer_layer = fluid.dygraph.jit.load(model_path, configs=configs)
+        train_layer.eval()
+        # 3. inference & compare
+        x = fluid.dygraph.to_variable(
+            np.random.random((1, 784)).astype('float32'))
+        self.assertTrue(
+            np.array_equal(train_layer(x).numpy(), infer_layer(x).numpy()))
+
+    def test_model_filename(self):
+        layer = LinearNet(784, 1)
+        model_path = "model.save_load_config.output_spec"
+        configs = fluid.dygraph.jit.SaveLoadConfig()
+        configs.model_filename = "__simplenet__"
+        self.basic_save_load(layer, model_path, configs)
+
+    def test_params_filename(self):
+        layer = LinearNet(784, 1)
+        model_path = "model.save_load_config.params_filename"
+        configs = fluid.dygraph.jit.SaveLoadConfig()
+        configs.params_filename = "__params__"
+        self.basic_save_load(layer, model_path, configs)
+
+    def test_separate_params(self):
+        layer = LinearNet(784, 1)
+        model_path = "model.save_load_config.separate_params"
+        configs = fluid.dygraph.jit.SaveLoadConfig()
+        configs.separate_params = True
+        self.basic_save_load(layer, model_path, configs)
+
+    def test_output_spec(self):
+        train_layer = LinearNetReturnLoss(8, 8)
+        adam = fluid.optimizer.AdamOptimizer(
+            learning_rate=0.1, parameter_list=train_layer.parameters())
+        x = fluid.dygraph.to_variable(
+            np.random.random((4, 8)).astype('float32'))
+        for i in range(10):
+            out, loss = train_layer(x)
+            loss.backward()
+            adam.minimize(loss)
+            train_layer.clear_gradients()
+
+        model_path = "model.save_load_config.output_spec"
+        configs = fluid.dygraph.jit.SaveLoadConfig()
+        configs.output_spec = [out]
+        fluid.dygraph.jit.save(
+            layer=train_layer,
+            model_path=model_path,
+            input_spec=[x],
+            configs=configs)
+
+        train_layer.eval()
+        infer_layer = fluid.dygraph.jit.load(model_path, configs=configs)
+        x = fluid.dygraph.to_variable(
+            np.random.random((4, 8)).astype('float32'))
+        self.assertTrue(
+            np.array_equal(train_layer(x)[0].numpy(), infer_layer(x).numpy()))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index ad091b7e5fee9cd868b2b911b7654d074238b585..a1ead2aef63f7b186ed2d5e8a6598349ae50509d 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -253,6 +253,38 @@ class TestLayer(LayerTest):
 
         self.assertTrue(np.allclose(static_ret, dy_ret_value))
 
+    def test_leakyrelu(self):
+        inputs = np.random.uniform(-1, 1, (10, 10)).astype('float32')
+        with self.static_graph():
+            t = layers.data(name='t', shape=[10, 10], dtype='float32')
+            ret = layers.leaky_relu(t, alpha=0.01)
+            static_ret = self.get_static_graph_result(
+                feed={'t': inputs}, fetch_list=[ret])[0]
+
+        with self.dynamic_graph():
+            lrelu = paddle.nn.LeakyReLU(alpha=0.01)
+            dy_ret = lrelu(base.to_variable(inputs))
+            dy_ret_value = dy_ret.numpy()
+
+        self.assertTrue(np.allclose(static_ret, dy_ret_value))
+
+    def test_pad2d(self):
+        with self.static_graph():
+            t = layers.data(name='t', shape=[-1, 3, 5, 5], dtype='float32')
+            ret = layers.pad2d(t, paddings=[1, 1, 1, 1])
+            static_ret = self.get_static_graph_result(
+                feed={'t': np.ones(
+                    [3, 3, 5, 5], dtype='float32')},
+                fetch_list=[ret])[0]
+
+        with self.dynamic_graph():
+            t = np.ones([3, 3, 5, 5], dtype='float32')
+            my_pad2d = paddle.nn.Pad2D(paddings=1)
+            dy_ret = my_pad2d(base.to_variable(t))
+            dy_ret_value = dy_ret.numpy()
+
+        self.assertTrue(np.allclose(static_ret, dy_ret_value))
+
     def test_matmul(self):
         with self.static_graph():
             t = layers.data(name='t', shape=[3, 3], dtype='float32')
@@ -2709,6 +2741,13 @@ class TestBook(LayerTest):
             out = layers.softsign(input, name='softsign')
             return (out)
 
+    def make_mish(self):
+        with program_guard(fluid.default_main_program(),
+                           fluid.default_startup_program()):
+            input = self._get_data(name="input", shape=[16], dtype="float32")
+            out = layers.mish(input, name='mish')
+            return (out)
+
     def make_cross_entropy(self):
         with program_guard(fluid.default_main_program(),
                            fluid.default_startup_program()):
diff --git a/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py b/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
index 8b66035c57ab3abf76bae3191b46de755aa5a3f9..71b452d4a2dd192c756599eb24949084bfa0860e 100644
--- a/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
+++ b/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
@@ -98,8 +98,128 @@ def noam_decay(global_step, d_model, warmup_steps, learning_rate=1.0):
     return decayed_lr
 
 
-class TestNoamLearningRateDecayDygraphMode(unittest.TestCase):
-    def test_dygraph_mode(self):
+def linear_lr_warmup(global_step, warmup_steps, start_lr, end_lr):
+    linear_step = end_lr - start_lr
+    decayed_lr = start_lr + linear_step * (global_step / warmup_steps)
+    return decayed_lr
+
+
+def multi_step_decay(global_step, learning_rate, milestones, decay_rate=0.1):
+    for i in range(len(milestones)):
+        if global_step < milestones[i]:
+            return learning_rate * math.pow(decay_rate, i)
+
+    return learning_rate * math.pow(decay_rate, len(milestones))
+
+
+def step_decay(global_step, learning_rate, step_size, decay_rate=0.1):
+    return learning_rate * math.pow(decay_rate, global_step // step_size)
+
+
+def lambda_decay(global_step, learning_rate, lr_lambda):
+    return learning_rate * lr_lambda(global_step)
+
+
+class TestLearningRateDecayDygraph(unittest.TestCase):
+    def test_LR_state_dict(self):
+        with fluid.dygraph.guard():
+            x = np.random.uniform(-1, 1, [3, 10]).astype("float32")
+            linear = fluid.dygraph.Linear(10, 10)
+            input = fluid.dygraph.to_variable(x)
+
+            Exponential_scheduler = fluid.dygraph.ExponentialDecay(
+                learning_rate=0.1,
+                decay_steps=10000,
+                decay_rate=0.5,
+                staircase=True)
+            Step_scheduler = fluid.dygraph.StepDecay(0.5, step_size=3)
+            Reducelr_scheduler = fluid.dygraph.ReduceLROnPlateau(
+                learning_rate=1.0, decay_rate=0.5, patience=5, cooldown=3)
+
+            adam1 = fluid.optimizer.Adam(
+                learning_rate=Exponential_scheduler,
+                parameter_list=linear.parameters())
+            adam2 = fluid.optimizer.Adam(
+                learning_rate=Step_scheduler,
+                parameter_list=linear.parameters())
+            adam3 = fluid.optimizer.Adam(
+                learning_rate=Reducelr_scheduler,
+                parameter_list=linear.parameters())
+            print(adam3.state_dict())
+
+            for epoch in range(10):
+                out = linear(input)
+                loss = fluid.layers.reduce_mean(out)
+                loss.backward()
+                adam1.minimize(loss)
+                adam2.minimize(loss)
+                adam3.minimize(loss)
+                linear.clear_gradients()
+
+                Step_scheduler.epoch()
+                Reducelr_scheduler.step(loss)
+
+            fluid.dygraph.save_dygraph(linear.state_dict(), "save_path")
+
+            Exponential_scheduler_test = fluid.dygraph.ExponentialDecay(
+                learning_rate=0.1,
+                decay_steps=10000,
+                decay_rate=0.5,
+                staircase=True)
+            Step_scheduler_test = fluid.dygraph.StepDecay(0.5, step_size=3)
+            Reducelr_scheduler_test = fluid.dygraph.ReduceLROnPlateau(
+                learning_rate=1.0, decay_rate=0.5, patience=5, cooldown=3)
+
+            fluid.dygraph.save_dygraph(adam1.state_dict(), "save_path")
+            _, opt_state = fluid.dygraph.load_dygraph("save_path")
+            adam_test = fluid.optimizer.Adam(
+                learning_rate=Exponential_scheduler_test,
+                parameter_list=linear.parameters())
+            adam_test.set_dict(opt_state)
+            self.assertEqual(adam_test._learning_rate.step_num,
+                             adam1._learning_rate.step_num,
+                             "epoch_num is different before and after set_dict")
+
+            fluid.dygraph.save_dygraph(adam2.state_dict(), "save_path")
+            _, opt_state = fluid.dygraph.load_dygraph("save_path")
+            adam_test = fluid.optimizer.Adam(
+                learning_rate=Step_scheduler_test,
+                parameter_list=linear.parameters())
+            adam_test.set_dict(opt_state)
+            self.assertEqual(adam_test._learning_rate.epoch_num,
+                             adam2._learning_rate.epoch_num,
+                             "epoch_num is different before and after set_dict")
+            self.assertEqual(
+                adam_test._learning_rate(),
+                adam2._learning_rate(),
+                "current learning rate is different before and after set_dict")
+
+            fluid.dygraph.save_dygraph(adam3.state_dict(), "save_path")
+            _, opt_state = fluid.dygraph.load_dygraph("save_path")
+            adam_test = fluid.optimizer.Adam(
+                learning_rate=Reducelr_scheduler_test,
+                parameter_list=linear.parameters())
+            adam_test.set_dict(opt_state)
+            self.assertEqual(adam_test._learning_rate.best_loss,
+                             adam3._learning_rate.best_loss.numpy()[0],
+                             "best_loss is different before and after set_dict")
+            self.assertEqual(
+                adam_test._learning_rate.cooldown_counter,
+                adam3._learning_rate.cooldown_counter,
+                "cooldown_counter is different before and after set_dict")
+            self.assertEqual(
+                adam_test._learning_rate.num_bad_epochs,
+                adam3._learning_rate.num_bad_epochs,
+                "num_bad_epochs is different before and after set_dict")
+            self.assertEqual(adam_test._learning_rate.epoch_num,
+                             adam3._learning_rate.epoch_num,
+                             "epoch is different before and after set_dict")
+            self.assertEqual(
+                adam_test._learning_rate(),
+                adam3._learning_rate(),
+                "current learning rate is different before and after set_dict")
+
+    def test_NoamDecay(self):
         with fluid.dygraph.guard():
             d_model = 0.01
             warmup_steps = 200
@@ -117,6 +237,115 @@ class TestNoamLearningRateDecayDygraphMode(unittest.TestCase):
                     msg='Failed lr scheduler in step {0}, Python result is {1}, Fluid result is {2}'.
                     format(step, right_result, fluid_result[0]))
 
+    def test_LinearLrWarmup(self):
+        with fluid.dygraph.guard():
+            lr = fluid.layers.polynomial_decay(
+                learning_rate=1.0,
+                decay_steps=10,
+                end_learning_rate=0.0,
+                power=1.0)
+            lr = fluid.layers.linear_lr_warmup(
+                learning_rate=lr, warmup_steps=2, start_lr=0.0, end_lr=1.0)
+
+            right_result = [0.5, 0.9, 0.8, 0.7, 0.6]
+            for i in range(5):
+
+                t = lr()
+
+                self.assertTrue(
+                    np.allclose((t.numpy())[0].item(), right_result[i]))
+
+            with self.assertRaises(TypeError):
+                lr = fluid.layers.linear_lr_warmup(
+                    learning_rate="fake_lr",
+                    warmup_steps=2,
+                    start_lr=0.0,
+                    end_lr=1.0)
+
+    def test_MultiStepDecay(self):
+        with fluid.dygraph.guard():
+            learning_rate = 0.5
+            milestones = [2, 4, 8]
+            decay_rate = 0.2
+            linear = fluid.dygraph.Linear(10, 10)
+
+            scheduler = fluid.dygraph.MultiStepDecay(learning_rate, milestones,
+                                                     decay_rate)
+
+            adam = fluid.optimizer.AdamOptimizer(
+                learning_rate=scheduler, parameter_list=linear.parameters())
+            for epoch in range(10):
+                right_result = multi_step_decay(epoch, learning_rate,
+                                                milestones, decay_rate)
+                fluid_result = adam.current_step_lr()
+                scheduler.epoch()
+                self.assertAlmostEqual(
+                    right_result,
+                    fluid_result,
+                    msg='Failed lr scheduler in epoch {0}, Python result is {1}, Fluid result is {2}'.
+                    format(epoch, right_result, fluid_result))
+
+            with self.assertRaises(ValueError):
+                lr = fluid.dygraph.MultiStepDecay(learning_rate, [30, 50, 20],
+                                                  0.1)
+
+            with self.assertRaises(ValueError):
+                lr = fluid.dygraph.MultiStepDecay(learning_rate, [20, 30, 50],
+                                                  1)
+
+            with self.assertRaises(TypeError):
+                lr = fluid.dygraph.MultiStepDecay("test", [20, 30, 50])
+
+            with self.assertRaises(ValueError):
+                lr = fluid.dygraph.MultiStepDecay(-1, [20, 30, 50])
+
+    def test_StepDecay(self):
+        with fluid.dygraph.guard():
+            learning_rate = 0.5
+            step_size = 3
+            decay_rate = 0.2
+            scheduler = fluid.dygraph.StepDecay(learning_rate, step_size,
+                                                decay_rate)
+            for epoch in range(10):
+                right_result = step_decay(epoch, learning_rate, step_size,
+                                          decay_rate)
+                fluid_result = scheduler().numpy()[0]
+                scheduler.epoch()
+                self.assertAlmostEqual(
+                    right_result,
+                    fluid_result,
+                    msg='Failed lr scheduler in epoch {0}, Python result is {1}, Fluid result is {2}'.
+                    format(epoch, right_result, fluid_result))
+
+            with self.assertRaises(TypeError):
+                lr = fluid.dygraph.StepDecay(learning_rate, "test", 0.1)
+
+            with self.assertRaises(ValueError):
+                lr = fluid.dygraph.StepDecay(learning_rate, 20, 2)
+
+    def test_LambdaDecay(self):
+        with fluid.dygraph.guard():
+            learning_rate = 0.5
+            lr_lambda = lambda x: 0.95**x
+            scheduler = fluid.dygraph.LambdaDecay(learning_rate, lr_lambda)
+
+            linear = fluid.dygraph.nn.Linear(10, 10)
+            adam = fluid.optimizer.Adam(
+                scheduler, parameter_list=linear.parameters())
+
+            for epoch in range(30):
+                right_result = lambda_decay(epoch, learning_rate, lr_lambda)
+                fluid_result = scheduler().numpy()[0]
+                scheduler.epoch()
+                self.assertAlmostEqual(
+                    right_result,
+                    fluid_result,
+                    msg='Failed lr scheduler in epoch {0}, Python result is {1}, Fluid result is {2}'.
+                    format(epoch, right_result, fluid_result))
+
+            with self.assertRaises(TypeError):
+                lr = fluid.dygraph.LambdaDecay(learning_rate, "test")
+
 
 class TestLearningRateDecay(unittest.TestCase):
     def check_decay(self, python_decay_fn, fluid_decay_fn, kwargs):
@@ -171,31 +400,26 @@ class TestLearningRateDecay(unittest.TestCase):
             (natural_exp_decay, layers.natural_exp_decay, common_kwargs_false),
             (inverse_time_decay, layers.inverse_time_decay, common_kwargs_true),
             (inverse_time_decay, layers.inverse_time_decay,
-             common_kwargs_false),
-            (polynomial_decay, layers.polynomial_decay, {
-                "learning_rate": 1.0,
-                "decay_steps": 5,
-                "cycle": True
-            }),
-            (polynomial_decay, layers.polynomial_decay, {
-                "learning_rate": 1.0,
-                "decay_steps": 5,
-                "cycle": False
-            }),
-            (piecewise_decay, layers.piecewise_decay, {
-                "boundaries": [3, 6, 9],
-                "values": [0.1, 0.2, 0.3, 0.4]
-            }),
-            (cosine_decay, layers.cosine_decay, {
-                "learning_rate": 0.1,
-                "step_each_epoch": 100,
-                "epochs": 120
-            }),
-            (noam_decay, layers.noam_decay, {
-                "d_model": 0.01,
-                "warmup_steps": 200,
-                "learning_rate": 2.0
-            }),
+             common_kwargs_false), (polynomial_decay, layers.polynomial_decay, {
+                 "learning_rate": 1.0,
+                 "decay_steps": 5,
+                 "cycle": True
+             }), (polynomial_decay, layers.polynomial_decay, {
+                 "learning_rate": 1.0,
+                 "decay_steps": 5,
+                 "cycle": False
+             }), (piecewise_decay, layers.piecewise_decay, {
+                 "boundaries": [3, 6, 9],
+                 "values": [0.1, 0.2, 0.3, 0.4]
+             }), (cosine_decay, layers.cosine_decay, {
+                 "learning_rate": 0.1,
+                 "step_each_epoch": 100,
+                 "epochs": 120
+             }), (noam_decay, layers.noam_decay, {
+                 "d_model": 0.01,
+                 "warmup_steps": 200,
+                 "learning_rate": 2.0
+             })
         ]
 
         for py_decay_fn, fluid_decay_fn, kwargs in decay_fns:
@@ -207,13 +431,7 @@ class TestLearningRateDecay(unittest.TestCase):
                 self.check_decay(py_decay_fn, fluid_decay_fn, kwargs)
 
 
-def linear_lr_warmup(global_step, warmup_steps, start_lr, end_lr):
-    linear_step = end_lr - start_lr
-    decayed_lr = start_lr + linear_step * (global_step / warmup_steps)
-    return decayed_lr
-
-
-class TestLinearWamrupLearningRateDecay(TestLearningRateDecay):
+class TestLinearWamrupLearningRateDecay(unittest.TestCase):
     def check_decay_with_place(self, place, python_decay_fn, fluid_decay_fn,
                                kwargs):
         main_prog = fluid.Program()
@@ -304,37 +522,6 @@ class TestLinearWamrupLearningRateDecayWithScalarInput(unittest.TestCase):
         run_places(lr, start_lr, end_lr)
 
 
-class TestLinearWamrupLearningRateDecayDygraphMode(unittest.TestCase):
-    def test_dygraph_mode(self):
-        with fluid.dygraph.guard():
-            lr = fluid.layers.polynomial_decay(
-                learning_rate=1.0,
-                decay_steps=10,
-                end_learning_rate=0.0,
-                power=1.0)
-            lr = fluid.layers.linear_lr_warmup(
-                learning_rate=lr, warmup_steps=2, start_lr=0.0, end_lr=1.0)
-
-            right_result = [0.5, 0.9, 0.8, 0.7, 0.6]
-            for i in range(5):
-
-                t = lr()
-
-                self.assertTrue(
-                    np.allclose((t.numpy())[0].item(), right_result[i]))
-
-
-class TestLinearWamrupLearningRateDecayDygraphModeTypeCheck(unittest.TestCase):
-    def test_dygraph_mode(self):
-        with fluid.dygraph.guard():
-            with self.assertRaises(TypeError):
-                lr = fluid.layers.linear_lr_warmup(
-                    learning_rate="fake_lr",
-                    warmup_steps=2,
-                    start_lr=0.0,
-                    end_lr=1.0)
-
-
 def reduce_lr_on_plateau(decay_rate, threshold, cooldown, patience, m, n, loss,
                          var_list):
     def is_better(current, best, m, n):
diff --git a/python/paddle/fluid/tests/unittests/test_linspace.py b/python/paddle/fluid/tests/unittests/test_linspace.py
index 7d034d224ddc8f4ac486d9792e0233c64901c1e2..c7bab1a135bc439eefa822087869e08a43de0c51 100644
--- a/python/paddle/fluid/tests/unittests/test_linspace.py
+++ b/python/paddle/fluid/tests/unittests/test_linspace.py
@@ -20,6 +20,7 @@ from op_test import OpTest
 import paddle
 import paddle.fluid as fluid
 from paddle.fluid import compiler, Program, program_guard
+from paddle.fluid import core
 
 
 class TestLinspaceOpCommonCase(OpTest):
@@ -71,33 +72,36 @@ class TestLinspaceOpNumOneCase(OpTest):
 
 
 class TestLinspaceAPI(unittest.TestCase):
-    def test_out(self):
-        with program_guard(fluid.Program()):
-            out_1 = fluid.data(name="out_1", shape=[5], dtype="float32")
-            out_2 = paddle.tensor.linspace(0, 10, 5, dtype='float32', out=out_1)
-            exe = fluid.Executor(place=fluid.CPUPlace())
-            ipt = {'out_1': np.random.random([5]).astype('float32')}
-            res_1, res_2 = exe.run(fluid.default_main_program(),
-                                   feed=ipt,
-                                   fetch_list=[out_1, out_2])
-            assert np.array_equal(res_1, res_2)
+    def test_dtype(self):
+        out_1 = paddle.linspace(0, 10, 5, dtype='float32')
+        out_2 = paddle.linspace(0, 10, 5, dtype=np.float32)
+        out_3 = paddle.linspace(0, 10, 5, dtype=core.VarDesc.VarType.FP32)
+        exe = fluid.Executor(place=fluid.CPUPlace())
+        res_1, res_2, res_3 = exe.run(fluid.default_main_program(),
+                                      fetch_list=[out_1, out_2, out_3])
+        assert np.array_equal(res_1, res_2)
 
     def test_name(self):
-        with fluid.program_guard(fluid.Program()):
+        with paddle.program_guard(paddle.Program()):
             out = paddle.linspace(
                 0, 10, 5, dtype='float32', name='linspace_res')
             assert 'linspace_res' in out.name
 
+    def test_imperative(self):
+        with paddle.imperative.guard():
+            out = paddle.linspace(0, 10, 5, dtype='float32')
+            np_out = np.linspace(0, 10, 5, dtype='float32')
+        self.assertEqual((out.numpy() == np_out).all(), True)
+
 
 class TestLinspaceOpError(unittest.TestCase):
     def test_errors(self):
         with program_guard(Program(), Program()):
-            # for ci coverage
-            # The device of fill_constant must be in 'cpu', 'gpu' or None
-            def test_device_value():
-                paddle.linspace(0, 10, 1, dtype="float32", device='xxxpu')
 
-            self.assertRaises(ValueError, test_device_value)
+            def test_dtype():
+                fluid.layers.linspace(0, 10, 1, dtype="int32")
+
+            self.assertRaises(TypeError, test_dtype)
 
             def test_start_type():
                 fluid.layers.linspace([0], 10, 1, dtype="float32")
diff --git a/python/paddle/fluid/tests/unittests/test_matrix_nms_op.py b/python/paddle/fluid/tests/unittests/test_matrix_nms_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..cf756ae8384486ea30ebc2fd25079484061ed380
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_matrix_nms_op.py
@@ -0,0 +1,307 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+import numpy as np
+import copy
+from op_test import OpTest
+import paddle.fluid as fluid
+from paddle.fluid import Program, program_guard
+
+
+def softmax(x):
+    # clip to shiftx, otherwise, when calc loss with
+    # log(exp(shiftx)), may get log(0)=INF
+    shiftx = (x - np.max(x)).clip(-64.)
+    exps = np.exp(shiftx)
+    return exps / np.sum(exps)
+
+
+def iou_matrix(a, b, norm=True):
+    tl_i = np.maximum(a[:, np.newaxis, :2], b[:, :2])
+    br_i = np.minimum(a[:, np.newaxis, 2:], b[:, 2:])
+
+    pad = not norm and 1 or 0
+
+    area_i = np.prod(br_i - tl_i + pad, axis=2) * (tl_i < br_i).all(axis=2)
+    area_a = np.prod(a[:, 2:] - a[:, :2] + pad, axis=1)
+    area_b = np.prod(b[:, 2:] - b[:, :2] + pad, axis=1)
+    area_o = (area_a[:, np.newaxis] + area_b - area_i)
+    return area_i / (area_o + 1e-10)
+
+
+def matrix_nms(boxes,
+               scores,
+               score_threshold,
+               post_threshold=0.,
+               nms_top_k=400,
+               normalized=True,
+               use_gaussian=False,
+               gaussian_sigma=2.):
+    all_scores = copy.deepcopy(scores)
+    all_scores = all_scores.flatten()
+    selected_indices = np.where(all_scores > score_threshold)[0]
+    all_scores = all_scores[selected_indices]
+
+    sorted_indices = np.argsort(-all_scores, axis=0, kind='mergesort')
+    sorted_scores = all_scores[sorted_indices]
+    sorted_indices = selected_indices[sorted_indices]
+    if nms_top_k > -1 and nms_top_k < sorted_indices.shape[0]:
+        sorted_indices = sorted_indices[:nms_top_k]
+        sorted_scores = sorted_scores[:nms_top_k]
+
+    selected_boxes = boxes[sorted_indices, :]
+    ious = iou_matrix(selected_boxes, selected_boxes)
+    ious = np.triu(ious, k=1)
+    iou_cmax = ious.max(0)
+    N = iou_cmax.shape[0]
+    iou_cmax = np.repeat(iou_cmax[:, np.newaxis], N, axis=1)
+
+    if use_gaussian:
+        decay = np.exp((iou_cmax**2 - ious**2) * gaussian_sigma)
+    else:
+        decay = (1 - ious) / (1 - iou_cmax)
+    decay = decay.min(0)
+    decayed_scores = sorted_scores * decay
+
+    if post_threshold > 0.:
+        inds = np.where(decayed_scores > post_threshold)[0]
+        selected_boxes = selected_boxes[inds, :]
+        decayed_scores = decayed_scores[inds]
+        sorted_indices = sorted_indices[inds]
+
+    return decayed_scores, selected_boxes, sorted_indices
+
+
+def multiclass_nms(boxes, scores, background, score_threshold, post_threshold,
+                   nms_top_k, keep_top_k, normalized, use_gaussian,
+                   gaussian_sigma):
+    all_boxes = []
+    all_cls = []
+    all_scores = []
+    all_indices = []
+    for c in range(scores.shape[0]):
+        if c == background:
+            continue
+        decayed_scores, selected_boxes, indices = matrix_nms(
+            boxes, scores[c], score_threshold, post_threshold, nms_top_k,
+            normalized, use_gaussian, gaussian_sigma)
+        all_cls.append(np.full(len(decayed_scores), c, decayed_scores.dtype))
+        all_boxes.append(selected_boxes)
+        all_scores.append(decayed_scores)
+        all_indices.append(indices)
+
+    all_cls = np.concatenate(all_cls)
+    all_boxes = np.concatenate(all_boxes)
+    all_scores = np.concatenate(all_scores)
+    all_indices = np.concatenate(all_indices)
+    all_pred = np.concatenate(
+        (all_cls[:, np.newaxis], all_scores[:, np.newaxis], all_boxes), axis=1)
+
+    num_det = len(all_pred)
+    if num_det == 0:
+        return all_pred, np.array([], dtype=np.float32)
+
+    inds = np.argsort(-all_scores, axis=0, kind='mergesort')
+    all_pred = all_pred[inds, :]
+    all_indices = all_indices[inds]
+
+    if keep_top_k > -1 and num_det > keep_top_k:
+        num_det = keep_top_k
+        all_pred = all_pred[:keep_top_k, :]
+        all_indices = all_indices[:keep_top_k]
+
+    return all_pred, all_indices
+
+
+def batched_multiclass_nms(boxes,
+                           scores,
+                           background,
+                           score_threshold,
+                           post_threshold,
+                           nms_top_k,
+                           keep_top_k,
+                           normalized=True,
+                           use_gaussian=False,
+                           gaussian_sigma=2.):
+    batch_size = scores.shape[0]
+    det_outs = []
+    index_outs = []
+    lod = []
+    for n in range(batch_size):
+        nmsed_outs, indices = multiclass_nms(
+            boxes[n], scores[n], background, score_threshold, post_threshold,
+            nms_top_k, keep_top_k, normalized, use_gaussian, gaussian_sigma)
+        nmsed_num = len(nmsed_outs)
+        lod.append(nmsed_num)
+        if nmsed_num == 0:
+            continue
+        indices += n * scores.shape[2]
+        det_outs.append(nmsed_outs)
+        index_outs.append(indices)
+    if det_outs:
+        det_outs = np.concatenate(det_outs)
+        index_outs = np.concatenate(index_outs)
+    return det_outs, index_outs, lod
+
+
+class TestMatrixNMSOp(OpTest):
+    def set_argument(self):
+        self.post_threshold = 0.
+        self.use_gaussian = False
+
+    def setUp(self):
+        self.set_argument()
+        N = 7
+        M = 1200
+        C = 21
+        BOX_SIZE = 4
+        background = 0
+        nms_top_k = 400
+        keep_top_k = 200
+        score_threshold = 0.01
+        post_threshold = self.post_threshold
+        use_gaussian = False
+        if hasattr(self, 'use_gaussian'):
+            use_gaussian = self.use_gaussian
+        gaussian_sigma = 2.
+
+        scores = np.random.random((N * M, C)).astype('float32')
+
+        scores = np.apply_along_axis(softmax, 1, scores)
+        scores = np.reshape(scores, (N, M, C))
+        scores = np.transpose(scores, (0, 2, 1))
+
+        boxes = np.random.random((N, M, BOX_SIZE)).astype('float32')
+        boxes[:, :, 0:2] = boxes[:, :, 0:2] * 0.5
+        boxes[:, :, 2:4] = boxes[:, :, 2:4] * 0.5 + 0.5
+
+        det_outs, index_outs, lod = batched_multiclass_nms(
+            boxes, scores, background, score_threshold, post_threshold,
+            nms_top_k, keep_top_k, True, use_gaussian, gaussian_sigma)
+
+        empty = len(det_outs) == 0
+        det_outs = np.array([], dtype=np.float32) if empty else det_outs
+        index_outs = np.array([], dtype=np.float32) if empty else index_outs
+        nmsed_outs = det_outs.astype('float32')
+
+        self.op_type = 'matrix_nms'
+        self.inputs = {'BBoxes': boxes, 'Scores': scores}
+        self.outputs = {
+            'Out': (nmsed_outs, [lod]),
+            'Index': (index_outs[:, None], [lod])
+        }
+        self.attrs = {
+            'background_label': 0,
+            'nms_top_k': nms_top_k,
+            'keep_top_k': keep_top_k,
+            'score_threshold': score_threshold,
+            'post_threshold': post_threshold,
+            'use_gaussian': use_gaussian,
+            'gaussian_sigma': gaussian_sigma,
+            'normalized': True,
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestMatrixNMSOpNoOutput(TestMatrixNMSOp):
+    def set_argument(self):
+        self.post_threshold = 2.0
+
+
+class TestMatrixNMSOpGaussian(TestMatrixNMSOp):
+    def set_argument(self):
+        self.post_threshold = 0.
+        self.use_gaussian = True
+
+
+class TestMatrixNMSError(unittest.TestCase):
+    def test_errors(self):
+        with program_guard(Program(), Program()):
+            M = 1200
+            N = 7
+            C = 21
+            BOX_SIZE = 4
+            nms_top_k = 400
+            keep_top_k = 200
+            score_threshold = 0.01
+            post_threshold = 0.
+
+            boxes_np = np.random.random((M, C, BOX_SIZE)).astype('float32')
+            scores = np.random.random((N * M, C)).astype('float32')
+            scores = np.apply_along_axis(softmax, 1, scores)
+            scores = np.reshape(scores, (N, M, C))
+            scores_np = np.transpose(scores, (0, 2, 1))
+
+            boxes_data = fluid.data(
+                name='bboxes', shape=[M, C, BOX_SIZE], dtype='float32')
+            scores_data = fluid.data(
+                name='scores', shape=[N, C, M], dtype='float32')
+
+            def test_bboxes_Variable():
+                # the bboxes type must be Variable
+                fluid.layers.matrix_nms(
+                    bboxes=boxes_np,
+                    scores=scores_data,
+                    nms_top_k=nms_top_k,
+                    keep_top_k=keep_top_k,
+                    score_threshold=score_threshold,
+                    post_threshold=post_threshold)
+
+            def test_scores_Variable():
+                # the scores type must be Variable
+                fluid.layers.matrix_nms(
+                    bboxes=boxes_data,
+                    scores=scores_np,
+                    nms_top_k=nms_top_k,
+                    keep_top_k=keep_top_k,
+                    score_threshold=score_threshold,
+                    post_threshold=post_threshold)
+
+            def test_empty():
+                # when all score are lower than threshold
+                try:
+                    fluid.layers.matrix_nms(
+                        bboxes=boxes_data,
+                        scores=scores_data,
+                        nms_top_k=nms_top_k,
+                        keep_top_k=keep_top_k,
+                        score_threshold=10.,
+                        post_threshold=post_threshold)
+                except Exception as e:
+                    self.fail(e)
+
+            def test_coverage():
+                # cover correct workflow
+                try:
+                    fluid.layers.matrix_nms(
+                        bboxes=boxes_data,
+                        scores=scores_data,
+                        nms_top_k=nms_top_k,
+                        keep_top_k=keep_top_k,
+                        score_threshold=score_threshold,
+                        post_threshold=post_threshold)
+                except Exception as e:
+                    self.fail(e)
+
+            self.assertRaises(TypeError, test_bboxes_Variable)
+            self.assertRaises(TypeError, test_scores_Variable)
+            test_coverage()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_meshgrid_op.py b/python/paddle/fluid/tests/unittests/test_meshgrid_op.py
index eea1ca3282c93a40cc9fcf3149329a358cadcf41..3f03d5ed650a8f5cb1491d7dbf2c6ff98eefd43a 100644
--- a/python/paddle/fluid/tests/unittests/test_meshgrid_op.py
+++ b/python/paddle/fluid/tests/unittests/test_meshgrid_op.py
@@ -79,7 +79,7 @@ class TestMeshgridOp3(unittest.TestCase):
         out_2 = np.broadcast_to(out_2, [100, 200])
 
         exe = fluid.Executor(place=fluid.CPUPlace())
-        grid_x, grid_y = paddle.tensor.meshgrid([x, y])
+        grid_x, grid_y = paddle.tensor.meshgrid(x, y)
         res_1, res_2 = exe.run(fluid.default_main_program(),
                                feed={'x': input_1,
                                      'y': input_2},
@@ -90,21 +90,72 @@ class TestMeshgridOp3(unittest.TestCase):
 
 
 class TestMeshgridOp4(unittest.TestCase):
-    def test_errors(self):
-        with program_guard(Program(), Program()):
+    def test_list_input(self):
+        x = fluid.data(shape=[100], dtype='int32', name='x')
+        y = fluid.data(shape=[200], dtype='int32', name='y')
 
-            def test_input_type():
-                x = fluid.data(shape=[200], dtype='float32', name='x2')
-                paddle.tensor.meshgrid(x)
+        input_1 = np.random.randint(0, 100, [100, ]).astype('int32')
+        input_2 = np.random.randint(0, 100, [200, ]).astype('int32')
 
-        self.assertRaises(TypeError, test_input_type)
+        out_1 = np.reshape(input_1, [100, 1])
+        out_1 = np.broadcast_to(out_1, [100, 200])
+        out_2 = np.reshape(input_2, [1, 200])
+        out_2 = np.broadcast_to(out_2, [100, 200])
+
+        exe = fluid.Executor(place=fluid.CPUPlace())
+        grid_x, grid_y = paddle.tensor.meshgrid([x, y])
+        res_1, res_2 = exe.run(fluid.default_main_program(),
+                               feed={'x': input_1,
+                                     'y': input_2},
+                               fetch_list=[grid_x, grid_y])
+
+        assert np.array_equal(res_1, out_1)
+        assert np.array_equal(res_2, out_2)
 
 
 class TestMeshgridOp5(unittest.TestCase):
+    def test_tuple_input(self):
+        x = fluid.data(shape=[100], dtype='int32', name='x')
+        y = fluid.data(shape=[200], dtype='int32', name='y')
+
+        input_1 = np.random.randint(0, 100, [100, ]).astype('int32')
+        input_2 = np.random.randint(0, 100, [200, ]).astype('int32')
+
+        out_1 = np.reshape(input_1, [100, 1])
+        out_1 = np.broadcast_to(out_1, [100, 200])
+        out_2 = np.reshape(input_2, [1, 200])
+        out_2 = np.broadcast_to(out_2, [100, 200])
+
+        exe = fluid.Executor(place=fluid.CPUPlace())
+        grid_x, grid_y = paddle.tensor.meshgrid((x, y))
+        res_1, res_2 = exe.run(fluid.default_main_program(),
+                               feed={'x': input_1,
+                                     'y': input_2},
+                               fetch_list=[grid_x, grid_y])
+
+        assert np.array_equal(res_1, out_1)
+        assert np.array_equal(res_2, out_2)
+
+
+class TestMeshgridOp6(unittest.TestCase):
     def test_api_with_dygraph(self):
         input_3 = np.random.randint(0, 100, [100, ]).astype('int32')
         input_4 = np.random.randint(0, 100, [200, ]).astype('int32')
 
+        with fluid.dygraph.guard():
+            tensor_3 = fluid.dygraph.to_variable(input_3)
+            tensor_4 = fluid.dygraph.to_variable(input_4)
+            res_3, res_4 = paddle.tensor.meshgrid(tensor_3, tensor_4)
+
+            assert np.array_equal(res_3.shape, [100, 200])
+            assert np.array_equal(res_4.shape, [100, 200])
+
+
+class TestMeshgridOp7(unittest.TestCase):
+    def test_api_with_dygraph_list_input(self):
+        input_3 = np.random.randint(0, 100, [100, ]).astype('int32')
+        input_4 = np.random.randint(0, 100, [200, ]).astype('int32')
+
         with fluid.dygraph.guard():
             tensor_3 = fluid.dygraph.to_variable(input_3)
             tensor_4 = fluid.dygraph.to_variable(input_4)
@@ -114,5 +165,19 @@ class TestMeshgridOp5(unittest.TestCase):
             assert np.array_equal(res_4.shape, [100, 200])
 
 
+class TestMeshgridOp7(unittest.TestCase):
+    def test_api_with_dygraph_tuple_input(self):
+        input_3 = np.random.randint(0, 100, [100, ]).astype('int32')
+        input_4 = np.random.randint(0, 100, [200, ]).astype('int32')
+
+        with fluid.dygraph.guard():
+            tensor_3 = fluid.dygraph.to_variable(input_3)
+            tensor_4 = fluid.dygraph.to_variable(input_4)
+            res_3, res_4 = paddle.tensor.meshgrid((tensor_3, tensor_4))
+
+            assert np.array_equal(res_3.shape, [100, 200])
+            assert np.array_equal(res_4.shape, [100, 200])
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_mish_op.py b/python/paddle/fluid/tests/unittests/test_mish_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..8cc785e450f0bac54f2193aac45165bc9b800b73
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_mish_op.py
@@ -0,0 +1,102 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import six
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid import Program, program_guard
+from op_test import OpTest, skip_check_grad_ci
+
+
+class TestMishOpError(unittest.TestCase):
+    def test_errors(self):
+        with program_guard(Program()):
+            # The input type must be Variable.
+            self.assertRaises(TypeError, fluid.layers.mish, 0.1, 20)
+            # The input dtype must be float16, float32, float64.
+            x_int32 = fluid.data(name='x_int32', shape=[12, 10], dtype='int32')
+            self.assertRaises(TypeError, fluid.layers.mish, x_int32, 20)
+            # support the input dtype is float32
+            x_fp16 = fluid.layers.data(
+                name='x_fp16', shape=[12, 10], dtype='float32')
+            fluid.layers.mish(x_fp16, threshold=20)
+
+
+class MishTest(OpTest):
+    def setUp(self):
+        self.init_dtype()
+        self.init_input_shape()
+        self.init_input_range()
+        self.init_threshold()
+        self.op_type = "mish"
+
+        x_np = np.random.uniform(self.x_range[0], self.x_range[1],
+                                 self.x_shape).astype(self.dtype)
+        self.inputs = {'X': x_np}
+
+        softplus = x_np * (x_np > self.threshold) + np.exp(x_np) * \
+                    (x_np < -self.threshold) + np.log(np.exp(x_np) + 1.) * \
+                    (x_np >= -self.threshold) * (x_np <= self.threshold)
+        out_np = x_np * np.tanh(softplus)
+
+        self.outputs = {'Out': out_np}
+        self.attrs = {'threshold': self.threshold}
+
+    def init_dtype(self):
+        self.dtype = 'float32'
+
+    def init_input_shape(self):
+        self.x_shape = (10, 12)
+
+    def init_input_range(self):
+        self.x_range = [-1, 1]
+
+    def init_threshold(self):
+        self.threshold = 5.
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class MishTestUpperThresh(MishTest):
+    def init_input_range(self):
+        self.x_range = [6, 7]
+
+
+class MishTestLowerThresh(MishTest):
+    def init_input_range(self):
+        self.x_range = [-7, -6]
+
+
+# mish op contain calculation like: tanh, exp, log, while tanh
+# may have diff on CPUPlace(see test_activation_op.py::TestTanh),
+# especially when abs(x) is a large value, only check input value
+# in range [-1, 1] for float64 here.
+class MishTestFP64(MishTest):
+    def init_dtype(self):
+        self.dtype = 'float64'
+
+    def init_input_range(self):
+        self.x_range = [-1, 1]
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_mul_op.py b/python/paddle/fluid/tests/unittests/test_mul_op.py
index 4f2466c9b70426d81e28feedd278647b1201f834..8ca06aa952184daec6be59a09330c8f16f6ee1d6 100644
--- a/python/paddle/fluid/tests/unittests/test_mul_op.py
+++ b/python/paddle/fluid/tests/unittests/test_mul_op.py
@@ -175,35 +175,5 @@ class TestFP16MulOp2(TestMulOp2):
                 no_grad_set=set('Y'))
 
 
-class TestMulOpAttr(unittest.TestCase):
-    def test_out(self):
-        with fluid.program_guard(fluid.Program()):
-            x = fluid.data(name="x", shape=[2, 3], dtype="float32")
-            y = fluid.data(name='y', shape=[3, 2], dtype='float32')
-
-            res = fluid.data(name="output", shape=[2, 2], dtype="float32")
-            y_1 = paddle.mul(x, y, out=res)
-
-            place = fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            data1 = np.array([[1, 2, 3], [4, 5, 6]], dtype='float32')
-            data2 = np.array([[1, 2], [1, 2], [1, 2]], dtype='float32')
-            np_res, np_y_1 = exe.run(feed={'x': data1,
-                                           'y': data2},
-                                     fetch_list=[res, y_1])
-
-            self.assertEqual((np_res == np_y_1).all(), True)
-
-    def test_name(self):
-        with fluid.program_guard(fluid.Program()):
-            x = fluid.data(name="x", shape=[2, 3], dtype="float32")
-            y = fluid.data(name='y', shape=[3, 2], dtype='float32')
-
-            res = fluid.data(name="output", shape=[2, 2], dtype="float32")
-            y_1 = paddle.mul(x, y, name='mul_res')
-            y_2 = paddle.mul(x, y, out=res, name='mul_res')
-            self.assertEqual(('mul_res' in y_1.name), True)
-
-
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dynamic.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dynamic.py
index 6af273faf3942f19e0612811f644f6606417394c..0706eb53d537da58a5a248e060759b748b30af19 100644
--- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dynamic.py
+++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dynamic.py
@@ -28,12 +28,7 @@ from paddle.fluid.dygraph.nn import Linear
 from paddle.fluid.dygraph.base import to_variable
 
 from test_multiprocess_dataloader_static import RandomDataset, prepare_places
-
-EPOCH_NUM = 5
-BATCH_SIZE = 16
-IMAGE_SIZE = 784
-SAMPLE_NUM = 400
-CLASS_NUM = 10
+from test_multiprocess_dataloader_static import EPOCH_NUM, BATCH_SIZE, IMAGE_SIZE, SAMPLE_NUM, CLASS_NUM
 
 
 class SimpleFCNet(fluid.dygraph.Layer):
diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py
index e7e6999112e498c3311c7b5a037912db1000eaf8..617527242f5b2345b13342c5e20f62d3a546c6df 100644
--- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py
+++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py
@@ -23,6 +23,7 @@ import multiprocessing
 import numpy as np
 
 import paddle.fluid as fluid
+import paddle.fluid.core as core
 from paddle.io import Dataset, BatchSampler, DataLoader
 from paddle.fluid.dygraph.nn import Linear
 from paddle.fluid.dygraph.base import to_variable
@@ -109,6 +110,8 @@ class TestDataLoaderAssert(unittest.TestCase):
 
 # CI Converage cannot record stub in subprocess,
 # HACK a _worker_loop in main process call here
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
 class TestDataLoaderWorkerLoop(unittest.TestCase):
     def run_without_worker_done(self, use_shared_memory=True):
         try:
@@ -151,7 +154,7 @@ class TestDataLoaderWorkerLoop(unittest.TestCase):
 
     def run_with_worker_done(self, use_shared_memory=True):
         try:
-            place = fluid.cpu_places()[0]
+            place = fluid.CUDAPlace(0)
             with fluid.dygraph.guard(place):
                 dataset = RandomDataset(800)
 
@@ -190,7 +193,8 @@ class TestDataLoaderWorkerLoop(unittest.TestCase):
             self.assertTrue(False)
 
     def test_main(self):
-        for use_shared_memory in [True, False]:
+        # only HACK a subprocess call here, do not need to use_shared_memory
+        for use_shared_memory in [False]:
             self.run_without_worker_done(use_shared_memory)
             self.run_with_worker_done(use_shared_memory)
 
diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_static.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_static.py
index 2d75126ec4207ff7b1db38d7acf7aa534b9658ad..38497f91fc18847e40efa691a65c2a7adc20e51c 100644
--- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_static.py
+++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_static.py
@@ -25,10 +25,10 @@ import numpy as np
 import paddle.fluid as fluid
 from paddle.io import Dataset, BatchSampler, DataLoader
 
-EPOCH_NUM = 5
-BATCH_SIZE = 16
-IMAGE_SIZE = 784
-SAMPLE_NUM = 400
+EPOCH_NUM = 3
+BATCH_SIZE = 8
+IMAGE_SIZE = 32
+SAMPLE_NUM = 100
 CLASS_NUM = 10
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py b/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py
index 4f29467a3c5515eaff985e22aed4eccf16867757..c44ea454271f3aa6cb12451cd85490b57284ea35 100644
--- a/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py
+++ b/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py
@@ -49,5 +49,24 @@ class TestInstanceNormDoubleGradCheck(unittest.TestCase):
             self.func(p)
 
 
+class TestInstanceNormDoubleGradCheckWithoutParamBias(
+        TestInstanceNormDoubleGradCheck):
+    @prog_scope()
+    def func(self, place):
+        prog = fluid.Program()
+        with fluid.program_guard(prog):
+            np.random.seed()
+            shape = [2, 3, 4, 5]
+            dtype = "float32"
+            eps = 0.005
+            atol = 1e-4
+            x = layers.create_parameter(dtype=dtype, shape=shape, name='x')
+            z = fluid.layers.instance_norm(
+                input=x, param_attr=False, bias_attr=False)
+            x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
+            gradient_checker.double_grad_check(
+                [x], z, x_init=x_arr, atol=atol, place=place, eps=eps)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_ones_op.py b/python/paddle/fluid/tests/unittests/test_ones_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..6061bfcff442ec869f887a7b9499978ac417a47f
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_ones_op.py
@@ -0,0 +1,83 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+import paddle
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
+import paddle.fluid as fluid
+import numpy as np
+
+
+class ApiOnesTest(unittest.TestCase):
+    def test_out(self):
+        with fluid.program_guard(fluid.Program()):
+            ones = paddle.ones(shape=[10], dtype="float64")
+            place = fluid.CPUPlace()
+            exe = fluid.Executor(place)
+            result, = exe.run(fetch_list=[ones])
+            expected_result = np.ones(10, dtype="float64")
+        self.assertEqual((result == expected_result).all(), True)
+
+        with fluid.program_guard(fluid.Program()):
+            ones = paddle.ones(shape=[10], dtype="int64")
+            place = fluid.CPUPlace()
+            exe = fluid.Executor(place)
+            result, = exe.run(fetch_list=[ones])
+            expected_result = np.ones(10, dtype="int64")
+        self.assertEqual((result == expected_result).all(), True)
+
+        with fluid.program_guard(fluid.Program()):
+            ones = paddle.ones(shape=[10], dtype="int64", device="cpu")
+            place = fluid.CPUPlace()
+            exe = fluid.Executor(place)
+            result, = exe.run(fetch_list=[ones])
+            expected_result = np.ones(10, dtype="int64")
+        self.assertEqual((result == expected_result).all(), True)
+
+
+class ApiOnesZerosError(unittest.TestCase):
+    def test_errors(self):
+        def test_error1():
+            with fluid.program_guard(fluid.Program()):
+                ones = paddle.ones(shape=10, dtype="int64", device="opu")
+
+        self.assertRaises(ValueError, test_error1)
+
+        def test_error2():
+            with fluid.program_guard(fluid.Program()):
+                ones = paddle.ones(shape=10, dtype="int64", device="opu")
+
+        self.assertRaises(ValueError, test_error2)
+
+        def test_error3():
+            with fluid.program_guard(fluid.Program()):
+                ones = fluid.layers.ones(shape=10, dtype="int64")
+
+        self.assertRaises(TypeError, test_error3)
+
+        def test_error4():
+            with fluid.program_guard(fluid.Program()):
+                ones = fluid.layers.ones(shape=[10], dtype="int8")
+
+        self.assertRaises(TypeError, test_error4)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_paddlebox_datafeed.py b/python/paddle/fluid/tests/unittests/test_paddlebox_datafeed.py
deleted file mode 100644
index 94bc8ff28861b266015101707be12c6077379055..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_paddlebox_datafeed.py
+++ /dev/null
@@ -1,146 +0,0 @@
-#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import print_function
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-import os
-import unittest
-import paddle.fluid.layers as layers
-
-
-class TestDataFeed(unittest.TestCase):
-    """  TestBaseCase(Merge PV)   """
-
-    def setUp(self):
-        self.batch_size = 10
-        self.pv_batch_size = 10
-        self.enable_pv_merge = True
-        self.merge_by_sid = True
-
-    def set_data_config(self):
-        self.dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset")
-        self.dataset.set_feed_type("PaddleBoxDataFeed")
-        self.dataset.set_parse_logkey(True)
-        self.dataset.set_thread(1)
-        self.dataset.set_enable_pv_merge(self.enable_pv_merge)
-        self.dataset.set_batch_size(self.batch_size)
-        if self.enable_pv_merge:
-            self.dataset.set_merge_by_sid(self.merge_by_sid)
-            self.dataset.set_rank_offset("rank_offset")
-            self.dataset.set_pv_batch_size(self.pv_batch_size)
-
-    def test_pboxdatafeed(self):
-        self.run_dataset(False)
-
-    def test_pboxdatafeed(self):
-        self.run_dataset(True)
-
-    def run_dataset(self, is_cpu):
-        x = fluid.layers.data(name='x', shape=[1], dtype='int64', lod_level=0)
-        y = fluid.layers.data(name='y', shape=[1], dtype='int64', lod_level=0)
-        rank_offset = fluid.layers.data(
-            name="rank_offset",
-            shape=[-1, 7],
-            dtype="int32",
-            lod_level=0,
-            append_batch_size=False)
-
-        emb_x, emb_y = fluid.contrib.layers._pull_box_extended_sparse(
-            [x, y], size=2, extend_size=128)
-        concat = layers.concat([emb_x[0], emb_x[1], emb_y[0], emb_y[1]], axis=1)
-        fc = layers.fc(input=concat,
-                       name="fc",
-                       size=1,
-                       num_flatten_dims=1,
-                       bias_attr=False)
-        loss = layers.reduce_mean(fc)
-        place = fluid.CPUPlace() if is_cpu or not core.is_compiled_with_cuda(
-        ) else fluid.CUDAPlace(0)
-        exe = fluid.Executor(place)
-
-        with open("test_run_with_dump_a.txt", "w") as f:
-            data = "1 1702f830eee19501ad7429505f714c1d 1 1 1 9\n"
-            data += "1 1702f830eee19502ad7429505f714c1d 1 2 1 8\n"
-            data += "1 1702f830eee19503ad7429505f714c1d 1 3 1 7\n"
-            data += "1 1702f830eee0de01ad7429505f714c2d 1 4 1 6\n"
-            data += "1 1702f830eee0df01ad7429505f714c3d 1 5 1 5\n"
-            data += "1 1702f830eee0df02ad7429505f714c3d 1 6 1 4\n"
-            f.write(data)
-        with open("test_run_with_dump_b.txt", "w") as f:
-            data = "1 1702f830fff22201ad7429505f715c1d 1 1 1 1\n"
-            data += "1 1702f830fff22202ad7429505f715c1d 1 2 1 2\n"
-            data += "1 1702f830fff22203ad7429505f715c1d 1 3 1 3\n"
-            data += "1 1702f830fff22101ad7429505f714ccd 1 4 1 4\n"
-            data += "1 1702f830fff22102ad7429505f714ccd 1 5 1 5\n"
-            data += "1 1702f830fff22103ad7429505f714ccd 1 6 1 6\n"
-            data += "1 1702f830fff22104ad7429505f714ccd 1 6 1 7\n"
-            f.write(data)
-
-        self.set_data_config()
-        self.dataset.set_use_var([x, y])
-        self.dataset.set_filelist(
-            ["test_run_with_dump_a.txt", "test_run_with_dump_b.txt"])
-
-        optimizer = fluid.optimizer.SGD(learning_rate=0.5)
-        optimizer = fluid.optimizer.PipelineOptimizer(
-            optimizer,
-            cut_list=[],
-            place_list=[place],
-            concurrency_list=[1],
-            queue_size=1,
-            sync_steps=-1)
-        optimizer.minimize(loss)
-        exe.run(fluid.default_startup_program())
-        self.dataset.set_current_phase(1)
-        self.dataset.load_into_memory()
-        self.dataset.preprocess_instance()
-        self.dataset.begin_pass()
-        pv_num = self.dataset.get_pv_data_size()
-
-        exe.train_from_dataset(
-            program=fluid.default_main_program(),
-            dataset=self.dataset,
-            print_period=1)
-        self.dataset.set_current_phase(0)
-        self.dataset.postprocess_instance()
-        exe.train_from_dataset(
-            program=fluid.default_main_program(),
-            dataset=self.dataset,
-            print_period=1)
-        self.dataset.end_pass(True)
-        os.remove("test_run_with_dump_a.txt")
-        os.remove("test_run_with_dump_b.txt")
-
-
-class TestDataFeed2(TestDataFeed):
-    """  TestBaseCase(Merge PV not merge by sid)   """
-
-    def setUp(self):
-        self.batch_size = 10
-        self.pv_batch_size = 10
-        self.enable_pv_merge = True
-        self.merge_by_sid = False
-
-
-class TestDataFeed3(TestDataFeed):
-    """  TestBaseCase(Not Merge PV)   """
-
-    def setUp(self):
-        self.batch_size = 10
-        self.pv_batch_size = 10
-        self.enable_pv_merge = False
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_isolated_var.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_isolated_var.py
index 54030695f34474d48453031d5f3c976121259a7a..13932238705f5b51b4eaef15d29670e4dba7bd31 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_isolated_var.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_isolated_var.py
@@ -27,7 +27,7 @@ class TestParallelExecutorFetchIsolatedVarBase(unittest.TestCase):
     def build_network(self, is_training):
         x = fluid.data(name='x', shape=[-1, 10], dtype='float32')
         y = fluid.data(name='y', shape=[-1, 10], dtype='float32')
-        fc = fluid.layers.fc(x, size=30)
+        fc = fluid.layers.fc(x, size=30, bias_attr=False)
         loss = fluid.layers.reduce_mean(fc)
         if is_training:
             adam = fluid.optimizer.Adam(learning_rate=1e-3)
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_profiler.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_profiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..62ecb2207cded5f1e77592f178c0af2b7b40de0d
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_profiler.py
@@ -0,0 +1,43 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.tests.unittests.test_profiler import TestProfiler
+
+
+class TestPEProfiler(TestProfiler):
+    def test_cpu_profiler(self):
+        exe = fluid.Executor(fluid.CPUPlace())
+        self.net_profiler(exe, 'CPU', "Default", use_parallel_executor=True)
+
+    @unittest.skipIf(not core.is_compiled_with_cuda(),
+                     "profiler is enabled only with GPU")
+    def test_cuda_profiler(self):
+        exe = fluid.Executor(fluid.CUDAPlace(0))
+        self.net_profiler(exe, 'GPU', "OpDetail", use_parallel_executor=True)
+
+    @unittest.skipIf(not core.is_compiled_with_cuda(),
+                     "profiler is enabled only with GPU")
+    def test_all_profiler(self):
+        exe = fluid.Executor(fluid.CUDAPlace(0))
+        self.net_profiler(exe, 'All', "AllOpDetail", use_parallel_executor=True)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_pipeline.py b/python/paddle/fluid/tests/unittests/test_pipeline.py
index dbf14e047579437b2540ab2552e64ff8dc90099e..1f884195a47f19ca0c69912dfa68cf608317ddc8 100644
--- a/python/paddle/fluid/tests/unittests/test_pipeline.py
+++ b/python/paddle/fluid/tests/unittests/test_pipeline.py
@@ -19,137 +19,192 @@ import numpy as np
 import os
 import shutil
 import unittest
+import math
+
+
+def conv_bn_layer(input, num_filters, filter_size, stride=1, groups=1,
+                  act=None):
+    conv = fluid.layers.conv2d(
+        input=input,
+        num_filters=num_filters,
+        filter_size=filter_size,
+        stride=stride,
+        padding=(filter_size - 1) // 2,
+        groups=groups,
+        act=None,
+        bias_attr=False)
+    return fluid.layers.batch_norm(
+        input=conv,
+        act=act, )
+
+
+def shortcut(input, ch_out, stride, is_first):
+    ch_in = input.shape[1]
+    if ch_in != ch_out or stride != 1 or is_first == True:
+        return conv_bn_layer(input, ch_out, 1, stride)
+    else:
+        return input
+
+
+def bottleneck_block(input, num_filters, stride):
+    conv0 = conv_bn_layer(
+        input=input, num_filters=num_filters, filter_size=1, act='relu')
+    conv1 = conv_bn_layer(
+        input=conv0,
+        num_filters=num_filters,
+        filter_size=3,
+        stride=stride,
+        act='relu')
+    conv2 = conv_bn_layer(
+        input=conv1, num_filters=num_filters * 4, filter_size=1, act=None)
+
+    short = shortcut(input, num_filters * 4, stride, is_first=False)
+
+    return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
+
+
+def basic_block(input, num_filters, stride, is_first):
+    conv0 = conv_bn_layer(
+        input=input,
+        num_filters=num_filters,
+        filter_size=3,
+        act='relu',
+        stride=stride)
+    conv1 = conv_bn_layer(
+        input=conv0, num_filters=num_filters, filter_size=3, act=None)
+    short = shortcut(input, num_filters, stride, is_first)
+    return fluid.layers.elementwise_add(x=short, y=conv1, act='relu')
+
+
+def build_network(input, layers=50, class_dim=1000):
+    supported_layers = [18, 34, 50, 101, 152]
+    assert layers in supported_layers
+    depth = None
+    if layers == 18:
+        depth = [2, 2, 2, 2]
+    elif layers == 34 or layers == 50:
+        depth = [3, 4, 6, 3]
+    elif layers == 101:
+        depth = [3, 4, 23, 3]
+    elif layers == 152:
+        depth = [3, 8, 36, 3]
+    num_filters = [64, 128, 256, 512]
+    with fluid.device_guard("cpu"):
+        conv = conv_bn_layer(
+            input=input, num_filters=64, filter_size=7, stride=2, act='relu')
+        conv = fluid.layers.pool2d(
+            input=conv,
+            pool_size=3,
+            pool_stride=2,
+            pool_padding=1,
+            pool_type='max')
+    if layers >= 50:
+        for block in range(len(depth)):
+            with fluid.device_guard("gpu:0"):
+                for i in range(depth[block]):
+                    conv = bottleneck_block(
+                        input=conv,
+                        num_filters=num_filters[block],
+                        stride=2 if i == 0 and block != 0 else 1)
+
+        with fluid.device_guard("gpu:0"):
+            pool = fluid.layers.pool2d(
+                input=conv, pool_size=7, pool_type='avg', global_pooling=True)
+            stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
+            out = fluid.layers.fc(
+                input=pool,
+                size=class_dim,
+                param_attr=fluid.param_attr.ParamAttr(
+                    initializer=fluid.initializer.Uniform(-stdv, stdv)))
+    else:
+        for block in range(len(depth)):
+            with fluid.device_guard("gpu:0"):
+                for i in range(depth[block]):
+                    conv = basic_block(
+                        input=conv,
+                        num_filters=num_filters[block],
+                        stride=2 if i == 0 and block != 0 else 1,
+                        is_first=block == i == 0)
+        with fluid.device_guard("gpu:0"):
+            pool = fluid.layers.pool2d(
+                input=conv, pool_size=7, pool_type='avg', global_pooling=True)
+            stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
+            out = fluid.layers.fc(
+                input=pool,
+                size=class_dim,
+                param_attr=fluid.param_attr.ParamAttr(
+                    initializer=fluid.initializer.Uniform(-stdv, stdv)))
+    return out
 
 
-class TestPipelineConfig(unittest.TestCase):
-    """  TestCases for Config in Pipeline Training. """
-
-    def config(self, filelist_length, pipeline_num, reader_concurrency):
-        filelist = []
-        for i in range(filelist_length):
-            filelist.append("file" + str(i))
-        self.dataset.set_filelist(filelist)
-        self.pipeline_opt["concurrency_list"][0] = reader_concurrency
-        self.pipeline_num = pipeline_num
-
-    def helper(self, in_filelist_length, in_pipeline_num, in_reader_concurrency,
-               out_pipeline_num, out_reader_concurrency, out_dataset_thread):
-        self.config(in_filelist_length, in_pipeline_num, in_reader_concurrency)
-        res = self.exe._adjust_pipeline_resource(
-            self.pipeline_opt, self.dataset, self.pipeline_num)
-        self.assertEqual(self.pipeline_opt["concurrency_list"][0],
-                         out_reader_concurrency)
-        self.assertEqual(res, out_pipeline_num)
-        self.assertEqual(self.dataset.thread_num, out_dataset_thread)
-
-    def test_adjust_pipeline_resource(self):
-        self.exe = fluid.Executor(fluid.CPUPlace())
-        self.dataset = fluid.DatasetFactory().create_dataset(
-            "FileInstantDataset")
-        self.pipeline_opt = {"concurrency_list": [0, 1, 2]}
-        self.pipeline_num = 0
-
-        self.helper(7, 2, 2, 2, 2, 4)
-        self.helper(7, 2, 3, 2, 3, 6)
-        self.helper(7, 2, 4, 2, 3, 6)
+class TestPipeline(unittest.TestCase):
+    """  TestCases for Pipeline Training. """
 
-        self.helper(8, 2, 3, 2, 3, 6)
-        self.helper(8, 2, 4, 2, 4, 8)
-        self.helper(8, 2, 5, 2, 4, 8)
+    def _run(self, debug):
+        main_prog = fluid.Program()
+        startup_prog = fluid.Program()
+        with fluid.program_guard(main_prog, startup_prog):
+            with fluid.device_guard("cpu"):
+                image = fluid.layers.data(
+                    name="image", shape=[3, 224, 224], dtype="float32")
+                label = fluid.layers.data(
+                    name="label", shape=[1], dtype="int64")
+                data_loader = fluid.io.DataLoader.from_generator(
+                    feed_list=[image, label],
+                    capacity=64,
+                    use_double_buffer=True,
+                    iterable=False)
+                fc = build_network(image, layers=50)
+            with fluid.device_guard("gpu:0"):
+                out, prob = fluid.layers.softmax_with_cross_entropy(
+                    logits=fc, label=label, return_softmax=True)
+                loss = fluid.layers.mean(out)
+                acc_top1 = fluid.layers.accuracy(input=prob, label=label, k=1)
+                acc_top5 = fluid.layers.accuracy(input=prob, label=label, k=5)
+
+            base_lr = 0.1
+            passes = [30, 60, 80, 90]
+            total_images = 1281167
+            steps_per_pass = total_images // 128
+            bd = [steps_per_pass * p for p in passes]
+            lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
+            lr_val = fluid.layers.piecewise_decay(boundaries=bd, values=lr)
+            optimizer = fluid.optimizer.MomentumOptimizer(
+                lr_val,
+                momentum=0.9,
+                regularization=fluid.regularizer.L2Decay(1e-4))
+            optimizer = fluid.optimizer.PipelineOptimizer(
+                optimizer, num_microbatches=2)
+            optimizer.minimize(loss)
 
-        self.helper(3, 4, 1, 3, 1, 3)
-        self.helper(3, 4, 2, 3, 1, 3)
+        def train_reader():
+            for _ in range(4):
+                img = np.random.random(size=[3, 224, 224]).astype('float32')
+                label = np.random.random(size=[1]).astype('int64')
+                yield img, label
 
+        data_loader.set_sample_generator(train_reader, batch_size=1)
+        place = fluid.CPUPlace()
 
-class TestPipeline(unittest.TestCase):
-    """  TestCases for Pipeline Training. """
+        # The following dataset is only used for the 
+        # interface 'train_from_dataset'.
+        # And it has no actual meaning.
+        dataset = fluid.DatasetFactory().create_dataset('FileInstantDataset')
+        dataset.set_batch_size(1)
+        dataset.set_thread(1)
+        dataset.set_filelist(['/tmp/tmp_2.txt'])
+        dataset.set_use_var([image, label])
+        exe = fluid.Executor(place)
+        exe.run(startup_prog)
+        data_loader.start()
+        exe.train_from_dataset(main_prog, dataset, debug=debug)
 
     def test_pipeline(self):
-        x = fluid.layers.data(name='x', shape=[1], dtype='int64', lod_level=0)
-        y = fluid.layers.data(name='y', shape=[1], dtype='int64', lod_level=0)
-        emb_x = layers.embedding(
-            input=x,
-            param_attr=fluid.ParamAttr(name="embx"),
-            size=[10, 2],
-            is_sparse=False)
-        emb_y = layers.embedding(
-            input=y,
-            param_attr=fluid.ParamAttr(
-                name="emby", learning_rate=0.9),
-            size=[10, 2],
-            is_sparse=False)
-
-        concat = layers.concat([emb_x, emb_y], axis=1)
-
-        fc = layers.fc(input=concat,
-                       name="fc",
-                       size=1,
-                       num_flatten_dims=1,
-                       bias_attr=False)
-        loss = layers.reduce_mean(fc)
+        self._run(False)
+        self._run(True)
 
-        optimizer = fluid.optimizer.SGD(learning_rate=0.5)
-        optimizer = fluid.optimizer.PipelineOptimizer(
-            optimizer,
-            cut_list=[[emb_x, emb_y], [loss]],
-            place_list=[
-                fluid.CPUPlace(), fluid.CUDAPlace(0), fluid.CPUPlace()
-            ],
-            concurrency_list=[1, 1, 1],
-            queue_size=1,
-            sync_steps=10000000, )
-        optimizer.minimize(loss)
-        place = fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        exe.run(fluid.default_startup_program())
-        #prepare data
-        batch_size = 100
-
-        def binary_print(slot, fout):
-            num = np.int16(len(slot) + 1)
-            num.tofile(fout)
-            a = np.int64(batch_size)
-            a.tofile(fout)
-            slot.tofile(fout)
-
-        #batch1 = np.array([[0,1], [1,2], [2,3]]).astype("int64").reshape(batch_size,2,1)
-        #batch2 = np.array([[1,2], [2,3], [3,4]]).astype("int64").reshape(batch_size,2,1)
-        batch1 = np.ones(
-            (batch_size, 2, 1)).astype("int64").reshape(batch_size, 2, 1)
-        batch2 = np.ones(
-            (batch_size, 2, 1)).astype("int64").reshape(batch_size, 2, 1)
-        data = [batch1, batch2]
-        filelist = []
-        for i in range(2):
-            filelist.append("test_pipeline_input_" + str(i))
-        for f in filelist:
-            with open(f, "wb") as fout:
-                for batch_data in data:
-                    for ins in batch_data:
-                        for slot in ins:
-                            binary_print(slot, fout)
-
-        dataset = fluid.DatasetFactory().create_dataset("FileInstantDataset")
-        dataset.set_use_var([x, y])
-        dataset.set_batch_size(batch_size)
-        dataset.set_filelist(filelist)
-
-        for epoch in range(1):
-            exe.train_from_dataset(
-                fluid.default_main_program(),
-                dataset,
-                thread=1,
-                debug=False,
-                fetch_list=[],
-                fetch_info=[],
-                print_period=1)
-
-        for f in filelist:
-            os.remove(f)
-
-    def single_section(self, random_dump):
-        program = fluid.Program()
-        with fluid.program_guard(program):
+    def test_pipeline_noneoptimizer(self):
+        with fluid.device_guard("gpu:0"):
             x = fluid.layers.data(
                 name='x', shape=[1], dtype='int64', lod_level=0)
             y = fluid.layers.data(
@@ -159,94 +214,18 @@ class TestPipeline(unittest.TestCase):
                 param_attr=fluid.ParamAttr(name="embx"),
                 size=[10, 2],
                 is_sparse=False)
-            emb_y = layers.embedding(
-                input=y,
-                param_attr=fluid.ParamAttr(
-                    name="emby", learning_rate=0.9),
-                size=[10, 2],
-                is_sparse=False)
-
-            concat = layers.concat([emb_x, emb_y], axis=1)
 
-            fc = layers.fc(input=concat,
+            fc = layers.fc(input=emb_x,
                            name="fc",
                            size=1,
                            num_flatten_dims=1,
                            bias_attr=False)
             loss = layers.reduce_mean(fc)
 
-            optimizer = fluid.optimizer.SGD(learning_rate=0.5)
+        optimizer = fluid.optimizer.SGD(learning_rate=0.5)
+        with self.assertRaises(ValueError):
             optimizer = fluid.optimizer.PipelineOptimizer(
-                optimizer,
-                cut_list=[],
-                #place_list=[fluid.CPUPlace()],
-                place_list=[fluid.CUDAPlace(0)],
-                concurrency_list=[1],
-                queue_size=1,
-                sync_steps=-1)
-            optimizer.minimize(loss)
-
-            program._pipeline_opt["dump_fields"] = ["fc.tmp_0", "fc.tmp_0@GRAD"]
-            program._pipeline_opt["dump_fields_path"] = "./dump_log/"
-            program._pipeline_opt["dump_param"] = ["embx"]
-            program._pipeline_opt["enable_random_dump"] = random_dump
-            program._pipeline_opt["dump_interval"] = 10
-            program._pipeline_opt["random_with_lineid"] = False
-            #print(program._pipeline_opt)
-            place = fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            exe.run(fluid.default_startup_program())
-            #prepare data
-            batch_size = 100
-
-            def binary_print(slot, fout):
-                num = np.int16(len(slot) + 1)
-                num.tofile(fout)
-                a = np.int64(batch_size)
-                a.tofile(fout)
-                slot.tofile(fout)
-
-            #batch1 = np.array([[0,1], [1,2], [2,3]]).astype("int64").reshape(batch_size,2,1)
-            #batch2 = np.array([[1,2], [2,3], [3,4]]).astype("int64").reshape(batch_size,2,1)
-            batch1 = np.ones(
-                (batch_size, 2, 1)).astype("int64").reshape(batch_size, 2, 1)
-            batch2 = np.ones(
-                (batch_size, 2, 1)).astype("int64").reshape(batch_size, 2, 1)
-            data = [batch1, batch2]
-            filelist = []
-            for i in range(2):
-                filelist.append("test_pipeline_input_" + str(i))
-            for f in filelist:
-                with open(f, "wb") as fout:
-                    for batch_data in data:
-                        for ins in batch_data:
-                            for slot in ins:
-                                binary_print(slot, fout)
-
-            dataset = fluid.DatasetFactory().create_dataset(
-                "FileInstantDataset")
-            dataset.set_use_var([x, y])
-            dataset.set_batch_size(batch_size)
-            dataset.set_filelist(filelist)
-
-            for epoch in range(1):
-                exe.train_from_dataset(
-                    fluid.default_main_program(),
-                    dataset,
-                    thread=1,
-                    debug=True,
-                    fetch_list=[],
-                    fetch_info=[],
-                    print_period=1)
-
-            for f in filelist:
-                os.remove(f)
-            if os.path.isdir("dump_log"):
-                shutil.rmtree("dump_log")
-
-    def test_pipeline(self):
-        self.single_section(True)
-        self.single_section(False)
+                dict(), num_microbatches=2)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_profiler.py b/python/paddle/fluid/tests/unittests/test_profiler.py
index 3296a11279b7c6b72488779084d2c7b0635da0f7..1b8852810f2fe43b13503782f1a819d278d8efd1 100644
--- a/python/paddle/fluid/tests/unittests/test_profiler.py
+++ b/python/paddle/fluid/tests/unittests/test_profiler.py
@@ -65,8 +65,13 @@ class TestProfiler(unittest.TestCase):
         opts = optimizer.minimize(avg_cost, startup_program=startup_program)
 
         if compile_program:
+            # TODO(luotao): profiler tool may have bug with multi-thread parallel executor.
+            # https://github.com/PaddlePaddle/Paddle/pull/25200#issuecomment-650483092
+            exec_strategy = fluid.ExecutionStrategy()
+            exec_strategy.num_threads = 1
             train_program = fluid.compiler.CompiledProgram(
-                main_program).with_data_parallel(loss_name=avg_cost.name)
+                main_program).with_data_parallel(
+                    loss_name=avg_cost.name, exec_strategy=exec_strategy)
         else:
             train_program = main_program
         return train_program, startup_program, avg_cost, batch_size, batch_acc
@@ -136,8 +141,9 @@ class TestProfiler(unittest.TestCase):
                     utils.get_profiler().record_step()
                     if batch_range is None and iter == 2:
                         utils.get_profiler().reset()
-
-        self.check_profile_result(profile_path)
+        # TODO(luotao): check why nccl kernel in profile result.
+        # https://github.com/PaddlePaddle/Paddle/pull/25200#issuecomment-650483092
+        # self.check_profile_result(profile_path)
 
     def test_cpu_profiler(self):
         exe = fluid.Executor(fluid.CPUPlace())
@@ -148,7 +154,6 @@ class TestProfiler(unittest.TestCase):
                 "Default",
                 batch_range=[5, 10],
                 use_new_api=use_new_api)
-            #self.net_profiler(exe, 'CPU', "Default", use_parallel_executor=True)
 
     @unittest.skipIf(not core.is_compiled_with_cuda(),
                      "profiler is enabled only with GPU")
@@ -161,8 +166,6 @@ class TestProfiler(unittest.TestCase):
                 "OpDetail",
                 batch_range=[0, 10],
                 use_new_api=use_new_api)
-            #self.net_profiler(
-            #    exe, 'GPU', "OpDetail", use_parallel_executor=True)
 
     @unittest.skipIf(not core.is_compiled_with_cuda(),
                      "profiler is enabled only with GPU")
@@ -175,8 +178,6 @@ class TestProfiler(unittest.TestCase):
                 "AllOpDetail",
                 batch_range=None,
                 use_new_api=use_new_api)
-            #self.net_profiler(
-            #    exe, 'All', "AllOpDetail", use_parallel_executor=True)
 
 
 class TestProfilerAPIError(unittest.TestCase):
diff --git a/python/paddle/fluid/tests/unittests/test_rand_op.py b/python/paddle/fluid/tests/unittests/test_rand_op.py
index 4725e2fae2d0b22919bfc30749373c56589c6598..c8e0130b77dc661d190f568ac501c9986a81f5e4 100644
--- a/python/paddle/fluid/tests/unittests/test_rand_op.py
+++ b/python/paddle/fluid/tests/unittests/test_rand_op.py
@@ -47,71 +47,73 @@ class TestRandOpError(unittest.TestCase):
 
             self.assertRaises(TypeError, test_dtype)
 
-            def test_shape_list():
-                rand(shape=[2.])
-
-            self.assertRaises(TypeError, test_shape_list)
-
-            def test_shape_list2():
-                rand(shape=[2, 3.])
-
-            self.assertRaises(TypeError, test_shape_list2)
-
-            def test_device():
-                rand(shape=[3, 4], device='device')
-
-            self.assertRaises(ValueError, test_device)
-
 
 class TestRandOp(unittest.TestCase):
     """
     This class test the common usages of randop.
-
     """
 
-    def test_run(self):
-        use_cuda = False
+    def run_net(self, use_cuda=False):
         place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
         exe = fluid.Executor(place)
 
         train_program = fluid.Program()
         startup_program = fluid.Program()
         with fluid.program_guard(train_program, startup_program):
-            result_1 = rand(shape=[3, 4])
+            result_0 = rand([3, 4])
+            result_1 = rand([3, 4], 'float64')
+
             dim_1 = fluid.layers.fill_constant([1], "int64", 3)
             dim_2 = fluid.layers.fill_constant([1], "int32", 5)
             result_2 = rand(shape=[dim_1, dim_2])
+
             var_shape = fluid.data(name='var_shape', shape=[2], dtype="int64")
             result_3 = rand(var_shape)
+
             var_shape_int32 = fluid.data(
                 name='var_shape_int32', shape=[2], dtype="int32")
             result_4 = rand(var_shape_int32)
+
         exe.run(startup_program)
 
         x1 = np.array([3, 2]).astype('int64')
         x2 = np.array([4, 3]).astype('int32')
-        ret = exe.run(train_program,
-                      feed={"var_shape": x1,
-                            "var_shape_int32": x2},
-                      fetch_list=[result_1, result_2, result_3, result_4])
+        ret = exe.run(
+            train_program,
+            feed={"var_shape": x1,
+                  "var_shape_int32": x2},
+            fetch_list=[result_1, result_1, result_2, result_3, result_4])
+
+    def test_run(self):
+        self.run_net(False)
+        if core.is_compiled_with_cuda():
+            self.run_net(True)
 
 
 class TestRandOpForDygraph(unittest.TestCase):
     """
     This class test the common usages of randop.
-
     """
 
-    def test_run(self):
-        use_cuda = False
-        with fluid.dygraph.guard():
-            rand(shape=[3, 4])
+    def run_net(self, use_cuda=False):
+        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+        with fluid.dygraph.guard(place):
+            rand([3, 4])
+
+            rand([3, 4], 'float64')
+
             dim_1 = fluid.layers.fill_constant([1], "int64", 3)
             dim_2 = fluid.layers.fill_constant([1], "int32", 5)
             rand(shape=[dim_1, dim_2])
+
             var_shape = fluid.dygraph.to_variable(np.array([3, 4]))
             rand(var_shape)
 
+    def test_run(self):
+        self.run_net(False)
+        if core.is_compiled_with_cuda():
+            self.run_net(True)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_randint_op.py b/python/paddle/fluid/tests/unittests/test_randint_op.py
index 40c9480a2c9958a997303e0298e631c87ffa3586..89739a37fd95b1eceb9a1899086975e3a03e98a7 100644
--- a/python/paddle/fluid/tests/unittests/test_randint_op.py
+++ b/python/paddle/fluid/tests/unittests/test_randint_op.py
@@ -17,12 +17,9 @@ from __future__ import print_function
 import unittest
 import numpy as np
 from op_test import OpTest
-
-import paddle.fluid.core as core
-from paddle.fluid.op import Operator
-import paddle.fluid as fluid
-from paddle.fluid import Program, program_guard
 import paddle
+from paddle.fluid import core
+from paddle import Program, program_guard
 
 
 def output_hist(out):
@@ -56,25 +53,10 @@ class TestRandintOp(OpTest):
 
 class TestRandintOpError(unittest.TestCase):
     def test_errors(self):
-        main_prog = Program()
-        start_prog = Program()
-        with program_guard(main_prog, start_prog):
-
-            def test_shape():
-                shape = np.array([2, 3])
-                paddle.randint(5, shape=shape, dtype='int32')
-
-            self.assertRaises(TypeError, test_shape)
-
-            def test_dtype():
-                paddle.randint(5, shape=[32, 32], dtype='float32')
-
-            self.assertRaises(TypeError, test_dtype)
-
-            def test_low_high():
-                paddle.randint(low=5, high=5, shape=[32, 32], dtype='int32')
-
-            self.assertRaises(ValueError, test_low_high)
+        with program_guard(Program(), Program()):
+            self.assertRaises(TypeError, paddle.randint, 5, shape=np.array([2]))
+            self.assertRaises(TypeError, paddle.randint, 5, dtype='float32')
+            self.assertRaises(ValueError, paddle.randint, 5, 5)
 
 
 class TestRandintOp_attr_tensorlist(OpTest):
@@ -127,46 +109,44 @@ class TestRandint_attr_tensor(OpTest):
 # Test python API
 class TestRandintAPI(unittest.TestCase):
     def test_api(self):
-        startup_program = fluid.Program()
-        train_program = fluid.Program()
-        with fluid.program_guard(train_program, startup_program):
+        with program_guard(Program(), Program()):
             # results are from [0, 5).
-            output1 = paddle.randint(5)
+            out1 = paddle.randint(5)
             # shape is a list and dtype is 'int32'
-            output2 = paddle.randint(
+            out2 = paddle.randint(
                 low=-100, high=100, shape=[64, 64], dtype='int32')
             # shape is a tuple and dtype is 'int64'
-            output3 = paddle.randint(
+            out3 = paddle.randint(
                 low=-100, high=100, shape=(32, 32, 3), dtype='int64')
             # shape is a tensorlist and dtype is 'float32'
-            dim_1 = fluid.layers.fill_constant([1], "int64", 32)
-            dim_2 = fluid.layers.fill_constant([1], "int32", 50)
-            output4 = paddle.randint(
-                low=-100, high=100, shape=[dim_1, 5], dtype='int32')
+            dim_1 = paddle.fill_constant([1], "int64", 32)
+            dim_2 = paddle.fill_constant([1], "int32", 50)
+            out4 = paddle.randint(
+                low=-100, high=100, shape=[dim_1, 5, dim_2], dtype='int32')
             # shape is a tensor and dtype is 'float64'
-            var_shape = fluid.data(name='var_shape', shape=[2], dtype="int64")
-            output5 = paddle.randint(
+            var_shape = paddle.nn.data(
+                name='var_shape', shape=[2], dtype="int64")
+            out5 = paddle.randint(
                 low=1, high=1000, shape=var_shape, dtype='int64')
 
-            place = fluid.CPUPlace()
-            if fluid.core.is_compiled_with_cuda():
-                place = fluid.CUDAPlace(0)
-            exe = fluid.Executor(place)
-
-            exe.run(startup_program)
+            place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
+            ) else paddle.CPUPlace()
+            exe = paddle.Executor(place)
             outs = exe.run(
-                train_program,
                 feed={'var_shape': np.array([100, 100]).astype('int64')},
-                fetch_list=[output1, output2, output3, output4, output5])
+                fetch_list=[out1, out2, out3, out4, out5])
 
 
-class TestRandintDygraphMode(unittest.TestCase):
-    def test_check_output(self):
-        with fluid.dygraph.guard():
-            x = paddle.randint(10, shape=[10], dtype="int32")
-            x_np = x.numpy()
-            for i in range(10):
-                self.assertTrue((x_np[i] >= 0 and x_np[i] < 10))
+class TestRandintImperative(unittest.TestCase):
+    def test_api(self):
+        n = 10
+        with paddle.imperative.guard():
+            x1 = paddle.randint(n, shape=[10], dtype="int32")
+            x2 = paddle.tensor.randint(n)
+            x3 = paddle.tensor.random.randint(n)
+            for i in [x1, x2, x3]:
+                for j in i.numpy().tolist():
+                    self.assertTrue((j >= 0 and j < n))
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_randn_op.py b/python/paddle/fluid/tests/unittests/test_randn_op.py
index 808e5a08fd65ea061c04bbfd9e3994b7b6becb0b..f65cc6dc53b7e3541016447d8510bd3d38a53b17 100644
--- a/python/paddle/fluid/tests/unittests/test_randn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_randn_op.py
@@ -17,92 +17,71 @@ from __future__ import print_function
 import unittest
 import numpy as np
 import paddle
-import paddle.fluid as fluid
 import paddle.fluid.core as core
-from paddle.fluid import Program, program_guard
+from paddle import Program, program_guard
 
 
 class TestRandnOp(unittest.TestCase):
     def test_api(self):
-        x1 = paddle.randn(shape=[1000, 784], dtype='float32')
-        x2 = paddle.randn(shape=[1000, 784], dtype='float64')
-        x3 = fluid.layers.fill_constant(
-            shape=[1000, 784], dtype='float32', value=0)
-        paddle.randn(shape=[1000, 784], out=x3, dtype='float32')
-        x4 = paddle.randn(shape=[1000, 784], dtype='float32', device='cpu')
-        x5 = paddle.randn(shape=[1000, 784], dtype='float32', device='gpu')
-        x6 = paddle.randn(
-            shape=[1000, 784],
-            dtype='float32',
-            device='gpu',
-            stop_gradient=False)
-
-        place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        res = exe.run(fluid.default_main_program(),
-                      feed={},
-                      fetch_list=[x1, x2, x3, x4, x5, x6])
-
-        self.assertAlmostEqual(np.mean(res[0]), .0, delta=0.1)
-        self.assertAlmostEqual(np.std(res[0]), 1., delta=0.1)
-        self.assertAlmostEqual(np.mean(res[1]), .0, delta=0.1)
-        self.assertAlmostEqual(np.std(res[1]), 1., delta=0.1)
-        self.assertAlmostEqual(np.mean(res[2]), .0, delta=0.1)
-        self.assertAlmostEqual(np.std(res[2]), 1., delta=0.1)
-        self.assertAlmostEqual(np.mean(res[3]), .0, delta=0.1)
-        self.assertAlmostEqual(np.std(res[3]), 1., delta=0.1)
-        self.assertAlmostEqual(np.mean(res[4]), .0, delta=0.1)
-        self.assertAlmostEqual(np.std(res[4]), 1., delta=0.1)
-        self.assertAlmostEqual(np.mean(res[5]), .0, delta=0.1)
-        self.assertAlmostEqual(np.std(res[5]), 1., delta=0.1)
+        shape = [1000, 784]
+        train_program = Program()
+        startup_program = Program()
+        with program_guard(train_program, startup_program):
+            x1 = paddle.randn(shape, 'float32')
+            x2 = paddle.randn(shape, 'float64')
+
+            dim_1 = paddle.fill_constant([1], "int64", 20)
+            dim_2 = paddle.fill_constant([1], "int32", 50)
+            x3 = paddle.randn([dim_1, dim_2, 784])
+
+            var_shape = paddle.nn.data('X', [2], 'int32')
+            x4 = paddle.randn(var_shape)
+
+        place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
+        ) else paddle.CPUPlace()
+        exe = paddle.Executor(place)
+        res = exe.run(train_program,
+                      feed={'X': np.array(
+                          shape, dtype='int32')},
+                      fetch_list=[x1, x2, x3, x4])
+
+        for out in res:
+            self.assertAlmostEqual(np.mean(out), .0, delta=0.1)
+            self.assertAlmostEqual(np.std(out), 1., delta=0.1)
+
+
+class TestRandnOpForDygraph(unittest.TestCase):
+    def test_api(self):
+        shape = [1000, 784]
+        place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
+        ) else paddle.CPUPlace()
+        with paddle.imperative.guard(place):
+            x1 = paddle.randn(shape, 'float32')
+            x2 = paddle.randn(shape, 'float64')
+
+            dim_1 = paddle.fill_constant([1], "int64", 20)
+            dim_2 = paddle.fill_constant([1], "int32", 50)
+            x3 = paddle.randn(shape=[dim_1, dim_2, 784])
+
+            var_shape = paddle.imperative.to_variable(np.array(shape))
+            x4 = paddle.randn(var_shape)
+
+            for out in [x1, x2, x3, x4]:
+                self.assertAlmostEqual(np.mean(out.numpy()), .0, delta=0.1)
+                self.assertAlmostEqual(np.std(out.numpy()), 1., delta=0.1)
 
 
 class TestRandnOpError(unittest.TestCase):
     def test_error(self):
         with program_guard(Program(), Program()):
-
             # The argument shape's size of randn_op should not be 0.
-            def test_shape_size():
-                out = paddle.randn(shape=[])
-
-            self.assertRaises(AssertionError, test_shape_size)
+            self.assertRaises(AssertionError, paddle.randn, [])
 
             # The argument shape's type of randn_op should be list or tuple.
-            def test_shape_type():
-                out = paddle.randn(shape=1)
-
-            self.assertRaises(TypeError, test_shape_type)
-
-            # The argument dtype of randn_op should be float32 or float64.
-            def test_dtype_float16():
-                out = paddle.randn(shape=[1, 2], dtype='float16')
-
-            self.assertRaises(TypeError, test_dtype_float16)
+            self.assertRaises(TypeError, paddle.randn, 1)
 
             # The argument dtype of randn_op should be float32 or float64.
-            def test_dtype_int32():
-                out = paddle.randn(shape=[1, 2], dtype='int32')
-
-            self.assertRaises(TypeError, test_dtype_int32)
-
-            # The argument dtype of randn_op should be float32 or float64.
-            def test_dtype_int64():
-                out = paddle.randn(shape=[1, 2], dtype='int64')
-
-            self.assertRaises(TypeError, test_dtype_int64)
-
-            # The argument dtype of randn_op should be float32 or float64.
-            def test_dtype_uint8():
-                out = paddle.randn(shape=[1, 2], dtype='uint8')
-
-            self.assertRaises(TypeError, test_dtype_uint8)
-
-            # The argument dtype of randn_op should be float32 or float64.
-            def test_dtype_bool():
-                out = paddle.randn(shape=[1, 2], dtype='bool')
-
-            self.assertRaises(TypeError, test_dtype_bool)
+            self.assertRaises(TypeError, paddle.randn, [1, 2], 'int32')
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_randperm_op.py b/python/paddle/fluid/tests/unittests/test_randperm_op.py
index 2fbdc83f3abffb7f832e0b0396b745635cc47a00..6938b8ef1e051777c867796062e5e7cbed6d7fa4 100644
--- a/python/paddle/fluid/tests/unittests/test_randperm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_randperm_op.py
@@ -16,10 +16,8 @@ import unittest
 import numpy as np
 from op_test import OpTest
 import paddle
-import paddle.fluid as fluid
 import paddle.fluid.core as core
-from paddle.fluid.op import Operator
-from paddle.fluid import Program, program_guard
+from paddle import Program, program_guard
 
 
 def check_randperm_out(n, data_np):
@@ -36,8 +34,11 @@ def error_msg(data_np):
 
 
 def convert_dtype(dtype_str):
-    dtype_str_list = ["int32", "int64"]
-    dtype_num_list = [2, 3]
+    dtype_str_list = ["int32", "int64", "float32", "float64"]
+    dtype_num_list = [
+        core.VarDesc.VarType.INT32, core.VarDesc.VarType.INT64,
+        core.VarDesc.VarType.FP32, core.VarDesc.VarType.FP64
+    ]
     assert dtype_str in dtype_str_list, dtype_str + \
         " should in " + str(dtype_str_list)
     return dtype_num_list[dtype_str_list.index(dtype_str)]
@@ -50,8 +51,6 @@ class TestRandpermOp(OpTest):
         self.op_type = "randperm"
         self.n = 200
         self.dtype = "int64"
-        self.device = None
-        self.seed = 0
 
         self.inputs = {}
         self.outputs = {"Out": np.zeros((self.n)).astype(self.dtype)}
@@ -59,8 +58,6 @@ class TestRandpermOp(OpTest):
         self.attrs = {
             "n": self.n,
             "dtype": convert_dtype(self.dtype),
-            "device": self.device,
-            "seed": self.seed,
         }
 
     def init_attrs(self):
@@ -75,100 +72,60 @@ class TestRandpermOp(OpTest):
             check_randperm_out(self.n, out_np), msg=error_msg(out_np))
 
 
-class TestRandpermOp_attr_n(TestRandpermOp):
-    """ Test randperm op for attr n. """
-
+class TestRandpermOpN(TestRandpermOp):
     def init_attrs(self):
         self.n = 10000
 
 
-class TestRandpermOp_attr_int32(TestRandpermOp):
-    """ Test randperm op for attr int32 dtype. """
-
+class TestRandpermOpInt32(TestRandpermOp):
     def init_attrs(self):
         self.dtype = "int32"
 
 
-class TestRandpermOp_attr_device_cpu(TestRandpermOp):
-    """ Test randperm op for cpu device. """
-
+class TestRandpermOpFloat32(TestRandpermOp):
     def init_attrs(self):
-        self.device = "cpu"
+        self.dtype = "float32"
 
 
-class TestRandpermOp_attr_device_gpu(TestRandpermOp):
-    """ Test randperm op for gpu device. """
-
+class TestRandpermOpFloat64(TestRandpermOp):
     def init_attrs(self):
-        self.device = "gpu"
-
-
-class TestRandpermOp_attr_seed(TestRandpermOp):
-    """ Test randperm op for attr seed. """
-
-    def init_attrs(self):
-        self.seed = 10
+        self.dtype = "float64"
 
 
 class TestRandpermOpError(unittest.TestCase):
-    """ Test randperm op for raise error. """
-
     def test_errors(self):
-        main_prog = Program()
-        start_prog = Program()
-        with program_guard(main_prog, start_prog):
+        with program_guard(Program(), Program()):
+            self.assertRaises(ValueError, paddle.randperm, -3)
+            self.assertRaises(TypeError, paddle.randperm, 10, 'int8')
 
-            def test_Variable():
-                out = np.arange(10)
-                paddle.randperm(n=10, out=out)
 
-            self.assertRaises(TypeError, test_Variable)
+class TestRandpermAPI(unittest.TestCase):
+    def test_out(self):
+        n = 10
+        place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
+        ) else paddle.CPUPlace()
+        with program_guard(Program(), Program()):
+            x1 = paddle.randperm(n)
+            x2 = paddle.randperm(n, 'float32')
 
-            def test_value():
-                paddle.randperm(n=-3)
+            exe = paddle.Executor(place)
+            res = exe.run(fetch_list=[x1, x2])
 
-            self.assertRaises(ValueError, test_value)
+            self.assertEqual(res[0].dtype, np.int64)
+            self.assertEqual(res[1].dtype, np.float32)
+            self.assertTrue(check_randperm_out(n, res[0]))
+            self.assertTrue(check_randperm_out(n, res[1]))
 
 
-class TestRandpermOp_attr_out(unittest.TestCase):
-    """ Test randperm op for attr out. """
-
-    def test_attr_tensor_API(self):
-        startup_program = fluid.Program()
-        train_program = fluid.Program()
-        with fluid.program_guard(train_program, startup_program):
-            n = 10
-            data_1 = fluid.layers.fill_constant([n], "int64", 3)
-            paddle.randperm(n=n, out=data_1)
-
-            data_2 = paddle.randperm(n=n, dtype="int32", device="cpu")
-
-            place = fluid.CPUPlace()
-            if fluid.core.is_compiled_with_cuda():
-                place = fluid.CUDAPlace(0)
-            exe = fluid.Executor(place)
-
-            exe.run(startup_program)
-            outs = exe.run(train_program, fetch_list=[data_1, data_2])
-
-            out_np = np.array(outs[0])
-            self.assertTrue(
-                check_randperm_out(n, out_np), msg=error_msg(out_np))
-
-
-class TestRandpermDygraphMode(unittest.TestCase):
-    def test_check_output(self):
-        with fluid.dygraph.guard():
+class TestRandpermImperative(unittest.TestCase):
+    def test_out(self):
+        with paddle.imperative.guard():
             n = 10
-            data_1 = paddle.randperm(n, dtype="int64")
-            data_1_np = data_1.numpy()
-            self.assertTrue(
-                check_randperm_out(n, data_1_np), msg=error_msg(data_1_np))
-
-            data_2 = paddle.randperm(n, dtype="int32", device="cpu")
-            data_2_np = data_2.numpy()
-            self.assertTrue(
-                check_randperm_out(n, data_2_np), msg=error_msg(data_2_np))
+            for dtype in ['int32', np.int64, 'float32', 'float64']:
+                data_p = paddle.randperm(n, dtype)
+                data_np = data_p.numpy()
+                self.assertTrue(
+                    check_randperm_out(n, data_np), msg=error_msg(data_np))
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_roll_op.py b/python/paddle/fluid/tests/unittests/test_roll_op.py
index d05fc45928fb7a924a734c2b88a532ba32bd5925..b20293adf4c4068320c1111d4a7642851c84b7b4 100644
--- a/python/paddle/fluid/tests/unittests/test_roll_op.py
+++ b/python/paddle/fluid/tests/unittests/test_roll_op.py
@@ -28,17 +28,17 @@ class TestRollOp(OpTest):
         self.op_type = "roll"
         self.init_dtype_type()
         self.inputs = {'X': np.random.random(self.x_shape).astype(self.dtype)}
-        self.attrs = {'shifts': self.shifts, 'dims': self.dims}
+        self.attrs = {'shifts': self.shifts, 'axis': self.axis}
         self.outputs = {
             'Out': np.roll(self.inputs['X'], self.attrs['shifts'],
-                           self.attrs['dims'])
+                           self.attrs['axis'])
         }
 
     def init_dtype_type(self):
         self.dtype = np.float64
         self.x_shape = (100, 4, 5)
         self.shifts = [101, -1]
-        self.dims = [0, -2]
+        self.axis = [0, -2]
 
     def test_check_output(self):
         self.check_output()
@@ -50,9 +50,9 @@ class TestRollOp(OpTest):
 class TestRollOpCase2(TestRollOp):
     def init_dtype_type(self):
         self.dtype = np.float32
-        self.x_shape = (100, 100, 5)
+        self.x_shape = (100, 10, 5)
         self.shifts = [8, -1]
-        self.dims = [-1, -2]
+        self.axis = [-1, -2]
 
 
 class TestRollAPI(unittest.TestCase):
@@ -60,7 +60,7 @@ class TestRollAPI(unittest.TestCase):
         self.data_x = np.array(
             [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]])
 
-    def test_index_select_api(self):
+    def test_roll_op_api(self):
         self.input_data()
 
         # case 1:
@@ -78,7 +78,7 @@ class TestRollAPI(unittest.TestCase):
         # case 2:
         with program_guard(Program(), Program()):
             x = fluid.layers.data(name='x', shape=[-1, 3])
-            z = paddle.roll(x, shifts=1, dims=0)
+            z = paddle.roll(x, shifts=1, axis=0)
             exe = fluid.Executor(fluid.CPUPlace())
             res, = exe.run(feed={'x': self.data_x},
                            fetch_list=[z.name],
@@ -101,12 +101,26 @@ class TestRollAPI(unittest.TestCase):
         # case 2:
         with fluid.dygraph.guard():
             x = fluid.dygraph.to_variable(self.data_x)
-            z = paddle.roll(x, shifts=1, dims=0)
+            z = paddle.roll(x, shifts=1, axis=0)
             np_z = z.numpy()
         expect_out = np.array([[7.0, 8.0, 9.0], [1.0, 2.0, 3.0],
                                [4.0, 5.0, 6.0]])
         self.assertTrue(np.allclose(expect_out, np_z))
 
+    def test_roll_op_false(self):
+        self.input_data()
+
+        def test_axis_out_range():
+            with program_guard(Program(), Program()):
+                x = fluid.layers.data(name='x', shape=[-1, 3])
+                z = paddle.roll(x, shifts=1, axis=10)
+                exe = fluid.Executor(fluid.CPUPlace())
+                res, = exe.run(feed={'x': self.data_x},
+                               fetch_list=[z.name],
+                               return_numpy=False)
+
+        self.assertRaises(ValueError, test_axis_out_range)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_softmax_op.py b/python/paddle/fluid/tests/unittests/test_softmax_op.py
index c393b55d7bd2cecbb89e271555c901e81ff7eadd..1df50d63e3f67424ed1f42b94c317030ed69c6e9 100644
--- a/python/paddle/fluid/tests/unittests/test_softmax_op.py
+++ b/python/paddle/fluid/tests/unittests/test_softmax_op.py
@@ -47,6 +47,7 @@ class TestSoftmaxOp(OpTest):
         self.shape = self.get_x_shape()
         self.axis = self.get_axis()
 
+        np.random.seed(0)
         x = np.random.uniform(0.1, 1, self.shape).astype(self.dtype)
         out = np.apply_along_axis(stable_softmax, self.axis, x)
 
@@ -180,8 +181,7 @@ class TestSoftmaxFP16Op(TestSoftmaxOp):
         pass
 
 
-@unittest.skipIf(not core.is_compiled_with_cuda(),
-                 "core is not compiled with CUDA")
+@unittest.skip('disable TestSoftmaxFP16Op2')
 class TestSoftmaxFP16Op2(TestSoftmaxOp):
     def init_kernel_type(self):
         self.dtype = np.float16
diff --git a/python/paddle/fluid/tests/unittests/test_trace_op.py b/python/paddle/fluid/tests/unittests/test_trace_op.py
index 5d96d149a08afb3109194303b8144b9487062eb8..7441ff24329fa5a5f57e253ba480d1c7d9ab647c 100644
--- a/python/paddle/fluid/tests/unittests/test_trace_op.py
+++ b/python/paddle/fluid/tests/unittests/test_trace_op.py
@@ -38,7 +38,7 @@ class TestTraceOp(OpTest):
     def init_config(self):
         self.case = np.random.randn(20, 6).astype('float64')
         self.inputs = {'Input': self.case}
-        self.attrs = {'offset': 0, 'dim1': 0, 'dim2': 1}
+        self.attrs = {'offset': 0, 'axis1': 0, 'axis2': 1}
         self.target = np.trace(self.inputs['Input'])
 
 
@@ -46,24 +46,24 @@ class TestTraceOpCase1(TestTraceOp):
     def init_config(self):
         self.case = np.random.randn(2, 20, 2, 3).astype('float32')
         self.inputs = {'Input': self.case}
-        self.attrs = {'offset': 1, 'dim1': 0, 'dim2': 2}
+        self.attrs = {'offset': 1, 'axis1': 0, 'axis2': 2}
         self.target = np.trace(
             self.inputs['Input'],
             offset=self.attrs['offset'],
-            axis1=self.attrs['dim1'],
-            axis2=self.attrs['dim2'])
+            axis1=self.attrs['axis1'],
+            axis2=self.attrs['axis2'])
 
 
 class TestTraceOpCase2(TestTraceOp):
     def init_config(self):
         self.case = np.random.randn(2, 20, 2, 3).astype('float32')
         self.inputs = {'Input': self.case}
-        self.attrs = {'offset': -5, 'dim1': 1, 'dim2': -1}
+        self.attrs = {'offset': -5, 'axis1': 1, 'axis2': -1}
         self.target = np.trace(
             self.inputs['Input'],
             offset=self.attrs['offset'],
-            axis1=self.attrs['dim1'],
-            axis2=self.attrs['dim2'])
+            axis1=self.attrs['axis1'],
+            axis2=self.attrs['axis2'])
 
 
 class TestTraceAPICase(unittest.TestCase):
@@ -71,7 +71,7 @@ class TestTraceAPICase(unittest.TestCase):
         case = np.random.randn(2, 20, 2, 3).astype('float32')
         data1 = fluid.data(name='data1', shape=[2, 20, 2, 3], dtype='float32')
         out1 = tensor.trace(data1)
-        out2 = tensor.trace(data1, offset=-5, dim1=1, dim2=-1)
+        out2 = tensor.trace(data1, offset=-5, axis1=1, axis2=-1)
 
         place = core.CPUPlace()
         exe = fluid.Executor(place)
diff --git a/python/paddle/fluid/tests/unittests/test_unique_name.py b/python/paddle/fluid/tests/unittests/test_unique_name.py
index b8c751b2e9b5a905d9de40fc5f78a02c6ca5e034..8f116db855b05079b6b6f9ac6805ae7ee3467252 100644
--- a/python/paddle/fluid/tests/unittests/test_unique_name.py
+++ b/python/paddle/fluid/tests/unittests/test_unique_name.py
@@ -43,3 +43,18 @@ class TestUniqueName(unittest.TestCase):
             name3 = fluid.unique_name.generate('tmp')
             self.assertNotEqual(name1, name2)
             self.assertEqual(name1[-2:], name3[-2:])
+
+
+class TestImperativeUniqueName(unittest.TestCase):
+    def test_name_generator(self):
+        with fluid.dygraph.guard():
+            tracer = fluid.framework._dygraph_tracer()
+            tmp_var_0 = tracer._generate_unique_name()
+            self.assertEqual(tmp_var_0, "eager_tmp_0")
+
+            tmp_var_1 = tracer._generate_unique_name("eager_tmp")
+            self.assertEqual(tmp_var_1, "eager_tmp_1")
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_var_base.py b/python/paddle/fluid/tests/unittests/test_var_base.py
index ebd357106c3320e376861755d50632119a2602e9..7e565ca31b219366b7ab83267b46f32e5812d983 100644
--- a/python/paddle/fluid/tests/unittests/test_var_base.py
+++ b/python/paddle/fluid/tests/unittests/test_var_base.py
@@ -47,6 +47,24 @@ class TestVarBase(unittest.TestCase):
                 linear = fluid.dygraph.Linear(32, 64)
                 var = linear._helper.to_variable("test", name="abc")
 
+    def test_list_to_variable(self):
+        with fluid.dygraph.guard():
+            array = [[[1, 2], [1, 2], [1.0, 2]], [[1, 2], [1, 2], [1, 2]]]
+            var = fluid.dygraph.to_variable(array, dtype='int32')
+            self.assertTrue(np.array_equal(var.numpy(), array))
+            self.assertEqual(var.shape, [2, 3, 2])
+            self.assertEqual(var.dtype, core.VarDesc.VarType.INT32)
+            self.assertEqual(var.type, core.VarDesc.VarType.LOD_TENSOR)
+
+    def test_tuple_to_variable(self):
+        with fluid.dygraph.guard():
+            array = (((1, 2), (1, 2), (1, 2)), ((1, 2), (1, 2), (1, 2)))
+            var = fluid.dygraph.to_variable(array, dtype='float32')
+            self.assertTrue(np.array_equal(var.numpy(), array))
+            self.assertEqual(var.shape, [2, 3, 2])
+            self.assertEqual(var.dtype, core.VarDesc.VarType.FP32)
+            self.assertEqual(var.type, core.VarDesc.VarType.LOD_TENSOR)
+
     def test_tensor_to_variable(self):
         with fluid.dygraph.guard():
             t = fluid.Tensor()
@@ -84,7 +102,7 @@ class TestVarBase(unittest.TestCase):
     def test_to_string(self):
         with fluid.dygraph.guard():
             var = fluid.dygraph.to_variable(self.array)
-            self.assertTrue(isinstance(str(var.to_string(True)), str))
+            self.assertTrue(isinstance(str(var), str))
 
     def test_backward(self):
         with fluid.dygraph.guard():
@@ -233,6 +251,52 @@ class TestVarBase(unittest.TestCase):
             assert bool(var1) == False, "bool(var1) is False"
             assert bool(var2) == True, "bool(var2) is True"
 
+    def test_to_static_var(self):
+        with fluid.dygraph.guard():
+            # Convert VarBase into Variable or Parameter
+            var_base = fluid.dygraph.to_variable(self.array, name="var_base_1")
+            static_var = var_base._to_static_var()
+            self._assert_to_static(var_base, static_var)
+
+            var_base = fluid.dygraph.to_variable(self.array, name="var_base_2")
+            static_param = var_base._to_static_var(to_parameter=True)
+            self._assert_to_static(var_base, static_param, True)
+
+            # Convert ParamBase into Parameter
+            fc = fluid.dygraph.Linear(
+                10,
+                20,
+                param_attr=fluid.ParamAttr(
+                    learning_rate=0.001,
+                    do_model_average=True,
+                    regularizer=fluid.regularizer.L1Decay()))
+            weight = fc.parameters()[0]
+            static_param = weight._to_static_var()
+            self._assert_to_static(weight, static_param, True)
+
+    def _assert_to_static(self, var_base, static_var, is_param=False):
+        if is_param:
+            self.assertTrue(isinstance(static_var, fluid.framework.Parameter))
+            self.assertTrue(static_var.persistable, True)
+            if isinstance(var_base, fluid.framework.ParamBase):
+                for attr in ['trainable', 'is_distributed', 'do_model_average']:
+                    self.assertEqual(
+                        getattr(var_base, attr), getattr(static_var, attr))
+
+                self.assertEqual(static_var.optimize_attr['learning_rate'],
+                                 0.001)
+                self.assertTrue(
+                    isinstance(static_var.regularizer,
+                               fluid.regularizer.L1Decay))
+        else:
+            self.assertTrue(isinstance(static_var, fluid.framework.Variable))
+
+        attr_keys = ['block', 'dtype', 'type', 'name']
+        for attr in attr_keys:
+            self.assertEqual(getattr(var_base, attr), getattr(static_var, attr))
+
+        self.assertListEqual(list(var_base.shape), list(static_var.shape))
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_while_loop_op.py b/python/paddle/fluid/tests/unittests/test_while_loop_op.py
index 224dfd7f0a79a0808a045f9ed2a060d1281eb3da..aa692eb5367361f8746f13314f954ee6ffdba9d0 100644
--- a/python/paddle/fluid/tests/unittests/test_while_loop_op.py
+++ b/python/paddle/fluid/tests/unittests/test_while_loop_op.py
@@ -199,10 +199,16 @@ class TestApiWhileLoop_Backward(unittest.TestCase):
         def cond(i, x):
             return layers.less_than(i, eleven)
 
-        def body(i, x):
+        def body(j, x):
+            # TODO: In while block, if the var created in parent block
+            # participates in the calculation of gradient, the result of gradient
+            # is incorrect because each step scope always returns the same value
+            # generated by last step.
+            # Here we call `assign` op in while block to avoid this bug, and working on fixing it in next PR.
+            i = layers.assign(j)
             x = layers.elementwise_mul(x=i, y=i)
-            i = layers.increment(i)
-            return [i, x]
+            j = layers.increment(j)
+            return [j, x]
 
         main_program = Program()
         startup_program = Program()
@@ -232,7 +238,48 @@ class TestApiWhileLoop_Backward(unittest.TestCase):
                             'x': feed_x},
                       fetch_list=[mean.name, i.grad_name])
         self.assertTrue(np.allclose(np.asarray(res[0]), data))
-        self.assertTrue(np.allclose(np.asarray(res[1]), i_grad))
+        self.assertTrue(
+            np.allclose(np.asarray(res[1]), i_grad),
+            msg=" \nres = \n{} \n\n ans = \n{}".format(res[1], i_grad))
+
+    def test_while_loop_backward2(self):
+        def cond(i, x):
+            return i < 5
+
+        def body(i, x):
+            x = x + i
+            i = i + 1
+            return [i, x]
+
+        main_program = Program()
+        startup_program = Program()
+        with fluid.program_guard(main_program, startup_program):
+            i = fluid.data(name='i', shape=[1], dtype='float32')
+            i.stop_gradient = False
+            x = fluid.data(name='x', shape=[1], dtype='float32')
+            x.stop_gradient = False
+
+            out = layers.while_loop(cond, body, [i, x])
+            mean = layers.mean(out[1])
+            append_backward(mean)
+
+        place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        exe = fluid.Executor(place)
+
+        feed_i = np.ones(1).astype('float32')
+        feed_x = np.ones(1).astype('float32')
+        data = np.asarray([11]).astype('float32')
+        i_grad = np.asarray([1]).astype('float32')
+
+        res = exe.run(main_program,
+                      feed={'i': feed_i,
+                            'x': feed_x},
+                      fetch_list=[mean.name, i.grad_name])
+        self.assertTrue(np.allclose(np.asarray(res[0]), data))
+        self.assertTrue(
+            np.allclose(np.asarray(res[1]), i_grad),
+            msg=" \nres = \n{} \n\n ans = \n{}".format(res[1], i_grad))
 
 
 class TestApiWhileLoop_NestedWithBackwardAndLoDTensorArray(unittest.TestCase):
@@ -410,7 +457,7 @@ class TestApiWhileLoop_Error(unittest.TestCase):
             ten = layers.fill_constant(shape=[1], dtype='int64', value=10)
             ten_2d = layers.fill_constant(shape=[2, 2], dtype='int64', value=10)
 
-            # The type of `cond` in Op(while_loop) must be callable 
+            # The type of `cond` in Op(while_loop) must be callable
             def type_error_cond():
                 out = layers.while_loop(data, body, [data_1d])
 
diff --git a/python/paddle/fluid/tests/unittests/test_zeros_like_op.py b/python/paddle/fluid/tests/unittests/test_zeros_like_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..448751f19dbe76fdbd856d0464e36390c69aba41
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_zeros_like_op.py
@@ -0,0 +1,82 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+from paddle import zeros_like
+from paddle.fluid import core, Program, program_guard
+
+
+class TestZerosLikeAPIError(unittest.TestCase):
+    def test_errors(self):
+        with program_guard(Program(), Program()):
+            x = paddle.data('x', [3, 4])
+            self.assertRaises(TypeError, zeros_like, x, 'int8')
+
+
+class TestZerosLikeAPI(unittest.TestCase):
+    def test_api(self):
+        shape = [3, 4]
+        startup_program = Program()
+        train_program = Program()
+        with program_guard(train_program, startup_program):
+            x = paddle.data('X', shape)
+
+            # 'bool', 'float32', 'float64', 'int32', 'int64'
+            out1 = zeros_like(x)
+            out2 = zeros_like(x, np.bool)
+            out3 = zeros_like(x, 'float64')
+            out4 = zeros_like(x, 'int32')
+            out5 = zeros_like(x, 'int64')
+
+        place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        outs = exe.run(train_program,
+                       feed={'X': np.ones(shape).astype('float32')},
+                       fetch_list=[out1, out2, out3, out4, out5])
+
+        for i, dtype in enumerate(
+            [np.float32, np.bool, np.float64, np.int32, np.int64]):
+            self.assertEqual(outs[i].dtype, dtype)
+            self.assertEqual((outs[i] == np.zeros(shape, dtype)).all(), True)
+
+
+class TestZerosLikeImpeartive(unittest.TestCase):
+    def test_out(self):
+        shape = [3, 4]
+        place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        with paddle.imperative.guard(place):
+            x = paddle.imperative.to_variable(np.ones(shape))
+            for dtype in [np.bool, np.float32, np.float64, np.int32, np.int64]:
+                out = zeros_like(x, dtype)
+                self.assertEqual((out.numpy() == np.zeros(shape, dtype)).all(),
+                                 True)
+
+            out = paddle.tensor.zeros_like(x)
+            self.assertEqual((out.numpy() == np.zeros(shape, dtype)).all(),
+                             True)
+
+            out = paddle.tensor.creation.zeros_like(x)
+            self.assertEqual((out.numpy() == np.zeros(shape, dtype)).all(),
+                             True)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_zeros_op.py b/python/paddle/fluid/tests/unittests/test_zeros_op.py
index 6ce39cd63767e3dac474a7b0b57bfec1438d5e09..b7f7d93418342b1001eaa82bd19d64a84035a254 100644
--- a/python/paddle/fluid/tests/unittests/test_zeros_op.py
+++ b/python/paddle/fluid/tests/unittests/test_zeros_op.py
@@ -18,6 +18,7 @@ import unittest
 import numpy as np
 from op_test import OpTest
 
+import paddle
 import paddle.fluid.core as core
 from paddle.fluid.op import Operator
 import paddle.fluid as fluid
@@ -33,5 +34,47 @@ class TestZerosOpError(unittest.TestCase):
             self.assertRaises(TypeError, fluid.layers.zeros, shape, dtype)
 
 
+class ApiZerosTest(unittest.TestCase):
+    def test_out(self):
+        with paddle.program_guard(fluid.Program()):
+            zeros = paddle.zeros(shape=[10], dtype="float64")
+            place = fluid.CPUPlace()
+            exe = fluid.Executor(place)
+            result, = exe.run(fetch_list=[zeros])
+            expected_result = np.zeros(10, dtype="float64")
+        self.assertEqual((result == expected_result).all(), True)
+
+        with paddle.program_guard(fluid.Program()):
+            zeros = paddle.zeros(shape=[10], dtype="int64")
+            place = fluid.CPUPlace()
+            exe = fluid.Executor(place)
+            result, = exe.run(fetch_list=[zeros])
+            expected_result = np.zeros(10, dtype="int64")
+        self.assertEqual((result == expected_result).all(), True)
+
+        with paddle.program_guard(fluid.Program()):
+            zeros = paddle.zeros(shape=[10], dtype="int64")
+            place = fluid.CPUPlace()
+            exe = fluid.Executor(place)
+            result, = exe.run(fetch_list=[zeros])
+            expected_result = np.zeros(10, dtype="int64")
+        self.assertEqual((result == expected_result).all(), True)
+
+
+class ApiZerosError(unittest.TestCase):
+    def test_errors(self):
+        def test_error1():
+            with paddle.program_guard(fluid.Program()):
+                ones = fluid.layers.zeros(shape=10, dtype="int64")
+
+        self.assertRaises(TypeError, test_error1)
+
+        def test_error2():
+            with paddle.program_guard(fluid.Program()):
+                ones = fluid.layers.zeros(shape=[10], dtype="int8")
+
+        self.assertRaises(TypeError, test_error2)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py b/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py
index 816e7c6ea09492d9d76148fee8b62c5e8650b376..19af0c92154e0fa1f631ef885588d640a338fe1b 100644
--- a/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py
+++ b/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py
@@ -17,6 +17,7 @@ no_check_set_white_list = [
     'fake_quantize_range_abs_max',
     'coalesce_tensor',
     'flatten2',
+    'lrn',
     'squeeze2',
     'reshape2',
     'transpose2',
diff --git a/python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py b/python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py
index ae99aeff557e4aa31f2868fbb8be9d038d5538ca..4629089e39c9489725340df2172c53ed0661708f 100644
--- a/python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py
+++ b/python/paddle/fluid/tests/unittests/white_list/op_accuracy_white_list.py
@@ -70,10 +70,12 @@ NO_FP64_CHECK_GRAD_OP_LIST = [
     'squared_l2_distance', \
     'squared_l2_norm', \
     'tanh', \
+    'mish', \
     'transpose2', \
     'trilinear_interp', \
     'var_conv_2d', \
-    'warpctc'
+    'warpctc', \
+    'bilateral_slice'
 ]
 
 NO_FP16_CHECK_GRAD_OP_LIST = [
diff --git a/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py b/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py
index fd3d5f3104f8243fdcae312620742688eb79b854..ce6868b5c70ae1218df48f899f936f57f6734582 100644
--- a/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py
+++ b/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py
@@ -40,7 +40,8 @@ NEED_FIX_FP64_CHECK_GRAD_THRESHOLD_OP_LIST = [
     'teacher_student_sigmoid_loss', \
     'unpool', \
     'yolov3_loss', \
-    'inverse'
+    'inverse', \
+    'bilateral_slice'
 ]
 
 NEED_FIX_FP64_CHECK_OUTPUT_THRESHOLD_OP_LIST = ['bilinear_interp']
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index 77985b111af756c14ec5aa92bf5f3a122afc8f59..0e879264f7460ad36b68cc98cc85ef225d8c4e77 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -50,8 +50,8 @@ from .details import delete_ops, find_op_by_output_arg
 from ..distribute_lookup_table import find_distributed_lookup_table
 from . import collective
 
-LOOKUP_TABLE_TYPE = "lookup_table"
-LOOKUP_TABLE_GRAD_TYPE = "lookup_table_grad"
+LOOKUP_TABLE_TYPE = ["lookup_table", "lookup_table_v2"]
+LOOKUP_TABLE_GRAD_TYPE = ["lookup_table_grad", "lookup_table_v2_grad"]
 OP_NAME_SCOPE = "op_namescope"
 CLIP_OP_NAME_SCOPE = "@CLIP"
 OP_ROLE_VAR_ATTR_NAME = core.op_proto_and_checker_maker.kOpRoleVarAttrName()
@@ -140,7 +140,7 @@ def slice_variable(var_list, slice_count, min_block_size):
 
 class DistributeTranspilerConfig(object):
     """
-	:api_attr: Static Graph
+        :api_attr: Static Graph
 
     A configuration class that provide support for transpiler distributed jobs.
     Some important parameters are explained as follows:
@@ -201,10 +201,10 @@ class DistributeTranspilerConfig(object):
     geo_sgd_need_push_nums = 100
 
     nccl_comm_num = 1
-    #The picture here illustrates the principle:
-    #https://github.com/PaddlePaddle/Paddle/pull/17263#discussion_r285411396
+    # The picture here illustrates the principle:
+    # https://github.com/PaddlePaddle/Paddle/pull/17263#discussion_r285411396
     use_hierarchical_allreduce = False
-    #Nccl ranks in a node when use hierarchical allreduce, it's set to gpu cards' number in most cases.
+    # Nccl ranks in a node when use hierarchical allreduce, it's set to gpu cards' number in most cases.
     hierarchical_allreduce_inter_nranks = 0
 
     # if mode is collective
@@ -255,7 +255,7 @@ class ServerRuntimeConfig(object):
 
 class DistributeTranspiler(object):
     """
-	:api_attr: Static Graph
+        :api_attr: Static Graph
 
     **DistributeTranspiler**
 
@@ -449,7 +449,7 @@ class DistributeTranspiler(object):
 
     def _get_all_remote_sparse_update_op(self, main_program):
         sparse_update_ops = []
-        sparse_update_op_types = ["lookup_table", "nce"]
+        sparse_update_op_types = ["lookup_table", "nce", "lookup_table_v2"]
         for op in main_program.global_block().ops:
             if op.type in sparse_update_op_types and op.attr(
                     'remote_prefetch') is True:
@@ -479,7 +479,7 @@ class DistributeTranspiler(object):
                     ops.append(op)
                     used_ops.append(idx)
 
-            if op_type == "lookup_table":
+            if op_type in LOOKUP_TABLE_TYPE:
                 all_ops = program.global_block().ops
                 op_idxs = [all_ops.index(op) for op in ops]
                 inputs = [
@@ -525,7 +525,8 @@ class DistributeTranspiler(object):
                             "height_sections": height_sections,
                             "endpoints": endpoints,
                             "padding_idx": padding_idx,
-                            "trainer_id": self.trainer_id
+                            "trainer_id": self.trainer_id,
+                            "lookup_table_version": op_type
                         })
                 else:
                     raise ValueError(
@@ -613,10 +614,12 @@ WIKI: https://github.com/PaddlePaddle/Fleet/blob/develop/markdown_doc/transpiler
                     )
 
                 assert trainers_num > self.config.hierarchical_allreduce_inter_nranks, \
-                    "trainers_num:{} < hierarchical_allreduce_inter_nranks:{}".format(trainers_num, self.config.hierarchical_allreduce_inter_nranks)
+                    "trainers_num:{} < hierarchical_allreduce_inter_nranks:{}".format(
+                        trainers_num, self.config.hierarchical_allreduce_inter_nranks)
 
                 assert trainers_num % self.config.hierarchical_allreduce_inter_nranks == 0, \
-                    "trainers_num:{} mod hierarchical_allreduce_inter_nranks:{} != 0".format(trainers_num, self.config.hierarchical_allreduce_inter_nranks)
+                    "trainers_num:{} mod hierarchical_allreduce_inter_nranks:{} != 0".format(
+                        trainers_num, self.config.hierarchical_allreduce_inter_nranks)
 
                 self.origin_program._hierarchical_allreduce_inter_nranks = \
                     int(self.config.hierarchical_allreduce_inter_nranks)
@@ -782,7 +785,7 @@ WIKI: https://github.com/PaddlePaddle/Fleet/blob/develop/markdown_doc/transpiler
                 decay_dummy_output = program.global_block().create_var(
                     name=framework.generate_control_dev_var_name())
                 if self.config.runtime_split_send_recv:
-                    ## async mode, using communicator to merge and send
+                    # async mode, using communicator to merge and send
                     send_varnames = [self.counter_var.name]
                 else:
                     send_varnames = []
@@ -1019,7 +1022,7 @@ WIKI: https://github.com/PaddlePaddle/Fleet/blob/develop/markdown_doc/transpiler
 
             - Delete optimizer related op, because parameter updated on Pserver
             - After the op which computed gradient of each parameter, add ``Send_op`` and ``Recv_op`` 
-        
+
         Args:
             wait_port(bool): Whether to wait for the parameter server to be ready before returning to program, 
             default is True
@@ -1076,7 +1079,7 @@ WIKI: https://github.com/PaddlePaddle/Fleet/blob/develop/markdown_doc/transpiler
         sparse_table_names = self._get_sparse_table_names()
 
         # self._fake_init_sparsetable(sparse_table_names)
-        #self._delete_trainer_optimizer(is_startup=True)
+        # self._delete_trainer_optimizer(is_startup=True)
 
         for varname, splited_var in six.iteritems(self.param_var_mapping):
             if varname in sparse_table_names:
@@ -1470,8 +1473,8 @@ WIKI: https://github.com/PaddlePaddle/Fleet/blob/develop/markdown_doc/transpiler
             Program: parameter server side startup program.
 
         Examples:
-	    .. code-block:: python
-            
+            .. code-block:: python
+
                 pserver_endpoints = "192.168.0.1:6174,192.168.0.2:6174"
                 trainer_endpoints = "192.168.0.1:6174,192.168.0.2:6174"
                 current_endpoint = "192.168.0.1:6174"
@@ -2665,7 +2668,7 @@ WIKI: https://github.com/PaddlePaddle/Fleet/blob/develop/markdown_doc/transpiler
         for op in block.ops:
             if self._is_opt_role_op(op):
                 # Todo(chengmo): Whether clip related op belongs to Optimize guard should be discussed
-                # delete clip op from opt_ops when run in Parameter Server mode 
+                # delete clip op from opt_ops when run in Parameter Server mode
                 if OP_NAME_SCOPE in op.all_attrs(
                 ) and CLIP_OP_NAME_SCOPE in op.attr(
                         OP_NAME_SCOPE
@@ -2696,7 +2699,7 @@ WIKI: https://github.com/PaddlePaddle/Fleet/blob/develop/markdown_doc/transpiler
         return opt_ops, params_grads
 
     def _get_distribute_update_vars(self):
-        #TODO(chengmo): find more powerful and simple way to deal with these special situation
+        # TODO(chengmo): find more powerful and simple way to deal with these special situation
         """
         This Function is used for a special model, like PyramidDnn which has pyramid hash op.
         Some Parameters don't use optimizing op to update its value, but updated in its BP process.
diff --git a/python/paddle/imperative/__init__.py b/python/paddle/imperative/__init__.py
index 79e9c57befed77d4f2ad5210c3f81cf1943226f3..489888a2fef39b2cca5b918a412d231784471ddc 100644
--- a/python/paddle/imperative/__init__.py
+++ b/python/paddle/imperative/__init__.py
@@ -16,7 +16,7 @@
 __all__ = [
     'BackwardStrategy', 'enabled', 'grad', 'guard', 'LayerList', 'load', 'save',
     'prepare_context', 'to_variable', 'TracedLayer', 'no_grad', 'ParallelEnv',
-    'ProgramTranslator', 'declarative', 'DataParallel'
+    'ProgramTranslator', 'declarative', 'DataParallel', 'TranslatedLayer', 'jit'
 ]
 
 __all__ += [
@@ -31,6 +31,7 @@ from ..fluid.dygraph.checkpoint import save_dygraph as save
 from ..fluid.dygraph.parallel import prepare_context, ParallelEnv, DataParallel
 from ..fluid.dygraph.jit import TracedLayer, declarative
 from ..fluid.dygraph import ProgramTranslator
+from . import jit
 
 from ..fluid.dygraph.learning_rate_scheduler import NoamDecay, PiecewiseDecay, NaturalExpDecay, ExponentialDecay, \
         InverseTimeDecay, PolynomialDecay, CosineDecay
diff --git a/python/paddle/imperative/jit/__init__.py b/python/paddle/imperative/jit/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..85fccf6e689ebf606092df8c3f94f561a68705ed
--- /dev/null
+++ b/python/paddle/imperative/jit/__init__.py
@@ -0,0 +1,18 @@
+#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ...fluid.dygraph.jit import save, load, SaveLoadConfig
+from ...fluid.dygraph.io import TranslatedLayer
+
+__all__ = ['save', 'load', 'SaveLoadConfig']
diff --git a/python/paddle/incubate/complex/tensor/math.py b/python/paddle/incubate/complex/tensor/math.py
index 51477abd5dcb7ccb742291296db08d11f3614a32..5c26d6da8d9bb002a117ee40e0ce209c3fa0db9f 100644
--- a/python/paddle/incubate/complex/tensor/math.py
+++ b/python/paddle/incubate/complex/tensor/math.py
@@ -236,39 +236,38 @@ def elementwise_div(x, y, axis=-1, name=None):
         name=name)
 
 
-def trace(input, offset=0, dim1=0, dim2=1, name=None):
+def trace(x, offset=0, axis1=0, axis2=1, name=None):
     """
-    The layer to compute the trace for a complex number tensor. input :attr:`input` must be a ComplexVariable. 
+    The layer to compute the trace for a complex number tensor. x :attr:`x` must be a ComplexVariable. 
     See the detailed description for the function and other arguments 
     in :ref:`api_tensor_math_trace` . 
     
     Args:
-        input(ComplexVariable): The input ComplexVariable. Must be at least 2-dimensional. 
+        x(ComplexVariable): The input ComplexVariable x. Must be at least 2-dimensional. 
             The supported data types include complex64 and complex128.
-        offset(int, optional): Which diagonals in input tensor will be taken. Default: 0 (main diagonals).
-        dim1(int, optional): The first dimension with respect to take diagonal. Default: 0.
-        dim2(int, optional): The second dimension with respect to take diagonal. Default: 1.
+        offset(int, optional): Which diagonals in input tensor x will be taken. Default: 0 (main diagonals).
+        axis1(int, optional): The first axis with respect to take diagonal. Default: 0.
+        axis2(int, optional): The second axis with respect to take diagonal. Default: 1.
         name (str, optional): Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. Default: None.
     
     Returns:
-        ComplexVariable: The trace result of input tensor, it's data type is the same as input data type.
+        ComplexVariable: The trace result of input tensor x, it's data type is the same as input data type.
 
     Examples:
         .. code-block:: python
 
             import paddle
-            import paddle.fluid.dygraph as dg
             import numpy as np
             
             case1 = np.random.randn(3, 10, 10).astype('float64') + 1j * np.random.randn(3, 10, 10).astype('float64')
             
-            with dg.guard():
-                case1 = dg.to_variable(case1)
-                data1 = paddle.complex.trace(case1, offset=1, dim1=1, dim2=2) # data1.shape = [3]
+            paddle.enable_imperative()
+            case1 = paddle.imperative.to_variable(case1)
+            data1 = paddle.complex.trace(case1, offset=1, axis1=1, axis2=2) # data1.shape = [3]
     """
-    complex_variable_exists([input], "trace")
-    real = math.trace(input.real, offset, dim1, dim2, name)
-    imag = math.trace(input.imag, offset, dim1, dim2, name)
+    complex_variable_exists([x], "trace")
+    real = math.trace(x.real, offset, axis1, axis2, name)
+    imag = math.trace(x.imag, offset, axis1, axis2, name)
 
     return ComplexVariable(real, imag)
 
diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py
index 607c47c9a8c53a392a69a00329fe2359324620de..e074ca66bb1d3700cc2e50db2b1439e991113f39 100644
--- a/python/paddle/nn/__init__.py
+++ b/python/paddle/nn/__init__.py
@@ -53,12 +53,14 @@ from .input import data  #DEFINE_ALIAS
 # from .input import Input        #DEFINE_ALIAS
 # from .layer.activation import PReLU        #DEFINE_ALIAS
 from .layer.activation import ReLU  #DEFINE_ALIAS
+from .layer.activation import LeakyReLU  #DEFINE_ALIAS
 from .layer.activation import Sigmoid  #DEFINE_ALIAS
 # from .layer.activation import Softmax        #DEFINE_ALIAS
 from .layer.activation import LogSoftmax  #DEFINE_ALIAS
 from .layer.activation import HSigmoid  #DEFINE_ALIAS
 from .layer.common import BilinearTensorProduct  #DEFINE_ALIAS
 from .layer.common import Pool2D  #DEFINE_ALIAS
+from .layer.common import Pad2D  #DEFINE_ALIAS
 from .layer.common import Embedding  #DEFINE_ALIAS
 from .layer.common import Linear  #DEFINE_ALIAS
 from .layer.common import UpSample  #DEFINE_ALIAS
diff --git a/python/paddle/nn/layer/__init__.py b/python/paddle/nn/layer/__init__.py
index cac6afd615465eb0e9c6452032af20bfbeaeb612..4963ac360804f88dad9677e1dd9c05a5231c89b9 100644
--- a/python/paddle/nn/layer/__init__.py
+++ b/python/paddle/nn/layer/__init__.py
@@ -29,12 +29,14 @@ from .activation import *
 from .norm import *
 # from .activation import PReLU        #DEFINE_ALIAS
 from .activation import ReLU  #DEFINE_ALIAS
+from .activation import LeakyReLU  #DEFINE_ALIAS
 from .activation import Sigmoid  #DEFINE_ALIAS
 # from .activation import Softmax        #DEFINE_ALIAS
 from .activation import LogSoftmax  #DEFINE_ALIAS
 from .activation import HSigmoid  #DEFINE_ALIAS
 from .common import BilinearTensorProduct  #DEFINE_ALIAS
 from .common import Pool2D  #DEFINE_ALIAS
+from .common import Pad2D  #DEFINE_ALIAS
 from .common import Embedding  #DEFINE_ALIAS
 from .common import Linear  #DEFINE_ALIAS
 from .common import UpSample  #DEFINE_ALIAS
diff --git a/python/paddle/nn/layer/activation.py b/python/paddle/nn/layer/activation.py
index b30b651b79a501c36f6cd58234a96f62acdd1b1c..02a1d297e83ea4f21b3f1a9cb85b950e5959dc08 100644
--- a/python/paddle/nn/layer/activation.py
+++ b/python/paddle/nn/layer/activation.py
@@ -17,6 +17,7 @@
 __all__ = [
     #       'PReLU',
     'ReLU',
+    'LeakyReLU',
     'Sigmoid',
     #       'Softmax',
     'LogSoftmax',
@@ -207,6 +208,50 @@ class ReLU(layers.Layer):
         return functional.relu(input, self._inplace)
 
 
+class LeakyReLU(layers.Layer):
+    """
+	:alias_main: paddle.nn.LeakyReLU
+	:alias: paddle.nn.LeakyReLU,paddle.nn.layer.LeakyReLU,paddle.nn.layer.activation.LeakyReLU
+
+    Leaky ReLU Activation.
+
+    .. math:
+
+        out = max(x, alpha * x)
+
+    Parameters:
+        alpha (float, optional): Slope of the activation function at x < 0. Default: 0.01.
+        inplace (bool, optional): If inplace is True, the input and output of 
+            ``LeakyReLU`` are the same variable. Otherwise, the input and output of
+            ``LeakyReLU`` are different variables. Default False. Note that if x is
+            more than one OPs' input, inplace must be False. Default: False.
+    
+    Returns:
+        None
+    
+    Examples:
+        .. code-block:: python
+
+          import paddle.fluid as fluid
+          import paddle.nn as nn
+          import numpy as np
+
+          data = np.array([-2, 0, 1]).astype('float32')
+          lrelu = nn.LeakyReLU()
+          with fluid.dygraph.guard():
+              data = fluid.dygraph.to_variable(data)
+              res = lrelu(data)  # [-0.02, 0, 1]
+    """
+
+    def __init__(self, alpha=1e-2, inplace=False):
+        super(LeakyReLU, self).__init__()
+        self._alpha = alpha
+        self._inplace = inplace
+
+    def forward(self, input):
+        return functional.leaky_relu(input, self._alpha, self._inplace)
+
+
 class Sigmoid(layers.Layer):
     """
 	:alias_main: paddle.nn.Sigmoid
diff --git a/python/paddle/nn/layer/common.py b/python/paddle/nn/layer/common.py
index 94841bbe2e700c986e8cc8eca3b68e96dcb7add9..8125e528b195b28024915ed9c20b922bd6224a5e 100644
--- a/python/paddle/nn/layer/common.py
+++ b/python/paddle/nn/layer/common.py
@@ -20,7 +20,10 @@ from ...fluid.dygraph import Linear  #DEFINE_ALIAS
 from ...fluid.dygraph import layers
 from .. import functional as F
 
-__all__ = ['BilinearTensorProduct', 'Pool2D', 'Embedding', 'Linear', 'UpSample']
+__all__ = [
+    'BilinearTensorProduct', 'Pool2D', 'Embedding', 'Linear', 'UpSample',
+    'Pad2D'
+]
 
 
 class UpSample(layers.Layer):
@@ -248,3 +251,93 @@ class UpSample(layers.Layer):
             data_format=self.data_format)
 
         return out
+
+
+class Pad2D(layers.Layer):
+    """
+        :alias_main: paddle.nn.Pad2D
+        :alias: paddle.nn.Pad2D,paddle.nn.layer.Pad2D,paddle.nn.layer.common.Pad2D
+
+    This interface is used to construct a callable object of the ``Pad2D``  class.
+    The Pad2D layer pads the input tensor boundaries according to 'paddings' and 'mode'.
+    If mode is 'reflect', paddings[0] and paddings[1] must be no greater
+    than height-1. And the width dimension has the same condition.
+
+    Parameters:
+        paddings (int | List[int32]): The padding size. If padding is a int, uses the same 
+            padding in all boundaries, if padding is a List, it must contain four integers, 
+            (padding_top, padding_bottom, padding_left, padding_right).
+            Default is [0, 0, 0, 0].
+        mode (str): Three modes: 'constant' (default), 'reflect', 'edge' .
+        	When in 'constant' mode, this op uses a constant value to pad the input tensor.
+        	When in 'reflect' mode, uses reflection of the input boundaries to pad the input tensor.
+        	When in 'edge' mode, uses input boundaries to pad the input tensor.
+        	Default is 'constant'
+        pad_value (float32): The value to fill the padded areas in 'constant' mode . Default is 0.0
+        data_format (str): An string from: "NHWC", "NCHW". Specify the data format of
+                           the input data.
+                           Default is  "NCHW"
+
+    Returns: 
+        None
+
+    Examples:
+        .. code-block:: text
+
+            Input = [[[[1., 2., 3.],
+                       [4., 5., 6.]]]]
+
+            Case 0:
+                paddings = [0, 1, 2, 3],
+                mode = 'constant'
+                pad_value = 0
+                Out = [[[[0., 0., 1., 2., 3., 0., 0., 0.],
+                         [0., 0., 4., 5., 6., 0., 0., 0.],
+                         [0., 0., 0., 0., 0., 0., 0., 0.]]]]
+
+            Case 1:
+                paddings = [0, 1, 2, 1],
+                mode = 'reflect'
+                Out = [[[[3., 2., 1., 2., 3., 2.],
+                         [6., 5., 4., 5., 6., 5.],
+                         [3., 2., 1., 2., 3., 2.]]]]
+
+            Case 2:
+                paddings = [0, 1, 2, 1],
+                mode = 'edge'
+                Out = [[[[1., 1., 1., 2., 3., 3.],
+                         [4., 4., 4., 5., 6., 6.],
+                         [4., 4., 4., 5., 6., 6.]]]]
+
+    Code Examples:
+        .. code-block:: python
+
+            import paddle.fluid as fluid
+            import paddle.nn as nn
+            import numpy as np
+            data = np.ones((2, 2, 2, 2)).astype('float32')
+            my_pad = nn.Pad2D(paddings=[1, 1, 1, 1])
+            with fluid.dygraph.guard():
+                data = fluid.dygraph.to_variable(data)
+                result = my_pad(data)
+    """
+
+    def __init__(self,
+                 paddings=0,
+                 mode='constant',
+                 pad_value=0.0,
+                 data_format="NCHW"):
+        super(Pad2D, self).__init__()
+        self._mode = mode
+        self._pad_value = pad_value
+        self._data_format = data_format
+        self._paddings = [paddings] * 4 if isinstance(paddings,
+                                                      int) else paddings
+
+    def forward(self, input):
+        return F.pad2d(
+            input,
+            paddings=self._paddings,
+            mode=self._mode,
+            pad_value=self._pad_value,
+            data_format=self._data_format)
diff --git a/python/paddle/reader/decorator.py b/python/paddle/reader/decorator.py
index a81746e4a2c4004999891f3a8cf1bf9233d8572f..ff09f4c562aeb8216dea6e1d40b9492c257aeab4 100644
--- a/python/paddle/reader/decorator.py
+++ b/python/paddle/reader/decorator.py
@@ -536,6 +536,10 @@ def multiprocess_reader(readers, use_pipe=True, queue_size=1000):
 
     """
 
+    if sys.platform == 'win32':
+        raise NotImplementedError(
+            "The multiprocess_reader method is not supported on windows.")
+
     try:
         import ujson as json
     except Exception as e:
diff --git a/python/paddle/reader/tests/CMakeLists.txt b/python/paddle/reader/tests/CMakeLists.txt
index b1c2197c40511f0410f83dcea3fe2bfcd2cdb3ea..969718d3b1837bde2e953778be9a1390cc53bb3d 100644
--- a/python/paddle/reader/tests/CMakeLists.txt
+++ b/python/paddle/reader/tests/CMakeLists.txt
@@ -1,4 +1 @@
-# TODO: Fix this unittest failed on Windows
-if(NOT WIN32)
-    py_test(decorator_test SRCS decorator_test.py)
-endif()
+py_test(decorator_test SRCS decorator_test.py)
diff --git a/python/paddle/reader/tests/decorator_test.py b/python/paddle/reader/tests/decorator_test.py
index abe87fa04df5863d3014c7ce58c5267cc79a4c8f..e15702e39c458eeccf2528eed43f80bff6448425 100644
--- a/python/paddle/reader/tests/decorator_test.py
+++ b/python/paddle/reader/tests/decorator_test.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import sys
 import time
 import unittest
 import functools
@@ -171,8 +172,9 @@ class TestMultiProcessReader(unittest.TestCase):
         self.assertEqual(sorted(self.samples), sorted(results))
 
     def test_distributed_batch_reader(self):
-        self.reader_test(use_pipe=False)
-        self.reader_test(use_pipe=True)
+        if sys.platform != 'win32':
+            self.reader_test(use_pipe=False)
+            self.reader_test(use_pipe=True)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py
index a96d112c8ea3b8d55cc06265728eb10afe6b4ff7..f16404001eaf43fe5fb0e0f127e5439f83ce06f4 100644
--- a/python/paddle/tensor/__init__.py
+++ b/python/paddle/tensor/__init__.py
@@ -81,7 +81,7 @@ from .manipulation import flatten  #DEFINE_ALIAS
 from .manipulation import gather  #DEFINE_ALIAS
 from .manipulation import gather_nd  #DEFINE_ALIAS
 from .manipulation import reshape  #DEFINE_ALIAS
-from .manipulation import reverse  #DEFINE_ALIAS
+from .manipulation import flip as reverse  #DEFINE_ALIAS
 from .manipulation import scatter  #DEFINE_ALIAS
 from .manipulation import scatter_nd_add  #DEFINE_ALIAS
 from .manipulation import scatter_nd  #DEFINE_ALIAS
@@ -119,7 +119,6 @@ from .math import exp  #DEFINE_ALIAS
 from .math import floor  #DEFINE_ALIAS
 from .math import increment  #DEFINE_ALIAS
 from .math import log  #DEFINE_ALIAS
-from .math import mul  #DEFINE_ALIAS
 from .math import multiplex  #DEFINE_ALIAS
 from .math import pow  #DEFINE_ALIAS
 from .math import reciprocal  #DEFINE_ALIAS
diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py
index 8208629781bb07957d501ea196e583b33b27329b..e84fe6b4e0c4e5be7a342bddd08164f44803d6dd 100644
--- a/python/paddle/tensor/creation.py
+++ b/python/paddle/tensor/creation.py
@@ -21,14 +21,15 @@ from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtyp
 from ..fluid.framework import convert_np_dtype_to_dtype_, in_dygraph_mode, _varbase_creator, device_guard, OpProtoHolder
 from ..fluid.layers import fill_constant
 from paddle.common_ops_import import *
+import paddle
 
 # TODO: define functions to get create a tensor  
 from ..fluid.layers import crop_tensor  #DEFINE_ALIAS
 from ..fluid.layers import diag  #DEFINE_ALIAS
 from ..fluid.layers import eye  #DEFINE_ALIAS
 from ..fluid.layers import fill_constant  #DEFINE_ALIAS
-
 from ..fluid.layers import create_tensor  #DEFINE_ALIAS
+from ..fluid.layers import linspace  #DEFINE_ALIAS
 
 __all__ = [
     'create_tensor',
@@ -54,13 +55,7 @@ __all__ = [
 ]
 
 
-def full_like(input,
-              fill_value,
-              out=None,
-              dtype=None,
-              device=None,
-              stop_gradient=True,
-              name=None):
+def full_like(x, fill_value, dtype=None, name=None):
     """
 	:alias_main: paddle.full_like
 	:alias: paddle.full_like,paddle.tensor.full_like,paddle.tensor.creation.full_like
@@ -70,12 +65,11 @@ def full_like(input,
     with `input`.
 
     Args:
-        input(Variable): The input tensor which specifies shape and data type. The data type can be bool, float16, float32, float64, int32, int64.
-        fill_value(bool|float|int): The value to fill the tensor with. Default value is 0. Note: this value shouldn't exceed the range of the output data type.
-        out(Variable, optional): Optional output which can be any created Variable that meets the requirements to store the result of operation. If out is None, a new Varibale will be create to store the result. Default value is None.
-        dtype(np.dtype|core.VarDesc.VarType|str, optional): The data type of output. The default value is None, which means the output data type is the same as input.
-        device (string, optional): Which device to run the operator. The :attr:`device` must be None, 'cpu', 'gpu'. If :attr:`device` is None, it will be the device that the user set in the paddle program. Default value is None.
-        stop_gradient(bool, optional): Indicating if we stop gradient from current(out) Variable. Default value is True.
+        x(Variable): The input tensor which specifies shape and data type. The data type can be bool, float16, float32, float64, int32, int64.
+        fill_value(bool|float|int|Variable): The value to fill the tensor with. Note: this value shouldn't exceed the range of the output data type.
+        dtype(np.dtype|core.VarDesc.VarType|str, optional): The data type of output. The data type can be one
+            of bool, float16, float32, float64, int32, int64. The default value is None, which means the output 
+            data type is the same as input.
         name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`
     
     Returns:
@@ -85,128 +79,36 @@ def full_like(input,
         .. code-block:: python
 
           import paddle
-          import paddle.fluid as fluid
           import numpy as np
-          input = fluid.data(name='input', dtype='float32', shape=[2, 3])
+          
+          paddle.enable_imperative()  # Now we are in imperative mode 
+          input = paddle.full(shape=[2, 3], fill_value=0.0, dtype='float32', name='input')
           output = paddle.full_like(input, 2.0)
-          exe = fluid.Executor(fluid.CPUPlace())
-          exe.run(fluid.default_startup_program())
-          img=np.array([[1, 2, 3], [4, 5, 6]]).astype(np.float32)
-          res = exe.run(fluid.default_main_program(), feed={'input':img}, fetch_list=[output])
-          print(res) # [array([[2., 2., 2.], [2., 2., 2.]], dtype=float32)]
+          #output result : [array([[2., 2., 2.], [2., 2., 2.]], dtype=float32)]
     """
-    helper = LayerHelper("full_like", **locals())
 
-    var_dtype = None
     if dtype is None:
-        var_dtype = input.dtype
+        dtype = x.dtype
     else:
-        check_dtype(
-            dtype, 'dtype',
-            ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'],
-            'full_like')
-        var_dtype = convert_np_dtype_to_dtype_(dtype)
+        if not isinstance(dtype, core.VarDesc.VarType):
+            dtype = convert_np_dtype_to_dtype_(dtype)
 
-    if out is None:
-        out = helper.create_variable_for_type_inference(dtype=dtype)
+    if in_dygraph_mode():
+        return core.ops.fill_any_like(x, 'value', fill_value, 'dtype', dtype)
+
+    helper = LayerHelper("full_like", **locals())
+    check_dtype(dtype, 'dtype',
+                ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'],
+                'full_like/zeros_like')
+    out = helper.create_variable_for_type_inference(dtype=dtype)
 
     helper.append_op(
         type='fill_any_like',
-        inputs={'X': [input]},
+        inputs={'X': [x]},
         attrs={'value': fill_value,
-               "dtype": var_dtype},
+               "dtype": dtype},
         outputs={'Out': [out]})
-    out.stop_gradient = stop_gradient
-
-    return out
-
-
-def linspace(start, stop, num, dtype, out=None, device=None, name=None):
-    """
-	:alias_main: paddle.linspace
-	:alias: paddle.linspace,paddle.tensor.linspace,paddle.tensor.creation.linspace
-
-    This OP return fixed number of evenly spaced values within a given interval.
-    
-    **NOTICE**: The output of this OP has no gradient.
-
-    Args:
-        start(float|Variable): The input :attr:`start` is start variable of range. It is a float scalar, \
-            or a tensor of shape [1] with input data type float32, float64.
-        stop(float|Variable): The input :attr:`stop` is start variable of range. It is a float scalar, \
-            or a tensor of shape [1] with input data type float32, float64.
-        num(int|Variable): The input :attr:`num` is given num of the sequence. It is an int scalar, \
-            or a tensor of shape [1] with type int32.
-        dtype(string): The data type of output tensor, it could be 'float32' and 'float64'.
-        out (Variable, optional): Optional output which can be any created 
-            Variable that meets the requirements to store the result of operation.
-            if out is None, a new Varibale will be create to store the result. Default: None.
-        device (string, optional): Which device to run the operator. The :attr:`device` must be
-            None, 'cpu', 'gpu'. If :attr:`device` is None, it will be choose the device that the user set in 
-            the paddle program. Default: None.
-        name(str, optional): Normally there is no need for user to set this property. 
-            For more information, please refer to :ref:`api_guide_Name`.Default: None.
-
-    Returns:
-        Variable, the output data type will be float32, float64.: The 1-D tensor with fixed number of evenly spaced values, \
-        the data shape of this tensor is :math:`[num]` . If the :attr:`num` is set 1, the output tensor just has \
-        the value with input :attr:`start`. 
-
-    Examples:
-        .. code-block:: python
-
-             import paddle
-             data = paddle.linspace(0, 10, 5, dtype='float32') # [0.0,  2.5,  5.0,  7.5, 10.0]
-             data = paddle.linspace(0, 10, 1, dtype='float32') # [0.0]
-
-    """
-    helper = LayerHelper("linspace", **locals())
-
-    if not isinstance(start, Variable):
-        start = fill_constant([1], dtype, start)
-    if not isinstance(stop, Variable):
-        stop = fill_constant([1], dtype, stop)
-    if not isinstance(num, Variable):
-        num = fill_constant([1], 'int32', num)
-
-    if out is None:
-        out = helper.create_variable_for_type_inference(dtype=start.dtype)
-    else:
-        check_dtype(
-            out.dtype, out.name,
-            convert_dtype(start.dtype), 'linspace',
-            "The out data type '%s' in linspace must be the same with '%s' seted by parameter 'dtype'."
-            % (out.dtype, dtype))
-        if name:
-            warning.warn(
-                "The output Variable name of the paddle.tensor.linspace operation can only be given by parameter out or name.\
-                When parameter out and name are set at the same time, out has a higher priority than name. \
-                Finally, the output Variable name is same as the out name %s." %
-                out.name,
-                category=UserWarning,
-                stacklevel=2)
-
-    if device is not None:
-        if device not in ['cpu', 'gpu']:
-            raise ValueError(
-                "The value of 'device' in linspace operation must be cpu or gpu, but received %s."
-                % (device))
-        else:
-            with device_guard(device):
-                helper.append_op(
-                    type='linspace',
-                    inputs={'Start': start,
-                            'Stop': stop,
-                            'Num': num},
-                    outputs={'Out': [out]})
-    else:
-        helper.append_op(
-            type='linspace',
-            inputs={'Start': start,
-                    'Stop': stop,
-                    'Num': num},
-            outputs={'Out': [out]})
-
+    out.stop_gradient = True
     return out
 
 
@@ -322,7 +224,7 @@ def ones_like(input, dtype=None, device=None, name=None):
     return out
 
 
-def zeros(shape, dtype, out=None, device=None):
+def zeros(shape, dtype=None, name=None):
     """
 	:alias_main: paddle.zeros
 	:alias: paddle.zeros,paddle.tensor.zeros,paddle.tensor.creation.zeros
@@ -331,14 +233,10 @@ def zeros(shape, dtype, out=None, device=None):
 
     Args:
         shape(tuple|list): Shape of output tensor.
-        dtype(np.dtype|core.VarDesc.VarType|str): Data type of output tensor, it supports
-            bool, float16, float32, float64, int32 and int64.
-        out(Variable, optional): Optional output which can be any created 
-            Variable that meets the requirements to store the result of operation.
-            if out is None, a new Varibale will be create to store the result.
-        device(str, optional): Which device to run the operator. The :attr:`device` must be
-            None,'cpu', 'gpu'. If :attr:`device` is None, it will be choose the device that the user set in 
-            the paddle program. Default value is False.
+        dtype(np.dtype|core.VarDesc.VarType|str, optional): Data type of output tensor, it supports
+            bool, float16, float32, float64, int32 and int64. Default: if None, the date type is float32.
+        name(str, optional): The default value is None.  Normally there is no need for user to set this
+            property.  For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
         Variable: A tensor of data type :attr:`dtype` with shape :attr:`shape` and all elements set to 0.
@@ -347,91 +245,54 @@ def zeros(shape, dtype, out=None, device=None):
         .. code-block:: python
 
           import paddle
+          
+          paddle.enable_imperative()  # Now we are in imperative mode
           data = paddle.zeros(shape=[3, 2], dtype='float32') # [[0., 0.], [0., 0.], [0., 0.]]
-          data = paddle.zeros(shape=[2, 2], dtype='float32', device='cpu') # [[0., 0.], [0., 0.]]
+          data = paddle.zeros(shape=[2, 2], dtype='int32', name='zeros') # [[0, 0], [0, 0]]
     """
-    check_dtype(dtype, 'create data type',
-                ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'],
-                'zeros')
-    if device is not None:
-        if device not in ['cpu', 'gpu']:
-            raise ValueError(
-                "The value of 'device' in zeros_op must be cpu or gpu, but received %s."
-                % (device))
-        with fluid.device_guard(device):
-            return fill_constant(value=0.0, shape=shape, dtype=dtype, out=out)
-
-    return fill_constant(value=0.0, shape=shape, dtype=dtype, out=out)
+    if dtype is None:
+        dtype = 'float32'
+    return fill_constant(value=0.0, shape=shape, dtype=dtype, name=name)
 
 
-def zeros_like(input, dtype=None, device=None, name=None):
+def zeros_like(x, dtype=None, name=None):
     """
 	:alias_main: paddle.zeros_like
-	:alias: paddle.zeros_like,paddle.tensor.zeros_like,paddle.tensor.creation.zeros_like
+	:alias: paddle.zeros_like, paddle.tensor.zeros_like, paddle.tensor.creation.zeros_like
 
     This function creates a zeros tensor which has identical shape and dtype 
     with `input`.
 
     Args:
-        input(Variable): The input tensor which specifies shape and dtype.The dtype of input can be 
-            bool, float32, float64, int32, int64.
-        dtype(np.dtype|core.VarDesc.VarType|str, optional): The data type can be set bool, float32, float64, int32, int64. 
-            The default value is None, the dtype is the same as input.
-        device(str, optional): Which device to run the operator. The :attr:`device` must be
-            None, 'cpu', 'gpu'. If :attr:`device` is None, it will be choose the device that the user set in 
-            the paddle program. Default value is None.
-        name(str, optional): The name of output variable, normally there is no need for user to set this this property. 
-            Default value is None, the framework set the name of output variable.  
+        x(Variable): The input tensor which specifies shape and dtype. The
+            dtype of input can be bool, float16, float32, float64, int32, int64.
+        dtype(np.dtype|core.VarDesc.VarType|str, optional): The data type can
+            be set bool, float16, float32, float64, int32, int64. The default
+            value is None, the dtype is the same as input.
+        name(str, optional): The default value is None. Normally there is no
+            need for user to set this property. For more information, please
+            refer to :ref:`api_guide_Name`.
 
     Returns:
         out(Variable): The tensor variable storing the output.
 
+    Raise:
+        TypeError: If dtype is not bool, float16, float32, float64, int32 or int64.
+
     Examples:
         .. code-block:: python
 
-          import paddle
-          import paddle.fluid as fluid
+        import paddle
+        import numpy as np
 
-          x = fluid.data(name='x', dtype='float32', shape=[3])
-          data = paddle.ones_like(x) # data=[1.0, 1.0, 1.0]
-          data1 = paddle.ones_like(input=x, device="gpu") #data1=[1.0, 1.0. 1.0]
-
-    """
-
-    helper = LayerHelper("zeros_like", **locals())
-
-    attrs = {"value": 0.0}
-    var_dtype = None
-    if dtype is not None:
-        check_dtype(dtype, 'create data type',
-                    ['bool', 'float32', 'float64', 'int32', 'int64'],
-                    'zeros_like')
-        var_dtype = convert_np_dtype_to_dtype_(dtype)
-        attrs["dtype"] = var_dtype
-    else:
-        var_dtype = input.dtype
+        paddle.enable_imperative()
 
-    out = helper.create_variable_for_type_inference(dtype=var_dtype)
+        x = paddle.imperative.to_variable(np.array([1,2,3], dtype='float32'))
+        out1 = paddle.zeros_like(x) # [1.0, 1.0, 1.0]
+        out2 = paddle.zeros_like(x, dtype='int32') # [1, 1, 1]
 
-    if device is not None:
-        if device not in ['cpu', 'gpu']:
-            raise ValueError(
-                "The value of 'device' in zeros_op must be cpu or gpu, but received %s."
-                % (device))
-        with fluid.device_guard(device):
-            helper.append_op(
-                type='fill_any_like',
-                inputs={'X': [input]},
-                attrs=attrs,
-                outputs={'Out': [out]})
-            return out
-    helper.append_op(
-        type='fill_any_like',
-        inputs={'X': [input]},
-        attrs=attrs,
-        outputs={'Out': [out]})
-    out.stop_gradient = True
-    return out
+    """
+    return full_like(x=x, fill_value=0, dtype=dtype, name=name)
 
 
 def eye(num_rows,
@@ -497,13 +358,7 @@ def eye(num_rows,
     return out
 
 
-def full(shape,
-         fill_value,
-         out=None,
-         dtype=None,
-         device=None,
-         stop_gradient=True,
-         name=None):
+def full(shape, fill_value, dtype=None, name=None):
     """
 	:alias_main: paddle.full
 	:alias: paddle.full,paddle.tensor.full,paddle.tensor.creation.full
@@ -517,17 +372,9 @@ def full(shape,
                 If ``shape`` is an Variable, it should be an 1-D Tensor .
         fill_value(bool|float16|float32|float64|int32|int64|Variable): The constant value
             used to initialize the Tensor to be created. If fill_value is an Variable, it must be an 1-D Tensor.
-        out(Variable, optional): Optional output which can be any created 
-            Variable that meets the requirements to store the result of operation.
-            if out is None, a new Varibale will be create to store the result.
         dtype(np.dtype|core.VarDesc.VarType|str, optional): Data type of the output tensor
             which can be float16, float32, float64, int32, int64, if dytpe is `None`, the data
             type of created tensor is `float32`
-        device(str, optional): On which device to run this Op. The :attr:`device` must be
-            None, 'cpu' or 'gpu'. If :attr:`device` is None, the device that the user set in 
-            the paddle program will be chosen. Default value is None.
-        stop_gradient(bool, optional): Indicating if we stop gradient from current(out) Variable,
-            default value is True.
         name(str, optional): The default value is None.  Normally there is no need for user to set this
             property.  For more information, please refer to :ref:`api_guide_Name`.
     
@@ -536,28 +383,26 @@ def full(shape,
 
     Raises:
         TypeError: The `dtype` must be one of None, bool, float16, float32, float64, int32 and int64.
-        TypeError: The `out` must be a Variable.
         TypeError: The `shape` must be one of Variable, list tuple.
     
     Examples:
         .. code-block:: python
 
           import paddle
-          import paddle.fluid as fluid
 
+          paddle.enable_imperative()  # Now we are in imperative mode
           data1 = paddle.full(shape=[2,1], fill_value=0, dtype='int64') # data1=[[0],[0]]
-          data2 = paddle.full(shape=[2,1], fill_value=5, dtype='int64', device='gpu') # data2=[[5],[5]]
 
           # attr shape is a list which contains Variable Tensor.
-          positive_2 = fluid.layers.fill_constant([1], "int32", 2)
+          positive_2 = paddle.fill_constant([1], "int32", 2)
           data3 = paddle.full(shape=[1, positive_2], dtype='float32', fill_value=1.5) # data3=[1.5, 1.5]
 
           # attr shape is an Variable Tensor.
-          shape = fluid.layers.fill_constant([1,2], "int32", 2) # shape=[2,2]
+          shape = paddle.fill_constant([2], "int32", 2) # shape=[2,2]
           data4 = paddle.full(shape=shape, dtype='bool', fill_value=True) # data4=[[True,True],[True,True]]
           
           # attr value is an Variable Tensor.
-          val = fluid.layers.fill_constant([1], "float32", 2.0) # val=[2.0]
+          val = paddle.fill_constant([1], "float32", 2.0) # val=[2.0]
           data5 = paddle.full(shape=[2,1], fill_value=val, dtype='float32') #data5=[[2.0],[2.0]]
     """
 
@@ -566,93 +411,83 @@ def full(shape,
     if dtype is None:
         dtype = 'float32'
 
-    check_dtype(dtype, 'create data type',
-                ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'],
-                'full')
-    check_type(shape, 'shape', (Variable, list, tuple), 'full')
-    if out is not None:
-        check_type(out, 'out', (Variable), 'full')
+    return fill_constant(shape=shape, dtype=dtype, value=fill_value, name=name)
 
-    if out is None:
-        out = helper.create_variable_for_type_inference(dtype=dtype)
 
-    out.stop_gradient = stop_gradient
-
-    with device_guard(device):
-        out = fill_constant(shape=shape, dtype=dtype, value=fill_value, out=out)
-    return out
-
-
-def arange(start, end, step=1, dtype=None, name=None):
+def arange(start=0, end=None, step=1, dtype=None, name=None):
     """
 	:alias_main: paddle.arange
 	:alias: paddle.arange,paddle.tensor.arange,paddle.tensor.creation.arange
 
     Return evenly spaced values within a given interval.
 
-    Values are generated within the half-open interval [start, stop) (in other words,
-    the interval including start but excluding stop).
+    Values are generated into the half-open interval [start, stop) with the step.
+    (the interval including start but excluding stop).
+
+    If dtype is float32 or float64, we advise adding a small epsilon to end to
+    avoid floating point rounding errors when comparing against end.
 
     Parameters:
-        start(float32 | float64 | int32 | int64 | Variable): Start of interval. The interval includes this value.
-            when start is Variable, it is a 1-D Tensor with shape [1].
-        end(float32 | float64 | int32 | int64 | Variable): End of interval. The interval does not include this
-                                 value, except in some cases where step is not an integer
-                                 and floating point round-off affects the length of out. When end is Variable,
-                                 it is a 1-D Tensor with shape [1].
-        step(float32 | float64 | int32 | int64 | Variable): Spacing between values. For any output out, this is the
-                                  distance between two adjacent values, out[i+1] - out[i].
-        dtype(str|core.VarDesc.VarType): the data type of the output tensor, can be float32, float64, int32, int64.
-
-    Returns: a 1-D Tensor which is evenly spaced values within a given interval. Its data type is set by dtype.
+        start(float|int|Variable): Start of interval. The interval includes
+            this value. If end is None, the half-open interval is [0, start).
+            If start is Variable, it is a 1-D Tensor with shape [1], and it's
+            data type should be one of int32, int64, float32, float64. Default
+            is 0.
+        end(float|int|Variable, optional): End of interval. The interval does
+            not include this value. When end is Variable, it is a 1-D Tensor
+            with shape [1], and it's data type should be one of int32, int64,
+            float32, float64. If end is None, the half-open interval is [0, start).
+            Default is None.
+        step(float|int|Variable, optional): Spacing between values. For any
+            out, this is the istance between two adjacent values, out[i+1] - out[i].
+            When end is Variable, it is a 1-D Tensor with shape [1], and it's
+            data type should be one of int32, int64, float32, float64. Default is 1.
+        dtype(str|np.dtype|core.VarDesc.VarType, optional): The data type of
+            the output tensor, can be float32, float64, int32, int64. If dtype
+            is `None` , the data type of out tensor is `int64` . Defaule is None
+        name(str, optional): Normally there is no need for user to set this property.
+            For more information, please refer to :ref:`api_guide_Name` .
+            Default is None.
+
+    Returns: a 1-D Tensor which is evenly spaced values within a given interval.
+        Its data type is set by dtype.
     
     Return type: Variable
 
+    Raises:
+        TypeError: If dtype is not float32, float64, int32 or int64.
+
     examples:
 
         .. code-block:: python
 
-             import paddle
-             # expected out put: [0, 2, 4, 6, 8]
-             data = paddle.arange(0, 10, 2, 'int32')
-
-         #dygraph mode
-             import paddle
-             import paddle.fluid as fluid
-             with fluid.dygraph.guard():
-                 x = paddle.arange(0, 6, 2) 
-                 # x: [0, 2, 4]
-                 # x dtype: float32
-             
-    """
-    helper = LayerHelper("range", **locals())
+        import paddle
+        import numpy as np
 
-    if dtype is None:
-        dtype = 'float32'
+        paddle.enable_imperative()
 
-    check_dtype(dtype, 'create data type',
-                ['float32', 'float64', 'int32', 'int64'], 'range')
-
-    dtype = convert_dtype(dtype)
-    if not isinstance(start, Variable):
-        start = fill_constant([1], dtype, start)
+        out1 = paddle.arange(5)
+        # [0, 1, 2, 3, 4]
 
-    if not isinstance(end, Variable):
-        end = fill_constant([1], dtype, end)
+        out2 = paddle.arange(3, 9, 2.0)
+        # [3, 5, 7]
 
-    if not isinstance(step, Variable):
-        step = fill_constant([1], dtype, step)
+        # use 4.999 instead of 5.0 to avoid floating point rounding errors
+        out3 = paddle.arange(4.999, dtype='float32')
+        # [0., 1., 2., 3., 4.]
 
-    out = helper.create_variable_for_type_inference(dtype=start.dtype)
+        start_var = paddle.imperative.to_variable(np.array([3]))
+        out4 = paddle.arange(start_var, 7)
+        # [3, 4, 5, 6]
+             
+    """
+    if dtype is None:
+        dtype = 'int64'
+    if end is None:
+        end = start
+        start = 0
 
-    helper.append_op(
-        type='range',
-        inputs={'Start': start,
-                'End': end,
-                'Step': step},
-        outputs={'Out': [out]})
-    out.stop_gradient = True
-    return out
+    return paddle.fluid.layers.range(start, end, step, dtype, name)
 
 
 def _tril_triu_op(helper):
@@ -844,18 +679,19 @@ def triu(input, diagonal=0, name=None):
     return _tril_triu_op(LayerHelper('triu', **locals()))
 
 
-def meshgrid(input, name=None):
+def meshgrid(*args, **kwargs):
     """
 	:alias_main: paddle.meshgrid
 	:alias: paddle.meshgrid,paddle.tensor.meshgrid,paddle.tensor.creation.meshgrid
 
-    This op takes a list of N tensors as input, each of which is 1-dimensional 
+    This op takes a list of N tensors as input *args, each of which is 1-dimensional 
     vector, and creates N-dimensional grids.
     
     Args:
-        input(Variable) : tensors (list of tensor): the shapes of input k tensors are (N1,), 
+        *args(Variable|list of Variable) : tensors (tuple(list) of tensor): the shapes of input k tensors are (N1,), 
             (N2,),..., (Nk,). Support data types: ``float64``, ``float32``, ``int32``, ``int64``.
-        name (str, optional): The default value is None. Normally there is no need for
+        **kwargs (optional): Currently, we only accept name in **kwargs 
+            The default value is None. Normally there is no need for
             user to set this property. For more information, please refer to :ref:`api_guide_Name`.
  
     Returns:
@@ -875,7 +711,7 @@ def meshgrid(input, name=None):
           input_2 = np.random.randint(0, 100, [200, ]).astype('int32')
 
           exe = fluid.Executor(place=fluid.CPUPlace())
-          grid_x, grid_y = paddle.tensor.meshgrid([x, y])
+          grid_x, grid_y = paddle.tensor.meshgrid(x, y)
           res_1, res_2 = exe.run(fluid.default_main_program(),
                                  feed={'x': input_1,
                                        'y': input_2},
@@ -889,41 +725,45 @@ def meshgrid(input, name=None):
           #example 2: in dygraph mode
 
           import paddle
-          import paddle.fluid as fluid
           import numpy as np
+          
+          paddle.enable_imperative()
 
           input_3 = np.random.randint(0, 100, [100, ]).astype('int32')
           input_4 = np.random.randint(0, 100, [200, ]).astype('int32')
-          with fluid.dygraph.guard():
-              tensor_3 = fluid.dygraph.to_variable(input_3)
-              tensor_4 = fluid.dygraph.to_variable(input_4)
-              grid_x, grid_y = paddle.tensor.meshgrid([tensor_3, tensor_4])
+          tensor_3 = paddle.imperative.to_variable(input_3)
+          tensor_4 = paddle.imperative.to_variable(input_4)
+          grid_x, grid_y = paddle.tensor.meshgrid(tensor_3, tensor_4)
 
           #the shape of grid_x is (100, 200)
           #the shape of grid_y is (100, 200)
 
     """
 
+    if len(args) == 1 and isinstance(args[0], (list, tuple)):
+        args = args[0]
     if in_dygraph_mode():
-        num = len(input)
-        out = core.ops.meshgrid(input, num)
+        num = len(args)
+        out = core.ops.meshgrid(list(args), num)
         return out
 
+    name = kwargs.get("name", None)
     helper = LayerHelper('meshgrid', **locals())
 
-    if not isinstance(input, list):
-        raise TypeError("The type of input in meshgrid should be list.")
+    if not isinstance(args, (list, tuple)):
+        raise TypeError("The type of input args in meshgrid should be list.")
 
-    for id, input_ in enumerate(input):
+    for id, input_ in enumerate(args):
         check_dtype(input_.dtype, 'create data type',
                     ['float16', 'float32', 'float64', 'int32', 'int64'],
                     'meshgrid')
 
-    num = len(input)
+    num = len(args)
     out = [
-        helper.create_variable_for_type_inference(dtype=input[i].dtype)
+        helper.create_variable_for_type_inference(dtype=args[i].dtype)
         for i in range(num)
     ]
-    helper.append_op(type='meshgrid', inputs={'X': input}, outputs={'Out': out})
+    helper.append_op(
+        type='meshgrid', inputs={'X': list(args)}, outputs={'Out': out})
 
     return out
diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py
index a888b78ea9146f9dd785390f2ff3604b9d1b03f6..a98a07d3dbdcd95606f3d5348e233ae148624811 100644
--- a/python/paddle/tensor/manipulation.py
+++ b/python/paddle/tensor/manipulation.py
@@ -28,7 +28,6 @@ from ..fluid.layers import expand  #DEFINE_ALIAS
 from ..fluid.layers import expand_as  #DEFINE_ALIAS
 from ..fluid.layers import flatten  #DEFINE_ALIAS
 from ..fluid.layers import reshape  #DEFINE_ALIAS
-from ..fluid.layers import reverse  #DEFINE_ALIAS
 from ..fluid.layers import scatter  #DEFINE_ALIAS
 from ..fluid.layers import slice  #DEFINE_ALIAS
 from ..fluid.layers import strided_slice  #DEFINE_ALIAS
@@ -51,46 +50,47 @@ __all__ = [
 ]
 
 
-def flip(input, dims, name=None):
+def flip(x, axis, name=None):
     """
 	:alias_main: paddle.flip
 	:alias: paddle.flip,paddle.tensor.flip,paddle.tensor.manipulation.flip
 
 
-    Reverse the order of a n-D tensor along given axis in dims.
+    Reverse the order of a n-D tensor along given axis in axis.
 
     Args:
-        input (Variable): A Tensor(or LoDTensor) with shape :math:`[N_1, N_2,..., N_k]` . The data type of the input Tensor
+        x (Variable): A Tensor(or LoDTensor) with shape :math:`[N_1, N_2,..., N_k]` . The data type of the input Tensor x
             should be float32, float64, int32, int64, bool.
-        dims (list): The axis to flip on.
+        axis (list): The axis(axes) to flip on. Negative indices for indexing from the end are accepted.
         name (str, optional): The default value is None.  Normally there is no need for user to set this property.
             For more information, please refer to :ref:`api_guide_Name` .
 
     Returns:
-        Variable: Tensor or LoDTensor calculated by flip layer. The data type is same with input.
+        Variable: Tensor or LoDTensor calculated by flip layer. The data type is same with input x.
 
     Examples:
         .. code-block:: python
 
           import paddle
-          import paddle.fluid as fluid
           import numpy as np
-          input = fluid.data(name="x", shape=[-1, 2, 2], dtype='float32')
-          output = paddle.flip(input, dims=[0, 1])
-          exe = fluid.Executor(fluid.CPUPlace())
-          exe.run(fluid.default_startup_program())
-          img = np.arange(12).reshape((3,2,2)).astype(np.float32)
-          res = exe.run(fluid.default_main_program(), feed={'x':img}, fetch_list=[output])
-          print(res) # [[[10,11][8, 9]],[[6, 7],[4, 5]] [[2, 3],[0, 1]]]
+
+          paddle.enable_imperative()
+
+          image_shape=(3, 2, 2)
+          x = np.arange(image_shape[0] * image_shape[1] * image_shape[2]).reshape(image_shape)
+          x = x.astype('float32')
+          img = paddle.imperative.to_variable(x)
+          out = paddle.flip(img, [0,1])
+
+          print(out) # [[[10,11][8, 9]],[[6, 7],[4, 5]] [[2, 3],[0, 1]]]
     """
     helper = LayerHelper("flip", **locals())
-    check_type(input, 'X', (Variable), 'flip')
-    dtype = helper.input_dtype()
+    check_type(x, 'X', (Variable), 'flip')
+    dtype = helper.input_dtype('x')
     check_dtype(dtype, 'X',
                 ['float16', 'float32', 'float64', 'int32', 'int64', 'bool'],
                 'flip')
-    check_type(dims, 'dims', (list, tuple), 'flip')
-    assert len(dims) > 0, 'len(dims) must be greater than 0.'
+    check_type(axis, 'axis', (list, tuple), 'flip')
     if name is None:
         out = helper.create_variable_for_type_inference(dtype)
     else:
@@ -98,29 +98,33 @@ def flip(input, dims, name=None):
 
     helper.append_op(
         type="flip",
-        inputs={"X": input},
+        inputs={"X": x},
         outputs={"Out": out},
-        attrs={"dims": dims})
+        attrs={"axis": axis})
     return out
 
 
-def roll(input, shifts, dims=None):
+reverse = flip  #DEFINE_ALIAS
+
+
+def roll(x, shifts, axis=None, name=None):
     """
 	:alias_main: paddle.roll
 	:alias: paddle.roll,paddle.tensor.roll,paddle.tensor.manipulation.roll
 
-    Roll the `input` tensor along the given dimension(s). Elements that are shifted beyond 
-    the last position are re-introduced at the first position. If a dimension is not specified, 
+    Roll the `x` tensor along the given axis(axes). With specific 'shifts', Elements that 
+    roll beyond the last position are re-introduced at the first according to 'shifts'. 
+    If a axis is not specified, 
     the tensor will be flattened before rolling and then restored to the original shape.
 
     Args:
-        input (Variable): The input tensor variable.
+        x (Variable): The x tensor variable as input.
         shifts (int|list|tuple): The number of places by which the elements
-                           of the `input` tensor are shifted.
-        dims (int|list|tuple|None): Dimentions along which to roll.
+                           of the `x` tensor are shifted.
+        axis (int|list|tuple|None): axis(axes) along which to roll.
 
     Returns:
-        Variable: A Tensor with same data type as `input`.
+        Variable: A Tensor with same data type as `x`.
 
     Examples:
         .. code-block:: python
@@ -131,48 +135,56 @@ def roll(input, shifts, dims=None):
             data = np.array([[1.0, 2.0, 3.0],
                              [4.0, 5.0, 6.0],
                              [7.0, 8.0, 9.0]])
-            with fluid.dygraph.guard():
-                x = fluid.dygraph.to_variable(data)
-                out_z1 = paddle.roll(x, shifts=1)
-                print(out_z1.numpy())
-                #[[9. 1. 2.]
-                # [3. 4. 5.]
-                # [6. 7. 8.]]
-                out_z2 = paddle.roll(x, shifts=1, dims=0)
-                print(out_z2.numpy())
-                #[[7. 8. 9.]
-                # [1. 2. 3.]
-                # [4. 5. 6.]]
+            paddle.enable_imperative()
+            x = paddle.imperative.to_variable(data)
+            out_z1 = paddle.roll(x, shifts=1)
+            print(out_z1.numpy())
+            #[[9. 1. 2.]
+            # [3. 4. 5.]
+            # [6. 7. 8.]]
+            out_z2 = paddle.roll(x, shifts=1, axis=0)
+            print(out_z2.numpy())
+            #[[7. 8. 9.]
+            # [1. 2. 3.]
+            # [4. 5. 6.]]
     """
     helper = LayerHelper("roll", **locals())
-    origin_shape = input.shape
+    origin_shape = x.shape
     if type(shifts) == int:
         shifts = [shifts]
-    if type(dims) == int:
-        dims = [dims]
-
-    if dims:
-        check_type(dims, 'dims', (list, tuple), 'roll')
+    if type(axis) == int:
+        axis = [axis]
+
+    len_origin_shape = len(origin_shape)
+    if axis:
+        for i in range(len(axis)):
+            if axis[i] >= len_origin_shape or axis[i] < -len_origin_shape:
+                raise ValueError(
+                    "axis is out of range, it should be in range [{}, {}), but received {}".
+                    format(-len_origin_shape, len_origin_shape, axis))
+
+    if axis:
+        check_type(axis, 'axis', (list, tuple), 'roll')
     check_type(shifts, 'shifts', (list, tuple), 'roll')
 
     if in_dygraph_mode():
-        if dims is None:
-            input = core.ops.reshape(input, 'shape', [-1, 1])
-            dims = [0]
-        out = core.ops.roll(input, 'dims', dims, 'shifts', shifts)
+        if axis is None:
+            x = core.ops.reshape(x, 'shape', [-1, 1])
+            axis = [0]
+        out = core.ops.roll(x, 'axis', axis, 'shifts', shifts)
         return core.ops.reshape(out, 'shape', origin_shape)
 
-    out = helper.create_variable_for_type_inference(input.dtype)
+    out = helper.create_variable_for_type_inference(x.dtype)
 
-    if dims is None:
-        input = reshape(input, shape=[-1, 1])
-        dims = [0]
+    if axis is None:
+        x = reshape(x, shape=[-1, 1])
+        axis = [0]
 
     helper.append_op(
         type='roll',
-        inputs={'X': input},
+        inputs={'X': x},
         outputs={'Out': out},
-        attrs={'dims': dims,
+        attrs={'axis': axis,
                'shifts': shifts})
     out = reshape(out, shape=origin_shape, inplace=True)
     return out
diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py
index 7cc19186d00682d9b0bab473b013b8def3dd1786..b4a9c7a468e2f61c79082a746cb319975d99a441 100644
--- a/python/paddle/tensor/math.py
+++ b/python/paddle/tensor/math.py
@@ -191,8 +191,8 @@ Examples:
 @templatedoc()
 def pow(input, exponent, out=None, name=None):
     """
-	:alias_main: paddle.pow
-	:alias: paddle.pow,paddle.tensor.pow,paddle.tensor.math.pow
+	:alias_main: paddle.pow
+	:alias: paddle.pow,paddle.tensor.pow,paddle.tensor.math.pow
 
     This is Pow Activation Operator.
 
@@ -260,94 +260,6 @@ def pow(input, exponent, out=None, name=None):
     return out
 
 
-def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, out=None, name=None):
-    """
-	:alias_main: paddle.mul
-	:alias: paddle.mul,paddle.tensor.mul,paddle.tensor.math.mul
-
-    Mul Operator.
-    This operator is used to perform matrix multiplication for input $x$ and $y$.
-    The equation is:
-
-    ..  math::
-        Out = x * y
-
-    Both the input $x$ and $y$ can carry the LoD (Level of Details) information, or not. 
-    But the output only shares the LoD information with input $x$.
-
-    Args:
-        x (Variable): The first input Tensor/LoDTensor of mul_op.
-        y (Variable): The second input Tensor/LoDTensor of mul_op.
-        x_num_col_dims (int, optional): The mul_op can take tensors with more than two dimensions as its inputs. 
-            If the input $x$ is a tensor with more than two dimensions, $x$ will be flattened into a two-dimensional 
-            matrix first. The flattening rule is: the first `num_col_dims` will be flattened to form the first 
-            dimension of the final matrix (the height of the matrix), and the rest `rank(x) - num_col_dims` 
-            dimensions are flattened to form the second dimension of the final matrix (the width of the matrix). 
-            As a result, height of the flattened matrix is equal to the product of $x$'s first `x_num_col_dims` dimensions' 
-            sizes, and width of the flattened matrix is equal to the product of $x$'s last `rank(x) - num_col_dims` 
-            dimensions' size. For example, suppose $x$ is a 6-dimensional tensor with the shape [2, 3, 4, 5, 6], 
-            and `x_num_col_dims` = 3. Thus, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30]. Default is 1. 
-        y_num_col_dims (int, optional): The mul_op can take tensors with more than two dimensions as its inputs. If the 
-            input $y$ is a tensor with more than two dimensions, $y$ will be flattened into a two-dimensional matrix first. 
-            The attribute `y_num_col_dims` determines how $y$ is flattened. See comments of `x_num_col_dims` for more details. 
-            Default is 1. 
-        out(Variable, optinal): The Variable that stores results of the operation. If out is None, 
-            a new Variable will be created to store the results.
-        name (str, optional): Name of the output. Normally there is no need for user to set this property. 
-            For more information, please refer to :ref:`api_guide_Name`. Default is None. If both of out and name are not None, 
-            the output name will be same as out. 
-
-    Returns:
-        Variable(Tensor/LoDTensor): The output Tensor/LoDTensor of mul op.
-
-    Examples:
-        ..  code-block:: python
-            
-            import paddle
-            import paddle.fluid as fluid
-            dataX = fluid.data(name="dataX", shape=[2, 5], dtype="float32")
-            dataY = fluid.data(name="dataY", shape=[5, 3], dtype="float32")
-            
-            res = fluid.data(name="output", shape=[2, 3], dtype="float32")
-            output = paddle.mul(dataX, dataY,
-                                      x_num_col_dims = 1,
-                                      y_num_col_dims = 1, 
-                                      out=res)
-            
-
-    """
-    inputs = {"X": [x], "Y": [y]}
-    attrs = {"x_num_col_dims": x_num_col_dims, "y_num_col_dims": y_num_col_dims}
-    if in_dygraph_mode():
-        outs = core.ops.mul(inputs, attrs)
-        return outs['Out'][0]
-
-    helper = LayerHelper("mul", **locals())
-    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'mul')
-    check_variable_and_dtype(y, 'y', ['float16', 'float32', 'float64'], 'mul')
-
-    if out is None:
-        out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    else:
-        check_dtype(
-            out.dtype, out.name,
-            convert_dtype(x.dtype), 'mul',
-            '(The out data type in pow must be the same with input data type.)')
-        if name:
-            warnings.warn(
-                "The output Variable name of the paddle.tensor.pow operation can only be given by parameter out or name.\
-                When parameter out and name are set at the same time, out has a higher priority than name. \
-                Finally, the output Variable name is same as the out name %s"
-                                                                              %
-                out.name,
-                category=UserWarning,
-                stacklevel=2)
-    helper.append_op(
-        type="mul", inputs={"X": x,
-                            "Y": y}, attrs=attrs, outputs={"Out": out})
-    return out
-
-
 __ops__noattr__ = [
     'atan',
     'sin',
@@ -411,9 +323,6 @@ def _elementwise_op(helper):
 
 def add(x, y, alpha=1, out=None, name=None):
     """
-	:alias_main: paddle.add
-	:alias: paddle.add,paddle.tensor.add,paddle.tensor.math.add
-
 Examples:
 
     .. code-block:: python
@@ -556,9 +465,6 @@ Examples:
 
 def div(x, y, out=None, name=None):
     """
-	:alias_main: paddle.div
-	:alias: paddle.div,paddle.tensor.div,paddle.tensor.math.div
-
 Examples:
 
     .. code-block:: python
@@ -681,6 +587,9 @@ for func in [
     proto_dict = {'add': 'elementwise_add', 'div': 'elementwise_div'}
     op_proto = OpProtoHolder.instance().get_op_proto(proto_dict[func.__name__])
     if func.__name__ in ['add']:
+        alias_main = ':alias_main: paddle.%(func)s' % {'func': func.__name__}
+        alias = ':alias: paddle.%(func)s, paddle.tensor.%(func)s, paddle.tensor.math.%(func)s' % {'func': func.__name__}
+
         additional_args_lines = [
             "alpha (int|float, optional): The alpha factor of the input. Default is 1. If alpha is not 1, the equation becomes Out = X + alpha * Y.",
             "out (Variable, optinal): The Variable that stores results of the operation. Default is None. If out is None, \
@@ -700,7 +609,7 @@ for func in [
             :ref:`api_guide_Name` "
         ]
 
-    func.__doc__ = _generate_doc_string_(
+    func.__doc__ = alias_main + """\n""" + alias + """\n""" + _generate_doc_string_(
         op_proto,
         additional_args_lines=additional_args_lines,
         skip_attrs_set={"x_data_format", "y_data_format", "axis",
@@ -709,8 +618,8 @@ for func in [
 
 def sum(input, dim=None, dtype=None, keep_dim=False, name=None):
     """
-	:alias_main: paddle.sum
-	:alias: paddle.sum,paddle.tensor.sum,paddle.tensor.math.sum
+	:alias_main: paddle.sum
+	:alias: paddle.sum,paddle.tensor.sum,paddle.tensor.math.sum
 
     Computes the sum of tensor elements over the given dimension.
 
@@ -814,8 +723,8 @@ def sum(input, dim=None, dtype=None, keep_dim=False, name=None):
 @templatedoc(op_type="sum")
 def elementwise_sum(inputs, name=None):
     """
-	:alias_main: paddle.elementwise_sum
-	:alias: paddle.elementwise_sum,paddle.tensor.elementwise_sum,paddle.tensor.math.elementwise_sum
+	:alias_main: paddle.elementwise_sum
+	:alias: paddle.elementwise_sum,paddle.tensor.elementwise_sum,paddle.tensor.math.elementwise_sum
 
     ${comment}
     
@@ -912,8 +821,8 @@ def elementwise_sum(inputs, name=None):
 
 def mm(input, mat2, out=None, name=None):
     """
-	:alias_main: paddle.mm
-	:alias: paddle.mm,paddle.tensor.mm,paddle.tensor.math.mm
+	:alias_main: paddle.mm
+	:alias: paddle.mm,paddle.tensor.mm,paddle.tensor.math.mm
 
     Applies matrix multiplication to two tensors.
 
@@ -1017,8 +926,8 @@ def mm(input, mat2, out=None, name=None):
 
 def addmm(input, x, y, alpha=1.0, beta=1.0, name=None):
     """
-	:alias_main: paddle.addmm
-	:alias: paddle.addmm,paddle.tensor.addmm,paddle.tensor.math.addmm
+	:alias_main: paddle.addmm
+	:alias: paddle.addmm,paddle.tensor.addmm,paddle.tensor.math.addmm
 
     **addmm**
 
@@ -1086,8 +995,8 @@ def addmm(input, x, y, alpha=1.0, beta=1.0, name=None):
 
 def logsumexp(x, dim=None, keepdim=False, out=None, name=None):
     """
-	:alias_main: paddle.logsumexp
-	:alias: paddle.logsumexp,paddle.tensor.logsumexp,paddle.tensor.math.logsumexp
+	:alias_main: paddle.logsumexp
+	:alias: paddle.logsumexp,paddle.tensor.logsumexp,paddle.tensor.math.logsumexp
 
     This operator calculates the log of the sum of exponentials of the input Tensor.
 
@@ -1157,8 +1066,8 @@ def logsumexp(x, dim=None, keepdim=False, out=None, name=None):
 
 def inverse(input, out=None, name=None):
     """
-	:alias_main: paddle.inverse
-	:alias: paddle.inverse,paddle.tensor.inverse,paddle.tensor.math.inverse
+	:alias_main: paddle.inverse
+	:alias: paddle.inverse,paddle.tensor.inverse,paddle.tensor.math.inverse
 
     Takes the inverse of the square matrix. A square matrix is a matrix with
     the same number of rows and columns. The input can be a square matrix
@@ -1232,8 +1141,8 @@ def inverse(input, out=None, name=None):
 
 def max(input, dim=None, keep_dim=False, out=None, name=None):
     """
-	:alias_main: paddle.max
-	:alias: paddle.max,paddle.tensor.max,paddle.tensor.math.max
+	:alias_main: paddle.max
+	:alias: paddle.max,paddle.tensor.max,paddle.tensor.math.max
 
     Computes the maximum of tensor elements over the given dimension.
 
@@ -1312,8 +1221,8 @@ def max(input, dim=None, keep_dim=False, out=None, name=None):
 
 def min(input, dim=None, keep_dim=False, out=None, name=None):
     """
-	:alias_main: paddle.min
-	:alias: paddle.min,paddle.tensor.min,paddle.tensor.math.min
+	:alias_main: paddle.min
+	:alias: paddle.min,paddle.tensor.min,paddle.tensor.math.min
 
     Computes the minimum of tensor elements over the given dimension.
 
@@ -1389,19 +1298,16 @@ def min(input, dim=None, keep_dim=False, out=None, name=None):
     return out
 
 
-def log1p(x, out=None, name=None):
+def log1p(x, name=None):
     """
-	:alias_main: paddle.log1p
-	:alias: paddle.log1p,paddle.tensor.log1p,paddle.tensor.math.log1p
+	:alias_main: paddle.log1p
+	:alias: paddle.log1p,paddle.tensor.log1p,paddle.tensor.math.log1p
 
     Calculates the natural log of the given input tensor, element-wise.
     .. math::
         Out = \\ln(x+1)
     Args:
         x (Variable): Input LoDTensor or Tensor. Must be one of the following types: float32, float64.
-        out(Variable, optional): Optional output which can be any created 
-            Variable that meets the requirements to store the result of operation.
-            if out is None, a new Varibale will be create to store the result.
         name(str, optional): The default value is None.  Normally there is no need for 
             user to set this property.  For more information, please refer to :ref:`api_guide_Name`
     Returns:
@@ -1430,15 +1336,15 @@ def log1p(x, out=None, name=None):
     inputs = {'X': [x]}
     helper = LayerHelper('log1p', **locals())
     dtype = helper.input_dtype(input_param_name='x')
-    if out is None:
-        out = helper.create_variable_for_type_inference(dtype)
+    out = helper.create_variable_for_type_inference(dtype)
     helper.append_op(type="log1p", inputs={"X": x}, outputs={"Out": out})
     return out
 
+
 def addcmul(input, tensor1, tensor2, value=1.0, out=None, name=None):
     """
-	:alias_main: paddle.addcmul
-	:alias: paddle.addcmul,paddle.tensor.addcmul,paddle.tensor.math.addcmul
+	:alias_main: paddle.addcmul
+	:alias: paddle.addcmul,paddle.tensor.addcmul,paddle.tensor.math.addcmul
 
     Calculate the element-wise multiplication of tensor1 and tensor2,
     then multiply the result by value, and add it to input. The shape of input,
@@ -1486,8 +1392,8 @@ def addcmul(input, tensor1, tensor2, value=1.0, out=None, name=None):
 
 def clamp(input, min=None, max=None, output=None, name=None):
     """
-	:alias_main: paddle.clamp
-	:alias: paddle.clamp,paddle.tensor.clamp,paddle.tensor.math.clamp
+	:alias_main: paddle.clamp
+	:alias: paddle.clamp,paddle.tensor.clamp,paddle.tensor.math.clamp
 
     **clampe layer**
 
@@ -1572,30 +1478,30 @@ def clamp(input, min=None, max=None, output=None, name=None):
 
     return output
 
-def trace(input, offset=0, dim1=0, dim2=1, out=None, name=None):
+def trace(x, offset=0, axis1=0, axis2=1, name=None):
     """
-	:alias_main: paddle.trace
-	:alias: paddle.trace,paddle.tensor.trace,paddle.tensor.math.trace
+	:alias_main: paddle.trace
+	:alias: paddle.trace,paddle.tensor.trace,paddle.tensor.math.trace
 
-    This OP computes the sum along diagonals of the input tensor.
+    This OP computes the sum along diagonals of the input tensor x.
     
-    If ``input`` is 2D, returns the sum of diagonal. 
+    If ``x`` is 2D, returns the sum of diagonal. 
 
-    If ``input`` has larger dimensions, then returns an tensor of diagonals sum, diagonals be taken from
-    the 2D planes specified by dim1 and dim2. By default, the 2D planes formed by the first and second dimensions 
-    of the input tensor.
+    If ``x`` has larger dimensions, then returns an tensor of diagonals sum, diagonals be taken from
+    the 2D planes specified by axis1 and axis2. By default, the 2D planes formed by the first and second axes 
+    of the input tensor x.
 
-    The argument ``offset`` determines where diagonals are taken from input tensor:
+    The argument ``offset`` determines where diagonals are taken from input tensor x:
 
     - If offset = 0, it is the main diagonal.
     - If offset > 0, it is above the main diagonal.
     - If offset < 0, it is below the main diagonal.
     
     Args:
-        input(Variable): The input tensor. Must be at least 2-dimensional. The input data type should be float32, float64, int32, int64.
-        offset(int, optional): Which diagonals in input tensor will be taken. Default: 0 (main diagonals).
-        dim1(int, optional): The first dimension with respect to take diagonal. Default: 0.
-        dim2(int, optional): The second dimension with respect to take diagonal. Default: 1.
+        x(Variable): The input tensor x. Must be at least 2-dimensional. The input data type should be float32, float64, int32, int64.
+        offset(int, optional): Which diagonals in input tensor x will be taken. Default: 0 (main diagonals).
+        axis1(int, optional): The first axis with respect to take diagonal. Default: 0.
+        axis2(int, optional): The second axis with respect to take diagonal. Default: 1.
         name (str, optional): Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. Default: None.
 
     Returns:
@@ -1605,74 +1511,71 @@ def trace(input, offset=0, dim1=0, dim2=1, out=None, name=None):
         .. code-block:: python
 
             import paddle
-            import paddle.fluid.dygraph as dg
             import numpy as np
             
             case1 = np.random.randn(2, 3).astype('float32')
             case2 = np.random.randn(3, 10, 10).astype('float32')
             case3 = np.random.randn(3, 10, 5, 10).astype('float32')
             
-            with dg.guard():
-                case1 = dg.to_variable(case1)
-                case2 = dg.to_variable(case2)
-                case3 = dg.to_variable(case3)
-                data1 = paddle.trace(case1) # data1.shape = [1]
-                data2 = paddle.trace(case2, offset=1, dim1=1, dim2=2) # data2.shape = [3]
-                data3 = paddle.trace(case3, offset=-3, dim1=1, dim2=-1) # data2.shape = [3, 5]
+            paddle.enable_imperative()
+
+            case1 = paddle.imperative.to_variable(case1)
+            case2 = paddle.imperative.to_variable(case2)
+            case3 = paddle.imperative.to_variable(case3)
+            data1 = paddle.trace(case1) # data1.shape = [1]
+            data2 = paddle.trace(case2, offset=1, axis1=1, axis2=2) # data2.shape = [3]
+            data3 = paddle.trace(case3, offset=-3, axis1=1, axis2=-1) # data2.shape = [3, 5]
     """
-    inputs = {'Input': [input]}
-    attrs = {'offset': offset, 'dim1': dim1, 'dim2': dim2}
+    inputs = {'Input': [x]}
+    attrs = {'offset': offset, 'axis1': axis1, 'axis2': axis2}
 
     def __check_input(input, offset, dim1, dim2):
-        check_dtype(input.dtype, 'Input',
+        check_dtype(x.dtype, 'Input',
                     ['int32', 'int64', 'float16', 'float32', 'float64'],
                     'trace')
 
-        input_shape = list(input.shape)
+        input_shape = list(x.shape)
         assert len(input_shape) >= 2,                     \
-                "The input must be at least 2-dimensional, "   \
-                "But received Input's dimensional: %s.\n" %  \
+                "The x must be at least 2-dimensional, "   \
+                "But received Input x's dimensional: %s.\n" %  \
                 len(input_shape)
 
-        dim1_ = dim1 if dim1 >= 0 else len(input_shape) + dim1
-        dim2_ = dim2 if dim2 >= 0 else len(input_shape) + dim2
+        axis1_ = axis1 if axis1 >= 0 else len(input_shape) + axis1
+        axis2_ = axis2 if axis2 >= 0 else len(input_shape) + axis2
 
-        assert dim1_ < len(input_shape),     \
-            "The argument dim1 is out of range (expected to be in range of [%d, %d], but got %d).\n"  \
-            % (-(len(input_shape)), len(input_shape) - 1, dim1)
+        assert axis1_ < len(input_shape),     \
+            "The argument axis1 is out of range (expected to be in range of [%d, %d], but got %d).\n"  \
+            % (-(len(input_shape)), len(input_shape) - 1, axis1)
 
-        assert dim2_ < len(input_shape),   \
-            "The argument dim2 is out of range (expected to be in range of [%d, %d], but got %d).\n"   \
-            % (-(len(input_shape)), len(input_shape) - 1, dim2)
+        assert axis2_ < len(input_shape),   \
+            "The argument axis2 is out of range (expected to be in range of [%d, %d], but got %d).\n"   \
+            % (-(len(input_shape)), len(input_shape) - 1, axis2)
 
 
-        assert  dim1_ != dim2_,   \
-               "dim1 and dim2 cannot be the same dimension." \
-                "But received dim1 = %d, dim2 = %d\n"%(dim1, dim2)
+        assert  axis1_ != axis2_,   \
+               "axis1 and axis2 cannot be the same axis." \
+                "But received axis1 = %d, axis2 = %d\n"%(axis1, axis2)
 
     if not in_dygraph_mode():
-        __check_input(input, offset, dim1, dim2)
+        __check_input(input, offset, axis1, axis2)
     helper = LayerHelper('trace', **locals())
 
-    if out is None:
-        out = helper.create_variable_for_type_inference(dtype=input.dtype)
-    else:
-        check_variable_and_dtype(out, 'out', ['float16', 'float32', 'float64', 'int32', 'int64'], 'trace')
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
 
     helper.append_op(
         type='trace',
-        inputs={'Input': [input]},
+        inputs={'Input': [x]},
         attrs={'offset': offset,
-               'dim1': dim1,
-               'dim2': dim2},
+               'axis1': axis1,
+               'axis2': axis2},
         outputs={'Out': [out]})
     return out
 
 @templatedoc(op_type="kron")
 def kron(x, y, out=None, name=None):
     """
-	:alias_main: paddle.kron
-	:alias: paddle.kron,paddle.tensor.kron,paddle.tensor.math.kron
+	:alias_main: paddle.kron
+	:alias: paddle.kron,paddle.tensor.kron,paddle.tensor.math.kron
 
 ${comment}
 
diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py
index feb2f6afd000d4d4c0595d360c2293f683343c13..8ef9dde0880795c08342d95d0f80cd2ea6e2b6dc 100644
--- a/python/paddle/tensor/random.py
+++ b/python/paddle/tensor/random.py
@@ -21,7 +21,7 @@ from ..fluid.framework import device_guard, in_dygraph_mode, _varbase_creator, V
 from ..fluid.layers.layer_function_generator import templatedoc
 from ..fluid.layer_helper import LayerHelper
 from ..fluid.data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype
-from ..fluid.layers import uniform_random, utils
+from ..fluid.layers import utils, uniform_random, gaussian_random
 from ..fluid.layers.tensor import fill_constant
 
 from ..fluid.io import shuffle  #DEFINE_ALIAS
@@ -37,205 +37,131 @@ __all__ = [
 ]
 
 
-def randint(low,
-            high=None,
-            shape=None,
-            out=None,
-            dtype=None,
-            device=None,
-            stop_gradient=False,
-            seed=0,
-            name=None):
+def randint(low=0, high=None, shape=[1], dtype=None, name=None):
     """
 	:alias_main: paddle.randint
 	:alias: paddle.randint,paddle.tensor.randint,paddle.tensor.random.randint
 
-    This function returns a Tensor filled with random integers from the "discrete uniform" distribution of the
-    specified data type in the interval [low, high). If high is None (the default), then results are from [0, low).
+    This function returns a Tensor filled with random integers from the
+    "discrete uniform" distribution of the specified data type in the interval
+    [low, high). If high is None (the default), then results are from [0, low).
 
     Args:
-        low (int): The lower bound on the range of random values to generate, the low is included in the range.
-            (unless high=None, in which case this parameter is one above the highest such integer).
-        high (int, optional): The upper bound on the range of random values to generate, the high is excluded 
-            in the range. Default None(see above for behavior if high=None).
-        shape (list|tuple|Variable, optional): The shape of the output Tensor,  if the shape is a list or tuple, 
-                                     its elements can be an integer
-                                     or a Tensor with the shape [1], and the type of the Tensor must be int32 or int64. 
-                                     If the shape is a Variable, it is a 1-D Tensor, and the type of the Tensor must be 
-                                     int32 or int64. Default is None, in which case the shape is [1].
-        out(Variable, optional): Optional output which can be any created 
-            Variable that meets the requirements to store the result of operation.
-            if out is None, a new Varibale will be create to store the result.
-        dtype(np.dtype|core.VarDesc.VarType|str, optional): Data type of the output Tensor
-            which can be int32, int64, if dytpe is `None`, the data
-            type of created Tensor is `int64`
-        device(str, optional): This parameter specifies that the Tensor is created 
-            on the GPU or CPU.
-        stop_gradient(bool, optional): Indicating if we stop gradient from current(out) Variable,
-            default value is False.
-        seed (int, optional): Random seed used for permute samples. If seed is 
-            equal to 0, it means use a seed generated by the system. Note that 
-            if seed is not 0, this operator will always generate the same random 
-            permutation every time. Default: 0.
-        name(str, optional): The default value is None.  Normally there is no need for user to set this
-            property.  For more information, please refer to :ref:`api_guide_Name`.
+        low (int): The lower bound on the range of random values to generate,
+            the low is included in the range.(unless high=None, in which case
+            this parameter is one above the highest such integer). Default is 0.
+        high (int, optional): The upper bound on the range of random values to
+            generate, the high is excluded in the range. Default is None(see
+            above for behavior if high=None).
+        shape (list|tuple|Variable, optional): The shape of the output Tensor,
+            if the shape is a list or tuple, its elements can be an integer or
+            a Tensor with the shape [1], and the type of the Tensor must be
+            int32 or int64. If the shape is a Variable, it is a 1-D Tensor,
+            and the type of the Tensor must be int32 or int64. Default is None.
+        dtype(np.dtype|core.VarDesc.VarType|str, optional): Data type of the
+            output Tensor which can be int32, int64. If dtype is `None`, the
+            data type of created Tensor is `int64`
+        name(str, optional): The default value is None.  Normally there is no
+            need for user to set this property.  For more information, please
+            refer to :ref:`api_guide_Name`.
 
     Returns: 
         Variable: A Tensor of the specified shape filled with random integers.
 
     Raises:
-        TypeError: Randint's low must less then high.
+        TypeError: If shape's type is not list, tuple or Variable.
+        TypeError: If dtype is not int32 or int64.
+        ValueError: If low is not large then high; If low is 0, and high is None.
 
     Examples:
         .. code-block:: python
 
-            import paddle
-            import paddle.fluid as fluid
-
-            # example 1:
-            # attr shape is a list which doesn't contain tensor Variable.
-            result_1 = paddle.randint(low=-5, high=5, shape=[3, 4], dtype="int64")
-
-            # example 2:
-            # attr shape is a list which contains tensor Variable.
-            dim_1 = fluid.layers.fill_constant([1],"int64",3)
-            dim_2 = fluid.layers.fill_constant([1],"int32",5)
-            result_2 = paddle.randint(low=-5, high=5, shape=[dim_1, dim_2], dtype="int32")
-
-            # example 3:
-            # attr shape is a Variable, the data type must be int64 or int32.
-            var_shape = fluid.data(name='var_shape', shape=[2], dtype="int64")
-            result_3 = paddle.randint(low=-5, high=5, shape=var_shape, dtype="int32")
-            var_shape_int32 = fluid.data(name='var_shape_int32', shape=[2], dtype="int32")
-            result_4 = paddle.randint(low=-5, high=5, shape=var_shape_int32, dtype="int64")
-
-            # example 4:
-            # Input only one parameter
-            # low=0, high=10, shape=[1], dtype='int64'
-            result_4 = paddle.randint(10)
-     """
-
-    def get_new_shape_tensor(list_shape):
-        new_shape_tensor = []
-        for dim in list_shape:
-            if isinstance(dim, Variable):
-                dim.stop_gradient = True
-                new_shape_tensor.append(dim)
-            else:
-                assert isinstance(dim, int) or isinstance(dim, long)
-                temp_out = helper.create_variable_for_type_inference('int64')
-                fill_constant([1], 'int64', dim, force_cpu=True, out=temp_out)
-                new_shape_tensor.append(temp_out)
-        return new_shape_tensor
-
-    def get_attr_shape(list_shape):
-        unk_dim_idx = -1
-        attrs_shape = []
-        for dim_idx, dim_size in enumerate(list_shape):
-            if isinstance(dim_size, Variable):
-                attrs_shape.append(-1)
-            else:
-                attrs_shape.append(dim_size)
-                assert dim_size > 0, (
-                    "Each dimension size given in shape must not be negative "
-                    "except one unknown dimension.")
-        return attrs_shape
+        import paddle
+        import numpy as np
+
+        paddle.enable_imperative()
+
+        # example 1:
+        # attr shape is a list which doesn't contain tensor Variable.
+        result_1 = paddle.randint(low=-5, high=5, shape=[3])
+        # [0 -3 2]
+
+        # example 2:
+        # attr shape is a list which contains tensor Variable.
+        dim_1 = paddle.fill_constant([1],"int64",2)
+        dim_2 = paddle.fill_constant([1],"int32",3)
+        result_2 = paddle.randint(low=-5, high=5, shape=[dim_1, dim_2], dtype="int32")
+        print(result_2.numpy())
+        # [[ 0 -1 -3]
+        #  [ 4 -2  0]]
+
+        # example 3:
+        # attr shape is a Variable
+        var_shape = paddle.imperative.to_variable(np.array([3]))
+        result_3 = paddle.randint(low=-5, high=5, shape=var_shape)
+        # [-2 2 3]
+
+        # example 4:
+        # data type is int32
+        result_4 = paddle.randint(low=-5, high=5, shape=[3], dtype='int32')
+        # [-5 4 -4]
+
+        # example 5:
+        # Input only one parameter
+        # low=0, high=10, shape=[1], dtype='int64'
+        result_5 = paddle.randint(10)
+        # [7]
 
+    """
+    if high is None:
+        high = low
+        low = 0
     if dtype is None:
         dtype = 'int64'
-    check_dtype(dtype, 'dtype', ['int32', 'int64'], 'randint')
-
-    inputs = dict()
-    attrs = dict()
-
-    if shape is None:
-        shape = [1]
-        assert len(shape) > 0, ("The size of argument(shape) can't be zero.")
-
-    helper = LayerHelper("randint", **locals())
+    if not isinstance(dtype, core.VarDesc.VarType):
+        dtype = convert_np_dtype_to_dtype_(dtype)
 
     if in_dygraph_mode():
-        attrs['shape'] = shape
-    else:
-        if isinstance(shape, Variable):
-            shape.stop_gradient = True
-            inputs["ShapeTensor"] = shape
-        elif isinstance(shape, (list, tuple)):
-            assert len(shape) > 0, (
-                "The size of argument(shape) can't be zero.")
-            if utils._contain_var(shape):
-                inputs['ShapeTensorList'] = get_new_shape_tensor(shape)
-            else:
-                attrs["shape"] = get_attr_shape(shape)
-    check_type(shape, 'shape', (list, tuple, Variable), 'randint')
+        shape = utils._convert_shape_to_list(shape)
+        return core.ops.randint('shape', shape, 'low', low, 'high', high,
+                                'seed', 0, 'dtype', dtype)
 
-    if high is None:
-        high = low
-        low = 0
-    attrs['low'] = low
-    attrs['high'] = high
-    attrs['seed'] = seed
-    if (low >= high):
+    check_type(shape, 'shape', (list, tuple, Variable), 'randint')
+    check_dtype(dtype, 'dtype', ['int32', 'int64'], 'randint')
+    if low >= high:
         raise ValueError(
             "randint's low must less then high, but received low = {0}, "
             "high = {1}".format(low, high))
 
-    if out is None:
-        if name is None:
-            out = helper.create_variable_for_type_inference(dtype=dtype)
-        else:
-            out = helper.create_variable(
-                name=name, dtype=dtype, persistable=False)
-    else:
-        check_dtype(dtype, 'dtype',
-                    convert_dtype(out.dtype), 'randint',
-                    "(The dtype in randint must be the same with out's dtype.)")
-    attrs['dtype'] = out.dtype
-    out.stop_gradient = stop_gradient
-
-    if device is None:
-        helper.append_op(
-            type='randint', inputs=inputs, outputs={'Out': out}, attrs=attrs)
-    else:
-        with device_guard(device):
-            helper.append_op(
-                type='randint',
-                inputs=inputs,
-                outputs={'Out': out},
-                attrs=attrs)
+    inputs = dict()
+    attrs = {'low': low, 'high': high, 'seed': 0, 'dtype': dtype}
+    utils._get_shape_tensor_inputs(
+        inputs=inputs, attrs=attrs, shape=shape, op_type='randint')
+
+    helper = LayerHelper("randint", **locals())
+    out = helper.create_variable_for_type_inference(dtype=dtype)
+    helper.append_op(
+        type='randint', inputs=inputs, outputs={'Out': out}, attrs=attrs)
     return out
 
 
-def randn(shape,
-          out=None,
-          dtype=None,
-          device=None,
-          stop_gradient=True,
-          name=None):
+def randn(shape, dtype=None, name=None):
     """
 	:alias_main: paddle.randn
 	:alias: paddle.randn,paddle.tensor.randn,paddle.tensor.random.randn
 
     This function returns a tensor filled with random numbers from a normal 
-    distribution with mean 0 and variance 1 (also called the standard normal
+    distribution with mean 0 and standard deviation 1 (also called the standard normal
     distribution).
 
     Args:
-        shape(list|tuple): Shape of the generated random tensor.
-        out(Variable, optional): Optional output which can be any created Variable 
-            that meets the requirements to store the result of operation. If the 
-            out is `None`, a new Variable will be returned to store the result.
-            Default is None.
-        dtype(np.dtype|core.VarDesc.VarType|str, optional): Data type of the output 
-            tensor, which can be float32, float64. if dtype is `None` , the data 
-            type of output tensor is `float32` .
-            Default is None.
-        device(str, optional): Specific the output variable to be saved in cpu
-            or gpu memory. Supported None, 'cpu', 'gpu'. If it is None, the output
-            variable will be automatically assigned devices. 
-            Default: None.
-        stop_gradient(bool, optional): Indicating if we stop gradient from current(out) 
-            Variable. Default is True.
+        shape(list|tuple|Variable): Shape of the Tensor to be created. The data
+            type is ``int32`` or ``int64`` . If ``shape`` is a list or tuple,
+            the elements of it should be integers or Tensors with shape [1]. If
+            ``shape`` is a Variable, it should be an 1-D Tensor .
+        dtype(np.dtype|core.VarDesc.VarType|str, optional): Data type of the output
+            tensor, which can be float32, float64. If dtype is `None` , the data
+            type of output tensor is `float32` . Default is None.
         name(str, optional): Normally there is no need for user to set this property.
             For more information, please refer to :ref:`api_guide_Name` .
             Default is None.
@@ -244,85 +170,55 @@ def randn(shape,
         Random tensor whose data is drawn from a standard normal distribution,
         dtype: flaot32 or float64 as specified.
 
-    Return type:
-        Variable
+    Return type: Variable
 
     Raises:
-        TypeError: If the type of `shape` is not list or tuple.
+        TypeError: If the type of `shape` is not Variable, list or tuple.
         TypeError: If the data type of `dtype` is not float32 or float64.
         ValueError: If the length of `shape` is not bigger than 0.
 
     Examples:
         .. code-block:: python
 
-            # declarative mode
-            import paddle
-            import paddle.fluid as fluid
-
-            data = paddle.randn([2, 4])
-            place = fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            res, = exe.run(fluid.default_main_program(), feed={}, fetch_list=[data])
-            print(res)
-            # [[-1.4187592   0.7368311  -0.53748125 -0.0146909 ]
-            #  [-0.66294265 -1.3090698   0.1898754  -0.14065823]]
-
-        .. code-block:: python
+        import paddle
+        import numpy as np
+
+        paddle.enable_imperative()
+
+        # example 1: attr shape is a list which doesn't contain tensor Variable.
+        result_1 = paddle.randn(shape=[2, 3])
+        # [[-2.923464    0.11934398 -0.51249987]
+        #  [ 0.39632758  0.08177969  0.2692008 ]]
+
+        # example 2: attr shape is a list which contains tensor Variable.
+        dim_1 = paddle.fill_constant([1], "int64", 2)
+        dim_2 = paddle.fill_constant([1], "int32", 3)
+        result_2 = paddle.randn(shape=[dim_1, dim_2, 2])
+        # [[[-2.8852394  -0.25898588]
+        #   [-0.47420555  0.17683524]
+        #   [-0.7989969   0.00754541]]
+        #  [[ 0.85201347  0.32320443]
+        #   [ 1.1399018   0.48336947]
+        #   [ 0.8086993   0.6868893 ]]]
+
+        # example 3: attr shape is a Variable, the data type must be int64 or int32.
+        var_shape = paddle.imperative.to_variable(np.array([2, 3]))
+        result_3 = paddle.randn(var_shape)
+        # [[-2.878077    0.17099959  0.05111201]
+        #  [-0.3761474  -1.044801    1.1870178 ]]
 
-            # imperative mode
-            import paddle
-            import paddle.fluid as fluid
-            import paddle.fluid.dygraph as dg
-
-            place = fluid.CPUPlace()
-            with dg.guard(place) as g:
-                x = paddle.randn([2, 4])
-                x_np = x.numpy()
-                print(x_np)
-                # [[ 1.5149173  -0.26234224 -0.592486    1.4523455 ]
-                #  [ 0.04581212 -0.85345626  1.1687907  -0.02512913]]
     """
-    helper = LayerHelper("randn", **locals())
-    check_type(shape, 'shape', (list, tuple), 'randn')
-    assert len(shape) > 0, ("The size of argument(shape) can't be zero.")
-
     if dtype is None:
         dtype = 'float32'
 
-    check_dtype(dtype, 'create data type', ['float32', 'float64'], 'randn')
-
-    if out is None:
-        out = helper.create_variable_for_type_inference(dtype=dtype)
-    else:
-        check_variable_and_dtype(out, 'out', [dtype], 'randn')
-
-    out.stop_gradient = stop_gradient
-
-    dtype = convert_np_dtype_to_dtype_(dtype)
-    seed = np.random.randint(0, 100)
-
-    with device_guard(device):
-        helper.append_op(
-            type='gaussian_random',
-            outputs={'Out': out},
-            attrs={
-                'shape': shape,
-                'mean': 0.0,
-                'std': 1.0,
-                'seed': seed,
-                'dtype': dtype,
-                'use_mkldnn': False
-            })
+    out = gaussian_random(
+        shape=shape, mean=0.0, std=1.0, seed=0, dtype=dtype, name=name)
+    out.stop_gradient = True
     return out
 
 
 @templatedoc()
-def randperm(n,
-             out=None,
-             dtype="int64",
-             device=None,
-             stop_gradient=True,
-             seed=0):
+def randperm(n, dtype="int64", name=None):
     """
 	:alias_main: paddle.randperm
 	:alias: paddle.randperm,paddle.tensor.randperm,paddle.tensor.random.randperm
@@ -330,23 +226,13 @@ def randperm(n,
     ${comment}
 
     Args:
-        n (int): The upper bound (exclusive), and it should be greater than 0.
-        out (Variable, optional): Optional output which can be any created 
-            Variable that meets the requirements to store the result of operation.
-            If out is None, a new Varibale will be create to store the result. 
-            Default: None.
-        dtype (np.dtype|core.VarDesc.VarType|str, optional): The type of the 
-            output Tensor. Supported data types: int64, int32. Default: int32.
-        device (str, optional): Specific the output variable to be saved in cpu
-            or gpu memory. Supported None, 'cpu', 'gpu'. If it is None, the output
-            variable will be automatically assigned devices.
-            Default: None.
-        stop_gradient (bool, optional): Whether grad should record operations 
-            on the returned tensor. Default: True.
-        seed (int, optional): Random seed used for permute samples. If seed is 
-            equal to 0, it means use a seed generated by the system. Note that 
-            if seed is not 0, this operator will always generate the same random 
-            permutation every time. Default: 0.
+        n(int): The upper bound (exclusive), and it should be greater than 0.
+        dtype(np.dtype|core.VarDesc.VarType|str, optional): The type of the 
+            output Tensor. Supported data types: int32, int64, float32, float64.
+            Default: int32.
+        name(str, optional): Normally there is no need for user to set this property.
+            For more information, please refer to :ref:`api_guide_Name` .
+            Default is None.
 
     Returns:
         ${out_comment}.
@@ -357,56 +243,38 @@ def randperm(n,
     Examples:
         .. code-block:: python
 
-	    import paddle
-	    import paddle.fluid as fluid
-
-	    num = 6
-	    is_use_gpu = False
+        import paddle
 
-	    data_1 = paddle.randperm(num)
-	    fluid.layers.Print(data_1)
+        paddle.enable_imperative()
 
-	    data_2 = paddle.randperm(num, dtype="int32", seed=1)
-	    fluid.layers.Print(data_2)
+        result_1 = paddle.randperm(5)
+        # [4 1 2 3 0]
 
-	    data_3 = paddle.randperm(num, stop_gradient=False, device="cpu")
-	    fluid.layers.Print(data_3)
-
-	    paddle.randperm(num, out=data_3)
-	    fluid.layers.Print(data_3)
-
-	    place = fluid.CUDAPlace(0) if is_use_gpu else fluid.CPUPlace()
-	    exe = fluid.Executor(place)
-	    exe.run(fluid.default_startup_program())
-	    exe.run()
+        result_2 = paddle.randperm(7, 'int32')
+        # [1 6 2 0 4 3 5]
  
     """
+    if not isinstance(dtype, core.VarDesc.VarType):
+        dtype = convert_np_dtype_to_dtype_(dtype)
+
+    if in_dygraph_mode():
+        return core.ops.randperm('n', n, 'seed', 0, 'dtype', dtype)
 
     if n < 1:
         raise ValueError("The input n should be greater than 0 in randperm op.")
-    check_dtype(dtype, 'dtype', ['int64', 'int32'], 'randperm')
-    dtype = convert_dtype(dtype)
-    if device not in [None, 'cpu', 'gpu']:
-        raise ValueError("The input device should in [None, 'cpu', 'gpu'].")
-    check_type(stop_gradient, 'stop_gradient', bool, 'randperm')
+    check_dtype(dtype, 'dtype', ['int64', 'int32', 'float32', 'float64'],
+                'randperm')
 
     helper = LayerHelper("randperm", **locals())
-    if out is None:
-        out = helper.create_variable_for_type_inference(dtype=dtype)
-    else:
-        check_variable_and_dtype(out, 'out', [dtype], 'randperm')
-    if stop_gradient:
-        out.stop_gradient = True
-    inputs = dict()
-    outputs = {'Out': [out]}
-    attrs = {'n': n, 'dtype': out.dtype, 'seed': seed}
-    with device_guard(device):
-        helper.append_op(
-            type='randperm', inputs=inputs, outputs=outputs, attrs=attrs)
+    out = helper.create_variable_for_type_inference(dtype)
+    attrs = {'n': n, 'dtype': dtype, 'seed': 0}
+    helper.append_op(
+        type='randperm', inputs={}, outputs={'Out': out}, attrs=attrs)
+    out.stop_gradient = True
     return out
 
 
-def rand(shape, out=None, dtype=None, device=None, stop_gradient=True):
+def rand(shape, dtype=None, name=None):
     """
 	:alias_main: paddle.rand
 	:alias: paddle.rand,paddle.tensor.rand,paddle.tensor.random.rand
@@ -424,22 +292,19 @@ def rand(shape, out=None, dtype=None, device=None, stop_gradient=True):
           result=[[0.8505902, 0.8397286]]
 
     Args:
-        shape(list|tuple|Variable): Shape of the Tensor to be created.
-                The data type is ``int32`` or ``int64`` . If ``shape`` is a list or tuple,
-                the elements of it should be integers or Tensors with shape [1].
-                If ``shape`` is a Variable, it should be an 1-D Tensor .
-        out(Variable, optional): Optional output which can be any created
-            Variable that meets the requirements to store the result of operation.
-            if out is None, a new Varibale will be create to store the result.
-        dtype(np.dtype|core.VarDesc.VarType|str, optional): Data type of the output tensor
-            which can be float32, float64, if dytpe is `None`, the data
-            type of created tensor is `float32`
-        device(str, optional): This parameter specifies that the Tensor is created
-            on the GPU or CPU.
-        stop_gradient(bool, optional): Indicating if we stop gradient from current(out) Variable,
-            default value is True.
+        shape(list|tuple|Variable): Shape of the Tensor to be created. The data
+            type is ``int32`` or ``int64`` . If ``shape`` is a list or tuple,
+            the elements of it should be integers or Tensors with shape [1]. If
+            ``shape`` is a Variable, it should be an 1-D Tensor .
+        dtype(np.dtype|core.VarDesc.VarType|str, optional): Data type of the
+            output tensor which can be float32, float64, if dytpe is `None`,
+            the data type of created tensor is `float32`
+        name(str, optional): The default value is None. Normally there is no
+            need for user to set this property. For more information, please
+            refer to :ref:`api_guide_Name`.
     Returns:
-        Variable: A Tensor of the specified shape filled with random numbers from a uniform distribution on the interval [0, 1).
+        Variable: A Tensor of the specified shape filled with random numbers
+        from a uniform distribution on the interval [0, 1).
 
     Raises:
         TypeError: The shape type should be list or tupple or Variable.
@@ -447,54 +312,36 @@ def rand(shape, out=None, dtype=None, device=None, stop_gradient=True):
     Examples:
         .. code-block:: python
 
-            import paddle
-            import paddle.fluid as fluid
-
-            # example 1:
-            # attr shape is a list which doesn't contain tensor Variable.
-            result_1 = paddle.rand(shape=[3, 4])
-
-            # example 2:
-            # attr shape is a list which contains tensor Variable.
-            dim_1 = fluid.layers.fill_constant([1],"int64",3)
-            dim_2 = fluid.layers.fill_constant([1],"int32",5)
-            result_2 = paddle.rand(shape=[dim_1, dim_2])
-
-            # example 3:
-            # attr shape is a Variable, the data type must be int64 or int32.
-            var_shape = fluid.data(name='var_shape', shape=[2], dtype="int64")
-            result_3 = paddle.rand(var_shape)
-            var_shape_int32 = fluid.data(name='var_shape_int32', shape=[2], dtype="int32")
-            result_4 = paddle.rand(var_shape_int32)
+        import paddle
+        import numpy as np
+
+        paddle.enable_imperative()
+        # example 1: attr shape is a list which doesn't contain tensor Variable.
+        result_1 = paddle.rand(shape=[2, 3])
+        # [[0.451152  , 0.55825245, 0.403311  ],
+        #  [0.22550228, 0.22106001, 0.7877319 ]]
+
+        # example 2: attr shape is a list which contains tensor Variable.
+        dim_1 = paddle.fill_constant([1], "int64", 2)
+        dim_2 = paddle.fill_constant([1], "int32", 3)
+        result_2 = paddle.rand(shape=[dim_1, dim_2, 2])
+        # [[[0.8879919  0.25788337]
+        #   [0.28826773 0.9712097 ]
+        #   [0.26438272 0.01796806]]
+        #  [[0.33633623 0.28654453]
+        #   [0.79109055 0.7305809 ]
+        #   [0.870881   0.2984597 ]]]
+
+        # example 3: attr shape is a Variable, the data type must be int64 or int32.
+        var_shape = paddle.imperative.to_variable(np.array([2, 3]))
+        result_3 = paddle.rand(var_shape)
+        # [[0.22920267 0.841956   0.05981819]
+        #  [0.4836288  0.24573246 0.7516129 ]]
+
     """
     if dtype is None:
         dtype = 'float32'
 
-    check_dtype(dtype, 'dtype', ['float32', 'float64'], 'rand')
-
-    check_type(shape, 'shape', (Variable, list, tuple), 'rand')
-    if isinstance(shape, Variable):
-        check_variable_and_dtype(shape, 'shape', ['int32', 'int64'], 'rand')
-    elif isinstance(shape, (list, tuple)):
-        for i, _shape in enumerate(shape):
-            if not isinstance(_shape, Variable):
-                check_type(_shape, '_shape', (int), 'rand')
-            else:
-                check_variable_and_dtype(_shape, 'shape[' + str(i) + ']',
-                                         ['int32', 'int64'], 'rand')
-
-    if device not in [None, 'cpu', 'gpu']:
-        raise ValueError(
-            "The input device should in [None, 'cpu', 'gpu'], but received {}".
-            format(device))
-
-    helper = LayerHelper("rand", **locals())
-    if out is None:
-        out = helper.create_variable_for_type_inference(dtype=dtype)
-    else:
-        check_variable_and_dtype(out, 'out', [dtype], 'rand')
-    out.stop_gradient = stop_gradient
-
-    with device_guard(device):
-        out = uniform_random(shape, dtype, min=0., max=1.0)
+    out = uniform_random(shape, dtype, min=0.0, max=1.0, name=name)
+    out.stop_gradient = True
     return out
diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py
index 59b8f1e765b266a2fda61dd645b9fdce96e2a400..d8874e47c357937020ffe2e392332599df6653c0 100644
--- a/python/paddle/tensor/search.py
+++ b/python/paddle/tensor/search.py
@@ -63,9 +63,9 @@ def argmax(input, axis=None, dtype=None, out=None, keepdims=False, name=None):
             Variable that meets the requirements to store the result of operation.
             if out is None, a new Varibale will be create to store the result. Defalut is None.
         keepdims(bool, optional): Keep the axis that do the select max.
-        name(str, optional): The name of output variable, normally there is no need for user to set this this property. 
-            Default value is None, the framework set the name of output variable.  
-
+        name(str, optional): The default value is None. Normally there is no
+            need for user to set this property. For more information, please
+            refer to :ref:`api_guide_Name`.
 
     Returns:
         Variable: A Tensor with data type int64.
@@ -135,7 +135,7 @@ def argmax(input, axis=None, dtype=None, out=None, keepdims=False, name=None):
     return out
 
 
-def index_select(input, index, dim=0):
+def index_select(x, index, axis=0, name=None):
     """
 	:alias_main: paddle.index_select
 	:alias: paddle.index_select,paddle.tensor.index_select,paddle.tensor.search.index_select
@@ -146,56 +146,60 @@ def index_select(input, index, dim=0):
     size as the length of `index`; other dimensions have the same size as in the `input` tensor. 
 
     Args:
-        input (Variable): The input tensor variable.
-        index (Variable): The 1-D tensor containing the indices to index.
-        dim (int): The dimension in which we index.
+        x (Variable): The input tensor variable.The dtype of x can be one of float32, float64, int32, int64.
+        index (Variable): The 1-D tensor containing the indices to index.the dtype of index can be int32 or int64.
+        axis (int, optional): The dimension in which we index. Default: if None, the axis is 0.
+        name(str, optional): The default value is None. Normally there is no
+            need for user to set this property. For more information, please
+            refer to :ref:`api_guide_Name`.
 
     Returns:
         Variable: A Tensor with same data type as `input`.
+    
+    Raises:
+        TypeError: x must be a Variable and the dtype of x must be one of  float32, float64, int32 and int64.
+        TypeError: index must be a Variable adn the dtype of index must be int32 or int64.
 
     Examples:
         .. code-block:: python
             import paddle
-            import paddle.fluid as fluid
             import numpy as np
 
+            paddle.enable_imperative()  # Now we are in imperative mode
             data = np.array([[1.0, 2.0, 3.0, 4.0],
                              [5.0, 6.0, 7.0, 8.0],
                              [9.0, 10.0, 11.0, 12.0]])
             data_index = np.array([0, 1, 1]).astype('int32')
 
-            with fluid.dygraph.guard():
-                x = fluid.dygraph.to_variable(data)
-                index = fluid.dygraph.to_variable(data_index)
-                out_z1 = paddle.index_select(x, index)
-                print(out_z1.numpy())
-                #[[1. 2. 3. 4.]
-                # [5. 6. 7. 8.]
-                # [5. 6. 7. 8.]]
-                out_z2 = paddle.index_select(x, index, dim=1)
-                print(out_z2.numpy())
-                #[[ 1.  2.  2.]
-                # [ 5.  6.  6.]
-                # [ 9. 10. 10.]]
+            x = paddle.imperative.to_variable(data)
+            index = paddle.imperative.to_variable(data_index)
+            out_z1 = paddle.index_select(x=x, index=index)
+            #[[1. 2. 3. 4.]
+            # [5. 6. 7. 8.]
+            # [5. 6. 7. 8.]]
+            out_z2 = paddle.index_select(x=x, index=index, axis=1)
+            #[[ 1.  2.  2.]
+            # [ 5.  6.  6.]
+            # [ 9. 10. 10.]]
     """
-    helper = LayerHelper("index_select", **locals())
+
     if in_dygraph_mode():
-        return core.ops.index_select(input, index, 'dim', dim)
+        return core.ops.index_select(x, index, 'dim', axis)
 
-    check_variable_and_dtype(input, 'x',
-                             ['float32', 'float64', 'int32', 'int64'],
-                             'paddle.tensor.search.index_sample')
+    helper = LayerHelper("index_select", **locals())
+    check_variable_and_dtype(x, 'x', ['float32', 'float64', 'int32', 'int64'],
+                             'paddle.tensor.search.index_select')
     check_variable_and_dtype(index, 'index', ['int32', 'int64'],
-                             'paddle.tensor.search.index_sample')
+                             'paddle.tensor.search.index_select')
 
-    out = helper.create_variable_for_type_inference(input.dtype)
+    out = helper.create_variable_for_type_inference(x.dtype)
 
     helper.append_op(
         type='index_select',
-        inputs={'X': input,
+        inputs={'X': x,
                 'Index': index},
         outputs={'Out': out},
-        attrs={'dim': dim})
+        attrs={'dim': axis})
     return out
 
 
diff --git a/python/requirements.txt b/python/requirements.txt
index 407a0dc5e201de5e44085ab079cb53e609bfd381..5e081f5e85b6e0f645991ab70874d04ab93e3106 100644
--- a/python/requirements.txt
+++ b/python/requirements.txt
@@ -7,7 +7,8 @@ matplotlib<=2.2.4 ; python_version<"3.6"
 scipy>=0.19.0, <=1.2.1 ; python_version<"3.5"
 nltk>=3.2.2, <=3.4 ; python_version<"3.5"
 matplotlib ; python_version>="3.6"
-scipy ; python_version>="3.5"
+scipy<=1.3.1 ; python_version=="3.5"
+scipy ; python_version>"3.5"
 nltk ; python_version>="3.5"
 rarfile
 Pillow
diff --git a/python/setup.py.in b/python/setup.py.in
index f1e9457c19db5c085eb24138c1450f468baa1ec4..ba61499d254f4850a89ec04a3cdcef0f8d5cb9d9 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -6,6 +6,7 @@ import shutil
 import sys
 import fnmatch
 import errno
+import platform
 
 from contextlib import contextmanager
 from setuptools import Command
@@ -142,6 +143,13 @@ packages=['paddle',
           'paddle.incubate',
           'paddle.incubate.complex',
           'paddle.incubate.complex.tensor',
+          'paddle.fleet',
+          'paddle.fleet.base',
+          'paddle.fleet.collective',
+          'paddle.fleet.dataset',
+          'paddle.fleet.metrics',
+          'paddle.fleet.parameter_server',
+          'paddle.fleet.proto',
           'paddle.framework',
           'paddle.fluid',
           'paddle.fluid.dygraph',
@@ -160,6 +168,7 @@ packages=['paddle',
           'paddle.fluid.contrib.slim.graph',
           'paddle.fluid.contrib.slim.prune',
           'paddle.fluid.contrib.slim.quantization',
+          'paddle.fluid.contrib.slim.quantization.imperative',
           'paddle.fluid.contrib.slim.distillation',
           'paddle.fluid.contrib.slim.nas',
           'paddle.fluid.contrib.slim.searcher',
@@ -193,6 +202,7 @@ packages=['paddle',
           'paddle.nn.initializer',
           'paddle.metric',
           'paddle.imperative',
+          'paddle.imperative.jit',
           'paddle.tensor',
           ]
 
@@ -262,6 +272,10 @@ else:
         shutil.copy('${OPENBLAS_SHARED_LIB}', libs_path)
         package_data['paddle.libs'] += ['openblas' + ext_name]
 
+if '${WITH_LITE}' == 'ON':
+    shutil.copy('${LITE_SHARED_LIB}', libs_path)
+    package_data['paddle.libs']+=['libpaddle_full_api_shared' + ext_name]
+
 if '${WITH_PSLIB}' == 'ON':
     shutil.copy('${PSLIB_LIB}', libs_path)
     if os.path.exists('${PSLIB_VERSION_PY}'):
@@ -310,8 +324,9 @@ if '${CMAKE_BUILD_TYPE}' == 'Release':
             command = "install_name_tool -id \"@loader_path/../libs/\" ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so'
         else:
             command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so'
-        if os.system(command) != 0:
-            raise Exception("patch ${FLUID_CORE_NAME}.%s failed, command: %s" % (ext_name, command))
+        if platform.machine() != 'aarch64':
+          if os.system(command) != 0:
+              raise Exception("patch ${FLUID_CORE_NAME}.%s failed, command: %s" % (ext_name, command))
 
 ext_modules = [Extension('_foo', ['stub.cc'])]
 if os.name == 'nt':
diff --git a/tools/check_api_approvals.sh b/tools/check_api_approvals.sh
index 0ef20b746c0dfbbc43df34d40d716a19da2f8d87..3bdcc4cad1ce30a07181b6b8bd3e3707b2c6468b 100644
--- a/tools/check_api_approvals.sh
+++ b/tools/check_api_approvals.sh
@@ -68,16 +68,16 @@ fi
 
 api_spec_diff=`python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API_DEV.spec.api  ${PADDLE_ROOT}/paddle/fluid/API_PR.spec.api` 
 if [ "$api_spec_diff" != "" ]; then
-    echo_line="You must have one RD (XiaoguangHu01 or lanxianghit) and one TPM (saxon-zh or Boyan-Liu or swtkiwi or Heeenrrry) approval for the api change for the management reason of API interface.\n"
+    echo_line="You must have one RD (XiaoguangHu01 or lanxianghit) and one TPM (saxon-zh or jzhang533 or swtkiwi or Heeenrrry or TCChenlong) approval for the api change for the management reason of API interface.\n"
     check_approval 1 46782768 47554610
     echo_line=""
-    check_approval 1 2870059 2870059 27208573 28379894
+    check_approval 1 2870059 29231 27208573 28379894 11935832
 fi
 
 api_doc_spec_diff=`python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API_DEV.spec.doc  ${PADDLE_ROOT}/paddle/fluid/API_PR.spec.doc` 
 if [ "$api_doc_spec_diff" != "" ]; then
-    echo_line="You must have one TPM (saxon-zh or Boyan-Liu or swtkiwi or Heeenrrry) approval for the api change for the management reason of API document.\n"
-    check_approval 1 31623103 2870059 27208573 28379894
+    echo_line="You must have one TPM (saxon-zh or jzhang533 or swtkiwi or Heeenrrry or TCChenlong) approval for the api change for the management reason of API document.\n"
+    check_approval 1 2870059 29231 27208573 28379894 11935832
 fi
 
 op_type_spec_diff=`python ${PADDLE_ROOT}/tools/check_op_register_type.py ${PADDLE_ROOT}/paddle/fluid/OP_TYPE_DEV.spec  ${PADDLE_ROOT}/paddle/fluid/OP_TYPE_PR.spec`
@@ -127,7 +127,7 @@ for API_FILE in ${API_FILES[*]}; do
           check_approval 1 45189361 6836917 43953930
       elif [ "${API_FILE}" == "python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py" ];then
           echo_line="You must have one RD (cryoco (Recommend), luotao1 or phlrain) approval for the python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py, which manages the white list of setting no_check_set of check_output. \n"
-          check_approval 1 12407750 26615455 6836917
+          check_approval 1 12407750 6836917 43953930
       elif [ "${API_FILE}" == "python/paddle/fluid/tests/unittests/white_list/check_op_sequence_instance_0_input_white_list.py" ]; then
           echo_line="You must have one RD (JepsonWong (Recommend), luotao1, phlrain) approval for the ${API_FILE}, which manages the white list of instance size 0 input for sequence op test. For more information, please refer to [https://github.com/PaddlePaddle/Paddle/wiki/It-is-required-to-include-LoDTensor-input-with-instance_size=0-in-sequence-OP-test]. \n"
           check_approval 1 16509038 6836917 43953930
@@ -141,8 +141,8 @@ for API_FILE in ${API_FILES[*]}; do
         echo_line="You must have one RD (Shixiaowei02 (Recommend), luotao1 or phlrain) approval for the python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py, which manages the white list of no_grad_set without value in operators. For more information, please refer to[https://github.com/PaddlePaddle/Paddle/wiki/It's-recommend-to-set-no_grad_set-to-be-None].\n"
         check_approval 1 39303645 6836917 43953930
       elif [ "${API_FILE}" == "tools/wlist.json" ];then
-        echo_line="You must have one TPM (saxon-zh or Boyan-Liu or swtkiwi or Heeenrrry) approval for the api whitelist for the tools/wlist.json.\n"
-        check_approval 1 31623103 2870059 27208573 28379894
+        echo_line="You must have one RD (lelelelelez (Recommend) or luotao1) approval for the api whitelist for the tools/wlist.json.\n"
+        check_approval 1 22937122 6836917
       else
           echo_line="You must have one RD (XiaoguangHu01,Xreki,luotao1,sneaxiy) approval for ${API_FILE}, which manages the underlying code for fluid.\n"
           check_approval 1 3048612 46782768 12538138 6836917 32832641
diff --git a/tools/count_api_without_ops.py b/tools/count_api_without_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..84dd9a6b2f63b7e5cb63c6ab4367a2591274d660
--- /dev/null
+++ b/tools/count_api_without_ops.py
@@ -0,0 +1,140 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+List all operator-raleated APIs that contains append_op but not core.ops.xx.
+
+Usage:
+    python ./count_api_without_ops.py paddle
+"""
+from __future__ import print_function
+
+import importlib
+import inspect
+import collections
+import sys
+import pydoc
+import hashlib
+import six
+import functools
+
+visited_modules = set()
+
+# APIs that should not be printed into API.spec 
+omitted_list = [
+    "paddle.fluid.LoDTensor.set",  # Do not know why it should be omitted
+    "paddle.fluid.io.ComposeNotAligned",
+    "paddle.fluid.io.ComposeNotAligned.__init__",
+]
+
+api_with_ops = []
+api_without_ops = []
+
+
+def queue_dict(member, cur_name):
+    if cur_name in omitted_list:
+        return
+
+    if inspect.isclass(member):
+        pass
+    else:
+        try:
+            source = inspect.getsource(member)
+            if source.find('append_op') != -1:
+                if source.find('core.ops') != -1:
+                    api_with_ops.append(cur_name)
+                else:
+                    api_without_ops.append(cur_name)
+
+        except Exception as e:  # special for PyBind method
+            pass
+
+
+def visit_member(parent_name, member):
+    cur_name = ".".join([parent_name, member.__name__])
+    if inspect.isclass(member):
+        queue_dict(member, cur_name)
+        for name, value in inspect.getmembers(member):
+            if hasattr(value, '__name__') and (not name.startswith("_") or
+                                               name == "__init__"):
+                visit_member(cur_name, value)
+    elif inspect.ismethoddescriptor(member):
+        return
+    elif callable(member):
+        queue_dict(member, cur_name)
+    elif inspect.isgetsetdescriptor(member):
+        return
+    else:
+        raise RuntimeError("Unsupported generate signature of member, type {0}".
+                           format(str(type(member))))
+
+
+def is_primitive(instance):
+    int_types = (int, long) if six.PY2 else (int, )
+    pritimitive_types = int_types + (float, str)
+    if isinstance(instance, pritimitive_types):
+        return True
+    elif isinstance(instance, (list, tuple, set)):
+        for obj in instance:
+            if not is_primitive(obj):
+                return False
+
+        return True
+    else:
+        return False
+
+
+def visit_all_module(mod):
+    mod_name = mod.__name__
+    if mod_name != 'paddle' and not mod_name.startswith('paddle.'):
+        return
+
+    if mod_name.startswith('paddle.fluid.core'):
+        return
+
+    if mod in visited_modules:
+        return
+
+    visited_modules.add(mod)
+
+    for member_name in (
+            name
+            for name in (mod.__all__ if hasattr(mod, "__all__") else dir(mod))
+            if not name.startswith("_")):
+        instance = getattr(mod, member_name, None)
+        if instance is None:
+            continue
+
+        if is_primitive(instance):
+            continue
+
+        if not hasattr(instance, "__name__"):
+            continue
+
+        if inspect.ismodule(instance):
+            visit_all_module(instance)
+        else:
+            visit_member(mod.__name__, instance)
+
+
+modules = sys.argv[1].split(",")
+for m in modules:
+    visit_all_module(importlib.import_module(m))
+
+print('api_with_ops:', len(api_with_ops))
+print('\n'.join(api_with_ops))
+
+print('\n==============\n')
+
+print('api_without_ops:', len(api_without_ops))
+print('\n'.join(api_without_ops))
diff --git a/tools/manylinux1/Dockerfile.Inference b/tools/manylinux1/Dockerfile.Inference
index e045fc52109e8e1e30ce62cb77cb0294be56e1a5..0ba180b894b22041afc3a0ce9eaa918a74a355c2 120000
--- a/tools/manylinux1/Dockerfile.Inference
+++ b/tools/manylinux1/Dockerfile.Inference
@@ -1 +1 @@
-Dockerfile.cuda10_cudnn7_gcc48_ubuntu16
\ No newline at end of file
+Dockerfile.cuda10_cudnn7_gcc8_ubuntu16
\ No newline at end of file
diff --git a/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc48_ubuntu16 b/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc48_ubuntu16
index 9a763059ace11cd1cef11eaa739de9b5f2a24a88..ffef02dba4614f7bbbe13ebc30b40438a52b4590 100644
--- a/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc48_ubuntu16
+++ b/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc48_ubuntu16
@@ -100,10 +100,13 @@ WORKDIR /home/setuptools-40.6.2
 RUN python setup.py build
 RUN python setup.py install
 WORKDIR /home
-RUN wget https://files.pythonhosted.org/packages/69/81/52b68d0a4de760a2f1979b0931ba7889202f302072cc7a0d614211bc7579/pip-18.0.tar.gz
-RUN tar -zxvf pip-18.0.tar.gz
+
+RUN wget https://files.pythonhosted.org/packages/69/81/52b68d0a4de760a2f1979b0931ba7889202f302072cc7a0d614211bc7579/pip-18.0.tar.gz && tar -zxvf pip-18.0.tar.gz
 WORKDIR pip-18.0
-RUN python setup.py install
+RUN python setup.py install && \
+  python3.7 setup.py install && \
+  python3.6 setup.py install && \
+  python3 setup.py install
 
 WORKDIR /home
 RUN rm Python-$version.tgz setuptools-40.6.2.zip pip-18.0.tar.gz && \
@@ -225,6 +228,9 @@ RUN wget https://paddle-ci.gz.bcebos.com/ccache-3.7.9.tar.gz && \
     make -j8 && make install && \
     ln -s /usr/local/ccache-3.7.9/bin/ccache /usr/local/bin/ccache
 
+RUN wget --no-check-certificate  -q https://paddle-edl.bj.bcebos.com/hadoop-2.7.7.tar.gz && \
+     tar -xzf  hadoop-2.7.7.tar.gz && mv hadoop-2.7.7 /usr/local/
+
 # Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service
 RUN mkdir /var/run/sshd
 RUN echo 'root:root' | chpasswd
diff --git a/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16 b/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16
index 3e1697fbd57cfe2c37b42b00e6300bf0279632c0..d421f5ec801438343b38914cce391ee43b3f53d0 100644
--- a/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16
+++ b/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16
@@ -41,6 +41,7 @@ RUN wget -q https://paddle-docker-tar.bj.bcebos.com/home/users/tianshuo/bce-pyth
   make -j8 && make install
 
 ENV PATH=/usr/local/gcc-8.2/bin:$PATH
+ENV LD_LIBRARY_PATH=/usr/local/gcc-8.2/lib64:$LD_LIBRARY_PATH
 RUN rm -rf /temp_gcc82 && rm -rf /gcc-8.2.0.tar.xz && rm -rf /gcc-8.2.0
 
 # Install Python3.6
@@ -107,10 +108,13 @@ WORKDIR /home/setuptools-40.6.2
 RUN python setup.py build
 RUN python setup.py install
 WORKDIR /home
-RUN wget https://files.pythonhosted.org/packages/69/81/52b68d0a4de760a2f1979b0931ba7889202f302072cc7a0d614211bc7579/pip-18.0.tar.gz
-RUN tar -zxvf pip-18.0.tar.gz
+
+RUN wget https://files.pythonhosted.org/packages/69/81/52b68d0a4de760a2f1979b0931ba7889202f302072cc7a0d614211bc7579/pip-18.0.tar.gz && tar -zxvf pip-18.0.tar.gz
 WORKDIR pip-18.0
-RUN python setup.py install
+RUN python setup.py install && \
+  python3.7 setup.py install && \
+  python3.6 setup.py install && \
+  python3 setup.py install
 
 WORKDIR /home
 RUN rm Python-$version.tgz setuptools-40.6.2.zip pip-18.0.tar.gz && \
@@ -228,6 +232,9 @@ RUN wget https://paddle-ci.gz.bcebos.com/ccache-3.7.9.tar.gz && \
     make -j8 && make install && \
     ln -s /usr/local/ccache-3.7.9/bin/ccache /usr/local/bin/ccache
 
+RUN wget --no-check-certificate  -q https://paddle-edl.bj.bcebos.com/hadoop-2.7.7.tar.gz && \
+     tar -xzf  hadoop-2.7.7.tar.gz && mv hadoop-2.7.7 /usr/local/
+
 # Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service
 RUN mkdir /var/run/sshd
 RUN echo 'root:root' | chpasswd
diff --git a/tools/manylinux1/Dockerfile.cuda9_cudnn7_gcc48_py35_centos6 b/tools/manylinux1/Dockerfile.cuda9_cudnn7_gcc48_py35_centos6
index a5afee246e350a124b7b6a6b7664bd00d980f183..1f972c583cb83c97c8f7bec96913a62ac32a1e5a 100644
--- a/tools/manylinux1/Dockerfile.cuda9_cudnn7_gcc48_py35_centos6
+++ b/tools/manylinux1/Dockerfile.cuda9_cudnn7_gcc48_py35_centos6
@@ -65,4 +65,7 @@ RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /o
 RUN wget -O /opt/swig-2.0.12.tar.gz https://sourceforge.net/projects/swig/files/swig/swig-2.0.12/swig-2.0.12.tar.gz/download && \
     cd /opt && tar xzf swig-2.0.12.tar.gz && cd /opt/swig-2.0.12 && ./configure && make && make install && cd /opt && rm swig-2.0.12.tar.gz
 
+RUN wget --no-check-certificate  -q https://paddle-edl.bj.bcebos.com/hadoop-2.7.7.tar.gz && \
+     tar -xzf  hadoop-2.7.7.tar.gz && mv hadoop-2.7.7 /usr/local/
+
 CMD ["bash", "/paddle/paddle/scripts/docker/build.sh"]
diff --git a/tools/sampcd_processor.py b/tools/sampcd_processor.py
index 086a36c3a80b57660a1973d34dbf2c399840a6cd..0ef3a63f54a0918ae13f17138339b13848458680 100644
--- a/tools/sampcd_processor.py
+++ b/tools/sampcd_processor.py
@@ -481,7 +481,10 @@ def get_filenames():
                 print("\nWARNING:----Exception in get api filename----\n")
                 print("\n" + api + ' module is ' + module + "\n")
             if filename != '':
-                if filename not in filenames:
+                # rm contrib file
+                if filename.startswith('../python/paddle/fluid/contrib'):
+                    pass
+                elif filename not in filenames:
                     filenames.append(filename)
             # get all methods
             method = ''