diff --git a/CMakeLists.txt b/CMakeLists.txt index 3c3e319fb7b8bae0523d7d2e1605680d59f193ce..446b21418a45f8a673f7c71de19d7e7d20790adb 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -63,8 +63,29 @@ if(WIN32) set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT") set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") + foreach(flag_var + CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE + CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO + CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE + CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO) + if(${flag_var} MATCHES "/MD") + string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") + endif() + endforeach(flag_var) endif() - + + # windows build turn off warnings. + foreach(flag_var + CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE + CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO + CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE + CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO) + string(REGEX REPLACE "/W[1-4]" " /W0 " ${flag_var} "${${flag_var}}") + endforeach(flag_var) + foreach(flag_var CMAKE_CXX_FLAGS CMAKE_C_FLAGS) + set(${flag_var} "${${flag_var}} /w") + endforeach(flag_var) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838 /MP") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838 /MP") message(STATUS "Using parallel compiling (/MP)") diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index b7a93cd9ee2160090c0142d62d96da72e4c58717..c78fe5f6c7fbd44e0820747f200e3e8168dc3783 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -16,6 +16,7 @@ else() set(paddle_known_gpu_archs8 "30 35 50 52 60 61") set(paddle_known_gpu_archs9 "30 35 50 52 60 61 70") set(paddle_known_gpu_archs10 "30 35 50 52 60 61 70 75") + 
set(paddle_known_gpu_archs11 "52 60 61 70 75 80") endif() ###################################################################################### @@ -106,6 +107,9 @@ function(select_nvcc_arch_flags out_variable) elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell") set(cuda_arch_bin "50") elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal") + if (NOT ${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0) + add_definitions("-DSUPPORTS_CUDA_FP16") + endif() set(cuda_arch_bin "60 61") elseif(${CUDA_ARCH_NAME} STREQUAL "Volta") if (NOT ${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0) @@ -188,6 +192,10 @@ elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) # CUDA 10.x set(paddle_known_gpu_archs ${paddle_known_gpu_archs10}) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__") +elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0) # CUDA 11.x + set(paddle_known_gpu_archs ${paddle_known_gpu_archs11}) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED") + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__") endif() add_definitions("-DPADDLE_CUDA_BINVER=\"${CUDA_VERSION_MAJOR}${CUDA_VERSION_MINOR}\"") diff --git a/cmake/external/cryptopp.cmake b/cmake/external/cryptopp.cmake index af5dd0e2c9b2d19929f58363d08e7ff40d43b013..351ef1c7c7aebb698a5d41689352a913d0b950e8 100644 --- a/cmake/external/cryptopp.cmake +++ b/cmake/external/cryptopp.cmake @@ -22,23 +22,8 @@ SET(CRYPTOPP_TAG CRYPTOPP_8_2_0) IF(WIN32) SET(CRYPTOPP_LIBRARIES "${CRYPTOPP_INSTALL_DIR}/lib/cryptopp-static.lib" CACHE FILEPATH "cryptopp library." 
FORCE) - SET(CRYPTOPP_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /MT") - set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /MTd") - set(CompilerFlags - CMAKE_CXX_FLAGS - CMAKE_CXX_FLAGS_DEBUG - CMAKE_CXX_FLAGS_RELEASE - CMAKE_C_FLAGS - CMAKE_C_FLAGS_DEBUG - CMAKE_C_FLAGS_RELEASE - ) - foreach(CompilerFlag ${CompilerFlags}) - string(REPLACE "/MD" "/MT" ${CompilerFlag} "${${CompilerFlag}}") - endforeach() ELSE(WIN32) SET(CRYPTOPP_LIBRARIES "${CRYPTOPP_INSTALL_DIR}/lib/libcryptopp.a" CACHE FILEPATH "cryptopp library." FORCE) - SET(CRYPTOPP_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) ENDIF(WIN32) set(CRYPTOPP_CMAKE_ARGS ${COMMON_CMAKE_ARGS} @@ -48,7 +33,7 @@ set(CRYPTOPP_CMAKE_ARGS ${COMMON_CMAKE_ARGS} -DCMAKE_INSTALL_LIBDIR=${CRYPTOPP_INSTALL_DIR}/lib -DCMAKE_INSTALL_PREFIX=${CRYPTOPP_INSTALL_DIR} -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - -DCMAKE_CXX_FLAGS=${CRYPTOPP_CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} diff --git a/cmake/external/dgc.cmake b/cmake/external/dgc.cmake index 8472a0743b91e69d823ed62f94b55045a31aaabc..bc8611f3862cd14c0de493564ea82a1c9ce66667 100644 --- a/cmake/external/dgc.cmake +++ b/cmake/external/dgc.cmake @@ -19,7 +19,7 @@ SET(DGC_SOURCES_DIR "${THIRD_PARTY_PATH}/dgc/src/extern_dgc") SET(DGC_INSTALL_DIR "${THIRD_PARTY_PATH}/install/dgc") SET(DGC_INCLUDE_DIR "${DGC_INSTALL_DIR}/include" CACHE PATH "dgc include directory." FORCE) SET(DGC_LIBRARIES "${DGC_INSTALL_DIR}/lib/libdgc.a" CACHE FILEPATH "dgc library." 
FORCE) -SET(DGC_URL "http://fleet.bj.bcebos.com/collective_ef2216a.tgz") +SET(DGC_URL "https://fleet.bj.bcebos.com/dgc/collective_f66ef73.tgz") INCLUDE_DIRECTORIES(${DGC_INCLUDE_DIR}) cache_third_party(extern_dgc @@ -30,7 +30,7 @@ ExternalProject_Add( extern_dgc ${EXTERNAL_PROJECT_LOG_ARGS} "${DGC_DOWNLOAD_CMD}" - URL_MD5 "2f67549fd5f1262383d83289abc4f88f" + URL_MD5 "94e6fa1bc97169d0e1aad44570fe3251" PREFIX "${DGC_PREFIX_DIR}" SOURCE_DIR "${DGC_SOURCES_DIR}" CONFIGURE_COMMAND "" diff --git a/cmake/external/lite.cmake b/cmake/external/lite.cmake index 8a655b2954dea5d6b864616ed2f4d19b167c4be8..3da550519bae2a12139873a2a84680debbaa8f4c 100644 --- a/cmake/external/lite.cmake +++ b/cmake/external/lite.cmake @@ -34,7 +34,7 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR) set(LITE_INSTALL_DIR ${THIRD_PARTY_PATH}/install/lite) if(NOT LITE_GIT_TAG) - set(LITE_GIT_TAG dfdfa6440c83bf0b415f9f5a9ff84842ce0bb0fa) + set(LITE_GIT_TAG 6d2b2a4028a58715b01887b04eb9bff8432eb184) endif() if(NOT CUDA_ARCH_NAME) diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index ae870b766fc3349ea53628e14c68ab9a5826213f..c0adda0da31ae1e7425ddfb352971444c09d5615 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -19,8 +19,8 @@ SET(MKLDNN_PREFIX_DIR ${THIRD_PARTY_PATH}/mkldnn) SET(MKLDNN_SOURCE_DIR ${THIRD_PARTY_PATH}/mkldnn/src/extern_mkldnn) SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn) SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." 
FORCE) -SET(MKLDNN_REPOSITORY https://github.com/intel/mkl-dnn.git) -SET(MKLDNN_TAG 1ea812f4f5aa1bd989372a23ab50d0f0f81ee677) +SET(MKLDNN_REPOSITORY https://github.com/oneapi-src/oneDNN.git) +SET(MKLDNN_TAG 64a48f9565aa72f6359917b3406328075a409939) # Introduce variables: # * CMAKE_INSTALL_LIBDIR diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index 0f6b1c182d5590354c8a970eea339a3e23846f39..ac6cf624e82c0a346fea42fa29fe9bab6ace8d47 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -18,7 +18,7 @@ SET(WARPCTC_PREFIX_DIR ${THIRD_PARTY_PATH}/warpctc) SET(WARPCTC_SOURCE_DIR ${THIRD_PARTY_PATH}/warpctc/src/extern_warpctc) SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc) set(WARPCTC_REPOSITORY https://github.com/baidu-research/warp-ctc.git) -set(WARPCTC_TAG bc29dcfff07ced1c7a19a4ecee48e5ad583cef8e) +set(WARPCTC_TAG fc7f226b93758216a03b1be9d24593a12819b984) SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include" CACHE PATH "Warp-ctc Directory" FORCE) diff --git a/cmake/flags.cmake b/cmake/flags.cmake index 9d07a0979d9392c9b2ab78562f8e0ceb8fc5d722..ed0bf8396b3faa22350811cf1711f5d1e5b89998 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -28,7 +28,15 @@ function(CheckCompilerCXX11Flag) endfunction() CheckCompilerCXX11Flag() -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") +if (WITH_GPU) + if (${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 11.0) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14") + else() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") + endif() +else() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") +endif() # safe_set_flag # # Set a compile flag only if compiler is support @@ -82,20 +90,6 @@ macro(safe_set_nvflag flag_name) endif() endmacro() -macro(safe_set_static_flag) # set c_flags and cxx_flags to static or shared - if (BUILD_SHARED_LIBS) - return() # if build shared libs, the flags keep same with '/MD' - endif(BUILD_SHARED_LIBS) - foreach(flag_var - 
CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE - CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO - CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE - CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO) - if(${flag_var} MATCHES "/MD") - string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") - endif(${flag_var} MATCHES "/MD") - endforeach(flag_var) -endmacro() CHECK_CXX_SYMBOL_EXISTS(UINT64_MAX "stdint.h" UINT64_MAX_EXISTS) if(NOT UINT64_MAX_EXISTS) @@ -221,20 +215,3 @@ endforeach() set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${SAFE_GPU_COMMON_FLAGS}") - -if(WIN32) - # windows build turn off warnings. - if(MSVC_STATIC_CRT) - safe_set_static_flag() - endif() - foreach(flag_var - CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE - CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO - CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE - CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO) - string(REGEX REPLACE "/W[1-4]" " /W0 " ${flag_var} "${${flag_var}}") - endforeach(flag_var) - foreach(flag_var CMAKE_CXX_FLAGS CMAKE_C_FLAGS) - set(${flag_var} "${${flag_var}} /w") - endforeach(flag_var) -endif() diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 1956e5c39ea2524d8a8e2650eb08f8d58f410b73..b0a6dfe29020781e57d57861137861366864abdb 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -386,7 +386,7 @@ function(cc_test_run TARGET_NAME) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true) # No unit test should exceed 2 minutes. 
if (APPLE OR WIN32) - set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600) + set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 150) else() set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 120) endif() @@ -748,7 +748,7 @@ function(py_test TARGET_NAME) endif() if (APPLE OR WIN32) - set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600) + set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 150) else() # No unit test should exceed 2 minutes in Linux. set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 120) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 20f27715e00457a8fe43f5c620e2a005387d7988..f19f0eb43d34bd0f3748d7beb1fcf403fa1c9037 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -13,18 +13,18 @@ # limitations under the License. # make package for paddle fluid shared and static library -set(FLUID_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_install_dir" CACHE STRING - "A path setting fluid shared and static libraries") +set(PADDLE_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_install_dir" CACHE STRING + "A path setting paddle shared and static libraries") -set(FLUID_INFERENCE_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_inference_install_dir" CACHE STRING - "A path setting fluid inference shared and static libraries") +set(PADDLE_INFERENCE_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_inference_install_dir" CACHE STRING + "A path setting paddle inference shared and static libraries") # TODO(zhaolong) # At present, the size of static lib in Windows exceeds the system limit, # so the generation of static lib is temporarily turned off. if(WIN32) #todo: remove the option - option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static." OFF) + option(WITH_STATIC_LIB "Compile demo with static/shared library, default use dynamic." 
OFF) if(NOT PYTHON_EXECUTABLE) FIND_PACKAGE(PythonInterp REQUIRED) endif() @@ -142,14 +142,14 @@ set(inference_lib_deps third_party paddle_fluid paddle_fluid_c paddle_fluid_shar add_custom_target(inference_lib_dist DEPENDS ${inference_lib_deps}) -set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/threadpool") +set(dst_dir "${PADDLE_INFERENCE_INSTALL_DIR}/third_party/threadpool") copy(inference_lib_dist SRCS ${THREADPOOL_INCLUDE_DIR}/ThreadPool.h DSTS ${dst_dir}) # Only GPU need cudaErrorMessage.pb IF(WITH_GPU) - set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/cudaerror/data") + set(dst_dir "${PADDLE_INFERENCE_INSTALL_DIR}/third_party/cudaerror/data") copy(inference_lib_dist SRCS ${cudaerror_INCLUDE_DIR} DSTS ${dst_dir}) @@ -158,65 +158,62 @@ ENDIF() # CMakeCache Info copy(inference_lib_dist SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt - DSTS ${FLUID_INFERENCE_INSTALL_DIR}) + DSTS ${PADDLE_INFERENCE_INSTALL_DIR}) -copy_part_of_thrid_party(inference_lib_dist ${FLUID_INFERENCE_INSTALL_DIR}) +copy_part_of_thrid_party(inference_lib_dist ${PADDLE_INFERENCE_INSTALL_DIR}) set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid") if(WIN32) if(WITH_STATIC_LIB) - set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/libpaddle_fluid.lib) + set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/libpaddle_fluid.lib + ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/paddle_fluid.*) else() set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/paddle_fluid.dll - ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/paddle_fluid.lib) + ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/paddle_fluid.lib) endif() + copy(inference_lib_dist + SRCS ${src_dir}/inference/api/paddle_*.h ${paddle_fluid_lib} + DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib + ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib) 
else(WIN32) set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*) -endif(WIN32) - -if(WIN32 AND NOT WITH_STATIC_LIB) - copy(inference_lib_dist - SRCS ${src_dir}/inference/api/paddle_*.h ${paddle_fluid_lib} - DSTS ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include ${FLUID_INFERENCE_INSTALL_DIR}/paddle/lib - ${FLUID_INFERENCE_INSTALL_DIR}/paddle/lib) -else() - copy(inference_lib_dist + copy(inference_lib_dist SRCS ${src_dir}/inference/api/paddle_*.h ${paddle_fluid_lib} - DSTS ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include ${FLUID_INFERENCE_INSTALL_DIR}/paddle/lib) -endif() + DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib) +endif(WIN32) copy(inference_lib_dist SRCS ${CMAKE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h - DSTS ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include/internal) + DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/internal) copy(inference_lib_dist SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/framework/io/crypto/cipher.h - DSTS ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include/crypto/) + DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/crypto/) include_directories(${CMAKE_BINARY_DIR}/../paddle/fluid/framework/io) # CAPI inference library for only inference -set(FLUID_INFERENCE_C_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_inference_c_install_dir" CACHE STRING -"A path setting CAPI fluid inference shared") -copy_part_of_thrid_party(inference_lib_dist ${FLUID_INFERENCE_C_INSTALL_DIR}) +set(PADDLE_INFERENCE_C_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_inference_c_install_dir" CACHE STRING +"A path setting CAPI paddle inference shared") +copy_part_of_thrid_party(inference_lib_dist ${PADDLE_INFERENCE_C_INSTALL_DIR}) set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid") set(paddle_fluid_c_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/capi/libpaddle_fluid_c.*) copy(inference_lib_dist SRCS ${src_dir}/inference/capi/paddle_c_api.h ${paddle_fluid_c_lib} - DSTS 
${FLUID_INFERENCE_C_INSTALL_DIR}/paddle/include ${FLUID_INFERENCE_C_INSTALL_DIR}/paddle/lib) + DSTS ${PADDLE_INFERENCE_C_INSTALL_DIR}/paddle/include ${PADDLE_INFERENCE_C_INSTALL_DIR}/paddle/lib) # fluid library for both train and inference set(fluid_lib_deps inference_lib_dist) add_custom_target(fluid_lib_dist ALL DEPENDS ${fluid_lib_deps}) -set(dst_dir "${FLUID_INSTALL_DIR}/paddle/fluid") +set(dst_dir "${PADDLE_INSTALL_DIR}/paddle/fluid") set(module "inference") -if(WIN32 AND NOT WITH_STATIC_LIB) +if(WIN32) copy(fluid_lib_dist SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/api/paddle_*.h ${paddle_fluid_lib} DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ) -else() + else() copy(fluid_lib_dist SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/api/paddle_*.h ${paddle_fluid_lib} DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} @@ -273,22 +270,22 @@ copy(fluid_lib_dist DSTS ${dst_dir}/${module} ) -set(dst_dir "${FLUID_INSTALL_DIR}/third_party/eigen3") +set(dst_dir "${PADDLE_INSTALL_DIR}/third_party/eigen3") copy(inference_lib_dist SRCS ${EIGEN_INCLUDE_DIR}/Eigen/Core ${EIGEN_INCLUDE_DIR}/Eigen/src ${EIGEN_INCLUDE_DIR}/unsupported/Eigen DSTS ${dst_dir}/Eigen ${dst_dir}/Eigen ${dst_dir}/unsupported) -set(dst_dir "${FLUID_INSTALL_DIR}/third_party/boost") +set(dst_dir "${PADDLE_INSTALL_DIR}/third_party/boost") copy(inference_lib_dist SRCS ${BOOST_INCLUDE_DIR}/boost DSTS ${dst_dir}) -set(dst_dir "${FLUID_INSTALL_DIR}/third_party/dlpack") +set(dst_dir "${PADDLE_INSTALL_DIR}/third_party/dlpack") copy(inference_lib_dist SRCS ${DLPACK_INCLUDE_DIR}/dlpack DSTS ${dst_dir}) -set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib") +set(dst_dir "${PADDLE_INSTALL_DIR}/third_party/install/zlib") copy(inference_lib_dist SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES} DSTS ${dst_dir} ${dst_dir}/lib) @@ -296,8 +293,8 @@ copy(inference_lib_dist # CMakeCache Info copy(fluid_lib_dist - SRCS 
${FLUID_INFERENCE_INSTALL_DIR}/third_party ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt - DSTS ${FLUID_INSTALL_DIR} ${FLUID_INSTALL_DIR} + SRCS ${PADDLE_INFERENCE_INSTALL_DIR}/third_party ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt + DSTS ${PADDLE_INSTALL_DIR} ${PADDLE_INSTALL_DIR} ) # paddle fluid version @@ -323,6 +320,6 @@ function(version version_file) endif() endfunction() -version(${FLUID_INSTALL_DIR}/version.txt) -version(${FLUID_INFERENCE_INSTALL_DIR}/version.txt) -version(${FLUID_INFERENCE_C_INSTALL_DIR}/version.txt) +version(${PADDLE_INSTALL_DIR}/version.txt) +version(${PADDLE_INFERENCE_INSTALL_DIR}/version.txt) +version(${PADDLE_INFERENCE_C_INSTALL_DIR}/version.txt) diff --git a/cmake/operators.cmake b/cmake/operators.cmake index f60a6dc3f0c89dd345b04ea3a1e213de770e5760..21080fbe8fd2e14cf7fd805e01948f2f28535c22 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -127,7 +127,8 @@ function(op_library TARGET) "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op" "fusion_transpose_flatten_concat_op" "fusion_conv_inception_op" "sync_batch_norm_op" "dgc_op" "fused_fc_elementwise_layernorm_op" -"multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op") +"multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op" +"fused_bn_add_activation_op") if ("${TARGET}" STREQUAL "${manual_pybind_op}") set(pybind_flag 1) endif() @@ -138,12 +139,17 @@ function(op_library TARGET) # And for detail pybind information, please see generated paddle/pybind/pybind.h. 
file(READ ${TARGET}.cc TARGET_CONTENT) string(REGEX MATCH "REGISTER_OPERATOR\\(.*REGISTER_OPERATOR\\(" multi_register "${TARGET_CONTENT}") - string(REGEX MATCH "REGISTER_OPERATOR\\([a-z0-9_]*," one_register "${multi_register}") + # [ \t\r\n]* is used for blank characters + string(REGEX MATCH "REGISTER_OPERATOR\\([ \t\r\n]*[a-z0-9_]*," one_register "${multi_register}") + if (one_register STREQUAL "") string(REPLACE "_op" "" TARGET "${TARGET}") else () string(REPLACE "REGISTER_OPERATOR(" "" TARGET "${one_register}") string(REPLACE "," "" TARGET "${TARGET}") + # [ \t\r\n]+ is used for blank characters. + # Here we use '+' instead of '*' since it is a REPLACE operation. + string(REGEX REPLACE "[ \t\r\n]+" "" TARGET "${TARGET}") endif() # pybind USE_NO_KERNEL_OP diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index c9442e8f843ac152cac02908799a8d24f5951e58..9edfcb967abc26a25a94d368298c1c475295019f 100644 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -243,9 +243,10 @@ IF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC)) ENDIF() if(WITH_GPU) - include(external/cub) # download cub - list(APPEND third_party_deps extern_cub) - + if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) + include(external/cub) # download cub + list(APPEND third_party_deps extern_cub) + endif() set(CUDAERROR_URL "http://paddlepaddledeps.bj.bcebos.com/cudaErrorMessage.tar.gz" CACHE STRING "" FORCE) file_download_and_uncompress(${CUDAERROR_URL} "cudaerror") # download file cudaErrorMessage endif(WITH_GPU) diff --git a/go/README_cn.md b/go/README_cn.md index 57af05ce0af59360f02b919b376d1e8a8843a531..8ffc31adf85a638c4f4a4aa0bee6d3b7f09ef7fb 100644 --- a/go/README_cn.md +++ b/go/README_cn.md @@ -1,7 +1,7 @@ # Paddle 预测golang API ## 安装 -首先cmake编译时打开`-DON_INFER=ON`,在编译目录下得到``fluid_inference_c_install_dir``,将该目录移动到当前目录中并重命名为`paddle_c` +首先cmake编译时打开`-DON_INFER=ON`,在编译目录下得到``paddle_inference_c_install_dir``,将该目录移动到当前目录中并重命名为`paddle_c` ## 在Go中使用Paddle预测 首先创建预测配置 diff --git 
a/go/paddle/config.go b/go/paddle/config.go index cea69e716bffada9e5565eacf8ac1af84ae5b930..c4f39fa9c5d627a689c064bbbd2178cd1ae1a929 100644 --- a/go/paddle/config.go +++ b/go/paddle/config.go @@ -154,10 +154,17 @@ func (config *AnalysisConfig) EnableMkldnnQuantizer() { C.PD_EnableMkldnnQuantizer(config.c) } +func (config *AnalysisConfig) EnableMkldnnBfloat16() { + C.PD_EnableMkldnnBfloat16(config.c) +} + func (config *AnalysisConfig) MkldnnQuantizerEnabled() bool { return ConvertCBooleanToGo(C.PD_MkldnnQuantizerEnabled(config.c)) } +func (config *AnalysisConfig) MkldnnBfloat16Enabled() bool { + return ConvertCBooleanToGo(C.PD_MkldnnBfloat16Enabled(config.c)) +} // SetModelBuffer // ModelFromMemory diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 9d5c0cc7048f7db539c090d28c6184ac6d72d75a..bb5e2e1369a8478b500572106f9d11dff12e0189 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -272,7 +272,7 @@ cc_test(op_compatible_info_test SRCS op_compatible_info_test.cc DEPS op_compatib cc_library(save_load_util SRCS save_load_util DEPS tensor scope layer) cc_test(save_load_util_test SRCS save_load_util_test.cc DEPS save_load_util tensor scope layer) -cc_library(generator SRCS generator.cc) +cc_library(generator SRCS generator.cc DEPS enforce place) # Get the current working branch execute_process( diff --git a/paddle/fluid/framework/c/c_api.cc b/paddle/fluid/framework/c/c_api.cc index ab987fb56686594f505e63b6664c2176e5a4ad89..0dd2768ccb9ffa1dc7b85dca500095f8c10479c3 100644 --- a/paddle/fluid/framework/c/c_api.cc +++ b/paddle/fluid/framework/c/c_api.cc @@ -49,7 +49,8 @@ std::vector PD_GetGradOpDescStrs( for (size_t i = 0; i < op_num; ++i) { PADDLE_ENFORCE_EQ( grad_op_descs[i]->Proto()->SerializePartialToString(&ret[i]), true, - "Cannot serialize message."); + paddle::platform::errors::Unavailable( + "Cannot serialize operator desc message.")); } } return ret; diff --git 
a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc index 96d54ec86917432837d61f681ece91da2ddcab10..aec27bd9d91e5afb6bf11037e60ff213162ad97f 100644 --- a/paddle/fluid/framework/data_feed.cc +++ b/paddle/fluid/framework/data_feed.cc @@ -527,6 +527,8 @@ bool MultiSlotDataFeed::CheckFile(const char* filename) { VLOG(0) << "error: the number of ids is a negative number: " << num; VLOG(0) << "please check line<" << instance_cout << "> in file<" << filename << ">"; + VLOG(0) << "Error occured when parsing " << i + << " th slot with total slots number: " << all_slots_.size(); return false; } else if (num == 0) { VLOG(0) @@ -536,42 +538,66 @@ bool MultiSlotDataFeed::CheckFile(const char* filename) { "characters."; VLOG(0) << "please check line<" << instance_cout << "> in file<" << filename << ">"; + VLOG(0) << "Error occured when parsing " << i + << " th slot with total slots number: " << all_slots_.size(); return false; } else if (errno == ERANGE || num > INT_MAX) { VLOG(0) << "error: the number of ids greater than INT_MAX"; VLOG(0) << "please check line<" << instance_cout << "> in file<" << filename << ">"; + VLOG(0) << "Error occured when parsing " << i + << " th slot with total slots number: " << all_slots_.size(); return false; } if (all_slots_type_[i] == "float") { - for (int i = 0; i < num; ++i) { + for (int j = 0; j < num; ++j) { strtof(endptr, &endptr); if (errno == ERANGE) { VLOG(0) << "error: the value is out of the range of " "representable values for float"; VLOG(0) << "please check line<" << instance_cout << "> in file<" << filename << ">"; + VLOG(0) << "Error occured when parsing " << i + << " th slot with total slots number: " + << all_slots_.size(); + VLOG(0) << "and in this slot: " << j + << " th id with total id number: " << num; return false; } - if (i + 1 != num && endptr - str == len) { + if (j + 1 != num && endptr - str == len) { VLOG(0) << "error: there is a wrong with the number of ids."; + VLOG(0) << "Error occured when 
parsing " << i + << " th slot with total slots number: " + << all_slots_.size(); + VLOG(0) << "and in this slot: " << j + << " th id with total id number: " << num; VLOG(0) << "please check line<" << instance_cout << "> in file<" << filename << ">"; return false; } } } else if (all_slots_type_[i] == "uint64") { - for (int i = 0; i < num; ++i) { + for (int j = 0; j < num; ++j) { strtoull(endptr, &endptr, 10); if (errno == ERANGE) { VLOG(0) << "error: the value is out of the range of " "representable values for uint64_t"; + VLOG(0) << "Error occured when parsing " << i + << " th slot with total slots number: " + << all_slots_.size(); + VLOG(0) << "and in this slot: " << j + << " th id with total id number: " << num; VLOG(0) << "please check line<" << instance_cout << "> in file<" << filename << ">"; return false; } - if (i + 1 != num && endptr - str == len) { + if (j + 1 != num && endptr - str == len) { VLOG(0) << "error: there is a wrong with the number of ids."; + VLOG(0) << "Error occured when parsing " << i + << " th slot with total slots number: " + << all_slots_.size(); + VLOG(0) << "and in this slot: " << j + << " th id with total id number: " << num; VLOG(0) << "please check line<" << instance_cout << "> in file<" << filename << ">"; return false; @@ -632,8 +658,13 @@ bool MultiSlotDataFeed::ParseOneInstanceFromPipe( "The number of ids can not be zero, you need padding " "it in data generator; or if there is something wrong with " "the data, please check if the data contains unresolvable " - "characters.\nplease check this error line: %s", - str)); + "characters.\nplease check this error line: %s, \n Specifically, " + "something wrong happened(the length of this slot's feasign is 0)" + "when we parse the %d th slots." 
+ "Maybe something wrong around this slot", + "\nWe detect the feasign number of this slot is %d, " + "which is illegal.", + str, i, num)); if (idx != -1) { (*instance)[idx].Init(all_slots_type_[i]); if ((*instance)[idx].GetType()[0] == 'f') { // float @@ -683,8 +714,13 @@ bool MultiSlotDataFeed::ParseOneInstance(std::vector* instance) { "The number of ids can not be zero, you need padding " "it in data generator; or if there is something wrong with " "the data, please check if the data contains unresolvable " - "characters.\nplease check this error line: %s.", - str)); + "characters.\nplease check this error line: %s, \n Specifically, " + "something wrong happened(the length of this slot's feasign is 0)" + "when we parse the %d th slots." + "Maybe something wrong around this slot", + "\nWe detect the feasign number of this slot is %d, " + "which is illegal.", + str, i, num)); if (idx != -1) { (*instance)[idx].Init(all_slots_type_[i]); @@ -916,8 +952,13 @@ bool MultiSlotInMemoryDataFeed::ParseOneInstanceFromPipe(Record* instance) { "The number of ids can not be zero, you need padding " "it in data generator; or if there is something wrong with " "the data, please check if the data contains unresolvable " - "characters.\nplease check this error line: %s.", - str)); + "characters.\nplease check this error line: %s, \n Specifically, " + "something wrong happened(the length of this slot's feasign is 0)" + "when we parse the %d th slots." 
+ "Maybe something wrong around this slot", + "\nWe detect the feasign number of this slot is %d, " + "which is illegal.", + str, i, num)); if (idx != -1) { if (all_slots_type_[i][0] == 'f') { // float for (int j = 0; j < num; ++j) { @@ -982,8 +1023,13 @@ bool MultiSlotInMemoryDataFeed::ParseOneInstance(Record* instance) { "The number of ids can not be zero, you need padding " "it in data generator; or if there is something wrong with " "the data, please check if the data contains unresolvable " - "characters.\nplease check this error line: %s.", - str)); + "characters.\nplease check this error line: %s, \n Specifically, " + "something wrong happened(the length of this slot's feasign is 0)" + "when we parse the %d th slots." + "Maybe something wrong around this slot", + "\nWe detect the feasign number of this slot is %d, " + "which is illegal.", + str, i, num)); if (idx != -1) { if (all_slots_type_[i][0] == 'f') { // float diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc index 3cea7a66d01051824a1de01d62c237636771804b..f757e244e38ec965d62d673e63ed082ca70c63c7 100644 --- a/paddle/fluid/framework/data_layout_transform.cc +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -116,6 +116,8 @@ void* GetDataFromTensor(const Tensor& tensor, mkldnn::memory::data_type type) { return platform::to_void_cast(tensor.data()); case mkldnn::memory::data_type::s32: return platform::to_void_cast(tensor.data()); + case mkldnn::memory::data_type::bf16: + return platform::to_void_cast(tensor.data()); default: PADDLE_THROW( platform::errors::InvalidArgument("Wrong mkldnn type provided.")); diff --git a/paddle/fluid/framework/data_layout_transform.h b/paddle/fluid/framework/data_layout_transform.h index 6eb84ef9d7c01b589cc95a78ea9727a81f6dc36e..b92c47c2eb018603e1b3156921fb2c1702864c57 100644 --- a/paddle/fluid/framework/data_layout_transform.h +++ b/paddle/fluid/framework/data_layout_transform.h @@ -61,7 +61,8 @@ inline 
MKLDNNDataType ToMKLDNNDataType(proto::VarType::Type type) { {DataTypeTrait::DataType(), MKLDNNDataType::f32}, {DataTypeTrait::DataType(), MKLDNNDataType::s8}, {DataTypeTrait::DataType(), MKLDNNDataType::u8}, - {DataTypeTrait::DataType(), MKLDNNDataType::s32}}; + {DataTypeTrait::DataType(), MKLDNNDataType::s32}, + {DataTypeTrait::DataType(), MKLDNNDataType::bf16}}; auto iter = dict.find(static_cast(type)); if (iter != dict.end()) return iter->second; return MKLDNNDataType::undef; @@ -74,6 +75,9 @@ void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, DataLayout out_layout, void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, const OpKernelType& expected_kernel_type, const Tensor& in, Tensor* out); + +void* GetDataFromTensor(const Tensor& tensor, MKLDNNDataType type); + #endif std::vector GetAxis(const DataLayout& from, const DataLayout& to); diff --git a/paddle/fluid/framework/data_layout_transform_test.cc b/paddle/fluid/framework/data_layout_transform_test.cc index a0d08826b854fea9256382f0e065fd59dda8c8b3..8dfad23db65178c46140b887811846e413bebd00 100644 --- a/paddle/fluid/framework/data_layout_transform_test.cc +++ b/paddle/fluid/framework/data_layout_transform_test.cc @@ -43,3 +43,17 @@ TEST(DataTransform, DataLayoutFunction) { EXPECT_TRUE(in.layout() == paddle::framework::DataLayout::kNHWC); EXPECT_TRUE(in.dims() == paddle::framework::make_ddim({2, 3, 1, 2})); } + +#ifdef PADDLE_WITH_MKLDNN +TEST(DataTransform, GetDataFromTensorDNNL) { + auto place = paddle::platform::CPUPlace(); + paddle::framework::Tensor in = paddle::framework::Tensor(); + in.mutable_data( + paddle::framework::make_ddim({2, 3, 1, 2}), place); + + void* in_data = + paddle::framework::GetDataFromTensor(in, dnnl::memory::data_type::bf16); + EXPECT_EQ(in_data, paddle::platform::to_void_cast( + in.data())); +} +#endif diff --git a/paddle/fluid/framework/data_type.cc b/paddle/fluid/framework/data_type.cc index 
f479d92483c1c39a0b43e0d8c514237bf89bcc00..8188d5cde1b90436d040e8b9dcc1070ac85bf319 100644 --- a/paddle/fluid/framework/data_type.cc +++ b/paddle/fluid/framework/data_type.cc @@ -18,6 +18,7 @@ #include using float16 = paddle::platform::float16; +using bfloat16 = paddle::platform::bfloat16; namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/data_type.h b/paddle/fluid/framework/data_type.h index 2c4a7b4d02727437742b19cc6d51e209e4346d03..720e422e114835f367317d4ba265254856885c15 100644 --- a/paddle/fluid/framework/data_type.h +++ b/paddle/fluid/framework/data_type.h @@ -17,6 +17,8 @@ limitations under the License. */ #include #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/platform/enforce.h" + +#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/float16.h" namespace paddle { @@ -36,15 +38,16 @@ struct DataTypeTrait { #define _ForEachDataTypeHelper_(callback, cpp_type, proto_type) \ callback(cpp_type, ::paddle::framework::proto::VarType::proto_type); -#define _ForEachDataType_(callback) \ - _ForEachDataTypeHelper_(callback, float, FP32); \ - _ForEachDataTypeHelper_(callback, ::paddle::platform::float16, FP16); \ - _ForEachDataTypeHelper_(callback, double, FP64); \ - _ForEachDataTypeHelper_(callback, int, INT32); \ - _ForEachDataTypeHelper_(callback, int64_t, INT64); \ - _ForEachDataTypeHelper_(callback, bool, BOOL); \ - _ForEachDataTypeHelper_(callback, uint8_t, UINT8); \ - _ForEachDataTypeHelper_(callback, int16_t, INT16); \ +#define _ForEachDataType_(callback) \ + _ForEachDataTypeHelper_(callback, float, FP32); \ + _ForEachDataTypeHelper_(callback, ::paddle::platform::float16, FP16); \ + _ForEachDataTypeHelper_(callback, ::paddle::platform::bfloat16, BF16); \ + _ForEachDataTypeHelper_(callback, double, FP64); \ + _ForEachDataTypeHelper_(callback, int, INT32); \ + _ForEachDataTypeHelper_(callback, int64_t, INT64); \ + _ForEachDataTypeHelper_(callback, bool, BOOL); \ + 
_ForEachDataTypeHelper_(callback, uint8_t, UINT8); \ + _ForEachDataTypeHelper_(callback, int16_t, INT16); \ _ForEachDataTypeHelper_(callback, int8_t, INT8) #define _ForEachDataTypeSmall_(callback) \ diff --git a/paddle/fluid/framework/data_type_test.cc b/paddle/fluid/framework/data_type_test.cc index 2a380201f297f42dd82a6809bef9a72660066819..331596da33acc151810cd616ea6d5bdcae333b30 100644 --- a/paddle/fluid/framework/data_type_test.cc +++ b/paddle/fluid/framework/data_type_test.cc @@ -38,3 +38,25 @@ TEST(DataType, float16) { std::string type = "::paddle::platform::float16"; EXPECT_STREQ(f::DataTypeToString(dtype).c_str(), type.c_str()); } + +TEST(DataType, bfloat16) { + using paddle::framework::Tensor; + using paddle::platform::CPUPlace; + using paddle::platform::bfloat16; + namespace f = paddle::framework; + f::proto::VarType::Type dtype = f::proto::VarType::BF16; + + Tensor tensor; + CPUPlace cpu; + tensor.mutable_data(cpu, dtype); + + // test bf16 tensor + EXPECT_EQ(tensor.type(), f::ToDataType(typeid(bfloat16))); + + // test bf16 size + EXPECT_EQ(f::SizeOfType(dtype), 2u); + + // test debug info + std::string type = "::paddle::platform::bfloat16"; + EXPECT_STREQ(f::DataTypeToString(dtype).c_str(), type.c_str()); +} diff --git a/paddle/fluid/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc index 44542f05d9d5c92f58a84dc2be59782bae2ff3aa..3d56152c237695126d2eecb0c51ebd964a85a690 100644 --- a/paddle/fluid/framework/data_type_transform.cc +++ b/paddle/fluid/framework/data_type_transform.cc @@ -77,6 +77,10 @@ void TransDataType(const OpKernelType& kernel_type_for_var, framework::VisitDataType(dst_type, CastDataType(in, out, ctx)); break; + case proto::VarType::BF16: + framework::VisitDataType(dst_type, + CastDataType(in, out, ctx)); + break; case proto::VarType::FP32: framework::VisitDataType(dst_type, CastDataType(in, out, ctx)); break; diff --git a/paddle/fluid/framework/data_type_transform_test.cc 
b/paddle/fluid/framework/data_type_transform_test.cc index bbebea9f13fd37469a0e9b7be9719aca128f5687..ea7a665bcbe02ff382f1b3bf04ce177a674483c9 100644 --- a/paddle/fluid/framework/data_type_transform_test.cc +++ b/paddle/fluid/framework/data_type_transform_test.cc @@ -24,6 +24,11 @@ TEST(DataTypeTransform, CPUTransform) { paddle::framework::DataLayout::kAnyLayout, paddle::framework::LibraryType::kPlain); + auto kernel_bf16 = paddle::framework::OpKernelType( + paddle::framework::proto::VarType::BF16, place, + paddle::framework::DataLayout::kAnyLayout, + paddle::framework::LibraryType::kPlain); + auto kernel_fp32 = paddle::framework::OpKernelType( paddle::framework::proto::VarType::FP32, place, paddle::framework::DataLayout::kAnyLayout, @@ -189,4 +194,120 @@ TEST(DataTypeTransform, CPUTransform) { static_cast(in_data_bool[i]).x); } } + + // data type transform from/to bfloat16 + { + paddle::framework::Tensor in; + paddle::framework::Tensor out; + + paddle::platform::bfloat16* ptr = + in.mutable_data( + paddle::framework::make_ddim({2, 3}), place); + int data_number = 2 * 3; + + for (int i = 0; i < data_number; ++i) { + ptr[i] = i; + } + + // transform from bfloat16 to other data types + paddle::framework::TransDataType(kernel_bf16, kernel_fp32, in, &out); + float* out_data_float = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_float[i], static_cast(ptr[i])); + } + + paddle::framework::TransDataType(kernel_bf16, kernel_fp64, in, &out); + double* out_data_double = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_double[i], static_cast(ptr[i])); + } + + paddle::framework::TransDataType(kernel_bf16, kernel_int32, in, &out); + int* out_data_int = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_int[i], static_cast(ptr[i])); + } + + paddle::framework::TransDataType(kernel_bf16, kernel_int64, in, &out); + int64_t* out_data_int64 = out.data(); + for (int i = 0; i < data_number; ++i) { + 
EXPECT_EQ(out_data_int64[i], static_cast(ptr[i])); + } + + paddle::framework::TransDataType(kernel_bf16, kernel_bool, in, &out); + bool* out_data_bool = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_bool[i], static_cast(ptr[i])); + } + + // transform float to bfloat16 + float* in_data_float = + in.mutable_data(paddle::framework::make_ddim({2, 3}), place); + for (int i = 0; i < data_number; ++i) { + in_data_float[i] = i; + } + + paddle::framework::TransDataType(kernel_fp32, kernel_bf16, in, &out); + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(ptr[i].x, + static_cast(in_data_float[i]).x); + } + + // transform double to bfloat16 + double* in_data_double = + in.mutable_data(paddle::framework::make_ddim({2, 3}), place); + for (int i = 0; i < data_number; ++i) { + in_data_double[i] = i; + } + + paddle::framework::TransDataType(kernel_fp64, kernel_bf16, in, &out); + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(ptr[i].x, + static_cast(in_data_double[i]).x); + } + + // transform int to bfloat16 + int* in_data_int = + in.mutable_data(paddle::framework::make_ddim({2, 3}), place); + for (int i = 0; i < data_number; ++i) { + in_data_int[i] = i; + } + + paddle::framework::TransDataType(kernel_int32, kernel_bf16, in, &out); + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(ptr[i].x, + static_cast(in_data_int[i]).x); + } + + // transform int64 to bfloat16 + int64_t* in_data_int64 = + in.mutable_data(paddle::framework::make_ddim({2, 3}), place); + for (int i = 0; i < data_number; ++i) { + in_data_int64[i] = i; + } + + paddle::framework::TransDataType(kernel_int64, kernel_bf16, in, &out); + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(ptr[i].x, + static_cast(in_data_int64[i]).x); + } + + // transform bool to bfloat16 + bool* in_data_bool = + in.mutable_data(paddle::framework::make_ddim({2, 3}), place); + for (int i = 0; i < data_number; ++i) { + 
in_data_bool[i] = i; + } + + paddle::framework::TransDataType(kernel_bool, kernel_bf16, in, &out); + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(ptr[i].x, + static_cast(in_data_bool[i]).x); + } + } } diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index a3cc4d1721e20a72817606bd773129230a8154ce..8281ec2143890aa2bb886347ccc0eff8145c67f3 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -74,6 +74,7 @@ set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto eager_deletion_pass buffer_shared_inplace_op_pass buffer_shared_cross_op_memory_reuse_pass + inplace_addto_op_pass set_reader_device_info_utils add_reader_dependency_pass) cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS}) diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index 7fc08f3e0f20f243425b351b43c124d4519753f6..939a2fc8fc9c73472ff5c25633610fa70c7cec6d 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -12,7 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. 
#include "paddle/fluid/framework/details/all_reduce_op_handle.h" + #include + #include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/reduce_and_gather.h" #include "paddle/fluid/framework/details/variable_visitor.h" @@ -34,14 +36,24 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, const std::vector &places, const platform::NCCLCommunicator *ctxs) : NCCLOpHandleBase(node, places, ctxs), local_scopes_(local_scopes) { - PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); + PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size(), + platform::errors::InvalidArgument( + "The number of places and the number of local scopes " + "should be equal, but got number of places is %d and " + "number of local scopes is %d.", + places_.size(), local_scopes_.size())); } #else AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places) : OpHandleBase(node), local_scopes_(local_scopes), places_(places) { - PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); + PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size(), + platform::errors::InvalidArgument( + "The number of places and the number of local scopes " + "should be equal, but got number of places is %d and " + "number of local scopes is %d.", + places_.size(), local_scopes_.size())); } #endif @@ -60,13 +72,25 @@ void AllReduceOpHandle::AllReduceImpl( const std::vector &in_var_handles, const std::vector &out_var_handles) { size_t num_places = places_.size(); - PADDLE_ENFORCE_EQ( - in_var_handles.size(), num_places, - "The NoDummyInputSize should be equal to the number of places."); + PADDLE_ENFORCE_EQ(in_var_handles.size(), num_places, + platform::errors::InvalidArgument( + "The NoDummyInputSize should be equal " + "to the number of places, but got NoDummyInputSize is " + "%d and the number of place is %d.", + in_var_handles.size(), num_places)); PADDLE_ENFORCE_EQ( in_var_handles.size(), out_var_handles.size(), - "The 
NoDummyInputSize and NoDummyOutputSize should be equal."); - PADDLE_ENFORCE_EQ(local_exec_scopes_.size(), num_places); + platform::errors::InvalidArgument( + "The NoDummyInputSize and NoDummyOutputSize should be " + "equal, but got NoDummyInputSize is %d and NoDummyOutputSize is %d.", + in_var_handles.size(), out_var_handles.size())); + PADDLE_ENFORCE_EQ( + local_exec_scopes_.size(), num_places, + platform::errors::InvalidArgument( + "The number of local scopes should be equal " + "to the number of places, but got the number of local scopes is " + "%d and the number of place is %d.", + in_var_handles.size(), num_places)); std::vector lod_tensor_data; std::vector places; @@ -78,23 +102,36 @@ void AllReduceOpHandle::AllReduceImpl( for (size_t i = 0; i < local_exec_scopes_.size(); ++i) { auto &local_scope = local_exec_scopes_[i]; auto var = local_scope->FindVar(in_var_handles[i]->name()); - PADDLE_ENFORCE_NOT_NULL(var, "%s is not found int scope.", - in_var_handles[i]->name()); + PADDLE_ENFORCE_NOT_NULL(var, platform::errors::NotFound( + "Variable %s is not found in local scope.", + in_var_handles[i]->name())); auto &lod_tensor = var->Get(); if (i == 0) { numel = static_cast(lod_tensor.numel()); // only enforce place0, we will enforce other palce numel == place0 numel PADDLE_ENFORCE_GT( - numel, 0, platform::errors::InvalidArgument( - "The numel of tensos=[%s] must > 0. 
But now numel=[%d]", - in_var_handles[i]->name(), numel)); + numel, 0, + platform::errors::PreconditionNotMet( + "The numel of tensor %s should be > 0, but got numel is %d.", + in_var_handles[i]->name(), numel)); dtype = lod_tensor.type(); is_gpu_place = platform::is_gpu_place(lod_tensor.place()); } - PADDLE_ENFORCE_EQ(numel, static_cast(lod_tensor.numel())); - PADDLE_ENFORCE_EQ(dtype, lod_tensor.type()); - PADDLE_ENFORCE_EQ(is_gpu_place, platform::is_gpu_place(lod_tensor.place())); + PADDLE_ENFORCE_EQ( + numel, static_cast(lod_tensor.numel()), + platform::errors::PreconditionNotMet( + "The size of tensors of the same variable in different local " + "scopes should be equal.")); + PADDLE_ENFORCE_EQ( + dtype, lod_tensor.type(), + platform::errors::PreconditionNotMet( + "The dtype of tensors of the same variable in different local " + "scopes should be equal.")); + PADDLE_ENFORCE_EQ(is_gpu_place, platform::is_gpu_place(lod_tensor.place()), + platform::errors::PreconditionNotMet( + "The place type of tensors of the same variable " + "in different local scopes should be equal.")); lod_tensor_data.emplace_back(lod_tensor.data()); places.emplace_back(lod_tensor.place()); @@ -102,8 +139,12 @@ void AllReduceOpHandle::AllReduceImpl( VLOG(10) << "place:" << i << ", input_name:" << in_var_handles[i]->name() << ", out_name:" << out_var_handles[i]->name(); - PADDLE_ENFORCE_EQ(in_var_handles[i]->name(), out_var_handles[i]->name(), - "The name of input and output should be equal."); + PADDLE_ENFORCE_EQ( + in_var_handles[i]->name(), out_var_handles[i]->name(), + platform::errors::InvalidArgument( + "The name of input and output of all_reduce op should be equal, " + "but got input is %s and output is %s.", + in_var_handles[i]->name(), out_var_handles[i]->name())); } std::vector grad_var_names; @@ -122,7 +163,9 @@ void AllReduceOpHandle::AllReduceFunc( const std::vector &out_var_names) { if (is_gpu_place(places[0])) { #if defined(PADDLE_WITH_NCCL) - 
PADDLE_ENFORCE_NOT_NULL(nccl_ctxs_, "nccl_ctxs should not be nullptr."); + PADDLE_ENFORCE_NOT_NULL(nccl_ctxs_, + platform::errors::InvalidArgument( + "The nccl context should not be NULL.")); ncclDataType_t nccl_dtype = platform::ToNCCLDataType(dtype); std::vector> all_reduce_calls; for (size_t i = 0; i < local_exec_scopes_.size(); ++i) { @@ -134,7 +177,8 @@ void AllReduceOpHandle::AllReduceFunc( } NCCLAllReduceFunc(all_reduce_calls); #else - PADDLE_THROW("Not compiled with CUDA."); + PADDLE_THROW( + platform::errors::PreconditionNotMet("Not compiled with CUDA.")); #endif } else { // Special handle CPU only Operator's gradient. Like CRF auto &trg = *local_exec_scopes_[0] diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index d42bd0b16d7a84987517326af9567809fd29da4d..12c0d6749029c657a829e8d2b04a2113fbe8946a 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -89,8 +89,19 @@ AsyncSSAGraphExecutor::AsyncSSAGraphExecutor( places_(std::move(places)), graphs_(std::move(graphs)) { VLOG(3) << "build AsyncSSAGraphExecutor"; - PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); - PADDLE_ENFORCE_EQ(local_scopes_.size(), local_exec_scopes_.size()); + PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size(), + platform::errors::InvalidArgument( + "The number of places and the number of local scopes " + "should be equal, but got number of places is %d and " + "number of local scopes is %d.", + places_.size(), local_scopes_.size())); + PADDLE_ENFORCE_EQ( + local_scopes_.size(), local_exec_scopes_.size(), + platform::errors::InvalidArgument( + "The number of local scopes and the number of local execution scopes " + "should be equal, but got number of local scopes is %d and " + "number of local execution scopes is %d.", + local_scopes_.size(), local_exec_scopes_.size())); // set the correct size of thread pool 
to each device. strategy_.num_threads_ = strategy_.num_threads_ < places_.size() diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 5388df6bc504203abb57237f2d23a324367ce087..01d496d4ea7f7f0d0347b552e13d988fdc68e0c7 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -19,6 +19,7 @@ #include #include #include + #include "boost/optional.hpp" #include "paddle/fluid/framework/ir/pass_builder.h" #include "paddle/fluid/framework/program_desc.h" @@ -119,6 +120,9 @@ struct BuildStrategy { // Turn on inplace by default. bool enable_inplace_{true}; + // Turn off inplace addto by default. + bool enable_addto_{false}; + // FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode, // num_trainers is 1, so the current fields of build_strategy doesn't tell if // it's distributed model. diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc index e440dff2af6b5649d34f47c3b696edeb8a1ba0a2..7f1d3c9b340c9ee92c45c038bf42cf409d535158 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" + #include #include #include #include #include #include + #include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/fetch_async_op_handle.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" @@ -48,7 +50,9 @@ FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor( bootstrap_ops_.emplace_back(op); } } - PADDLE_ENFORCE_GT(op_deps_.size(), 0, "The graph doesn't have operators."); + PADDLE_ENFORCE_GT(op_deps_.size(), 0, + platform::errors::PreconditionNotMet( + "The graph doesn't have operators.")); PrepareAtomicOpDeps(); } diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc index ae69960ef78c3e35143c66226133bd0dceac8b79..aedb8db46a5d9c90f176588d1dfd206e0abaf616 100644 --- a/paddle/fluid/framework/details/fetch_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_op_handle.cc @@ -13,9 +13,11 @@ // limitations under the License. 
#include "paddle/fluid/framework/details/fetch_op_handle.h" + #include #include #include + #include "paddle/fluid/platform/profiler.h" namespace paddle { @@ -138,8 +140,10 @@ void FetchOpHandle::RunImpl() { auto *var_handle = static_cast(inputs_[i]); auto &scope = scopes.at(var_handle->scope_idx()); auto *var = scope->FindVar(var_handle->name()); - PADDLE_ENFORCE_NOT_NULL(var, "Cannot find variable %s in execution scope", - var_handle->name()); + PADDLE_ENFORCE_NOT_NULL( + var, + platform::errors::NotFound( + "Cannot find variable %s in execution scope.", var_handle->name())); if (var->IsType()) { auto &t = var->Get(); diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc index 956b099e883f9ea6d96db8716cb0fa693a3796d4..0ad84f5890acaf1c793000859ed3fbc7c1fc22d3 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc @@ -167,6 +167,8 @@ static void PrintNanInf(const T* value, const size_t numel, int print_num, // more detail see: 180 page of // https://www.openmp.org/wp-content/uploads/OpenMP4.0.0.pdf #pragma omp declare reduction(+ : paddle::platform::float16 : omp_out += omp_in) +#pragma omp declare reduction(+ : paddle::platform::bfloat16 : omp_out += \ + omp_in) #endif template @@ -205,6 +207,21 @@ void CheckNanInf( PrintNanInf(value, numel, print_num, op_type, var_name); } } + +template <> +void CheckNanInf( + const paddle::platform::bfloat16* value, const size_t numel, int print_num, + const std::string& op_type, const std::string& var_name) { + float sum = 0.0f; +#pragma omp parallel for reduction(+ : sum) + for (size_t i = 0; i < numel; ++i) { + sum += static_cast(value[i] - value[i]); + } + + if (std::isnan(sum) || std::isinf(sum)) { + PrintNanInf(value, numel, print_num, op_type, var_name); + } +} #endif template <> diff --git a/paddle/fluid/framework/details/op_handle_base.cc 
b/paddle/fluid/framework/details/op_handle_base.cc index 35fe5d631fbaad61ce64ccf70d58d176aa3d3a20..459bcff5c0b740be0d495a6ad648da7424bd1a42 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. #include "paddle/fluid/framework/details/op_handle_base.h" + #include #include @@ -88,6 +89,12 @@ void OpHandleBase::Run(bool use_cuda) { PADDLE_ENFORCE(!use_cuda); #endif + // skip running current op, used with inplace_addto_op_pass + if (skip_running_) { + VLOG(4) << "skip running: " << Name(); + return; + } + RunImpl(); } diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index c5aa1295aad695175e53b17d729006ffc67ce3ab..097f54d5d5891390fdd479d3e6f62ae0e97cd0d4 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -18,6 +18,7 @@ #include #include #include + #include "paddle/fluid/framework/details/var_handle.h" #include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/platform/device_context.h" @@ -52,6 +53,10 @@ class OpHandleBase { virtual Priority GetPriority() const { return kNormal; } + virtual bool GetSkipRunning() const { return skip_running_; } + + virtual void SetSkipRunning(bool skip_runing) { skip_running_ = skip_runing; } + virtual std::string Name() const = 0; void Run(bool use_cuda); @@ -131,6 +136,7 @@ class OpHandleBase { std::map dev_ctxes_; std::vector local_exec_scopes_; + bool skip_running_ = false; #ifdef PADDLE_WITH_CUDA std::unordered_map events_; diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc index e7d466c4af0711219c5a10a4c739ae3eb998e27d..35834fe5d7480819311a15ec54ab9412fc0a7cee 100644 --- 
a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc @@ -13,9 +13,11 @@ // limitations under the License. #include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h" + #include #include #include + #include "paddle/fluid/framework/ir/graph_helper.h" namespace paddle { @@ -104,7 +106,12 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor( places_(places), graphs_(std::move(graphs)), feed_status_(places.size(), FeedStatus::kNone) { - PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); + PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size(), + platform::errors::InvalidArgument( + "The number of places and the number of local scopes " + "should be equal, but got number of places is %d and " + "number of local scopes is %d.", + places_.size(), local_scopes_.size())); PADDLE_ENFORCE_EQ(places_.size(), graphs_.size(), platform::errors::InvalidArgument( diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index fe86d002ca8b33695839be3c5d2ff5fd20672952..7cc1f54131416ed454846c75c8c8a6849ec20e6c 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -13,10 +13,12 @@ // limitations under the License. 
#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" + #include #include #include #include + #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/variable_helper.h" @@ -37,7 +39,13 @@ ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor( var_infos_(std::move(var_infos)), places_(std::move(places)), scope_monitor_(places_, local_exec_scopes_) { - PADDLE_ENFORCE_EQ(local_scopes_.size(), local_exec_scopes_.size()); + PADDLE_ENFORCE_EQ( + local_scopes_.size(), local_exec_scopes_.size(), + platform::errors::InvalidArgument( + "The number of local scopes and the number of local execution scopes " + "should be equal, but got number of local scopes is %d and " + "number of local execution scopes is %d.", + local_scopes_.size(), local_exec_scopes_.size())); PrepareLocalExeScopes(); } diff --git a/paddle/fluid/framework/details/share_tensor_buffer_functor.cc b/paddle/fluid/framework/details/share_tensor_buffer_functor.cc index 6fdec553f3d65debdf8f6d95eeeb8ebe30b4a36a..5fbaf3cbfe028638ad9219d9e1286480ae16ee6b 100644 --- a/paddle/fluid/framework/details/share_tensor_buffer_functor.cc +++ b/paddle/fluid/framework/details/share_tensor_buffer_functor.cc @@ -13,9 +13,11 @@ // limitations under the License. 
#include "paddle/fluid/framework/details/share_tensor_buffer_functor.h" + #include #include #include + #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/platform/enforce.h" @@ -29,7 +31,8 @@ static inline const Tensor &GetTensorFromVar(const Variable *var) { if (var->IsType()) { return var->Get(); } else { - PADDLE_THROW("Variable must be type of LoDTensor"); + PADDLE_THROW(platform::errors::InvalidArgument( + "Variable must be type of LoDTensor.")); } } @@ -37,20 +40,27 @@ static inline Tensor *GetMutableTensorFromVar(Variable *var) { if (var->IsType()) { return var->GetMutable(); } else { - PADDLE_THROW("Variable must be type of LoDTensor"); + PADDLE_THROW(platform::errors::InvalidArgument( + "Variable must be type of LoDTensor.")); } } ShareTensorBufferFunctor::ShareTensorBufferFunctor( Scope *scope, size_t scope_idx, const std::string &op_type, const std::vector &in_var_infos, - const std::vector &out_var_names) + const std::vector &out_var_names, bool share_dims) : scope_(scope), scope_idx_(scope_idx), op_type_(op_type), in_var_infos_(in_var_infos), - out_var_names_(out_var_names) { - PADDLE_ENFORCE_EQ(in_var_infos_.size(), out_var_names_.size()); + out_var_names_(out_var_names), + share_dims_(share_dims) { + PADDLE_ENFORCE_EQ(in_var_infos_.size(), out_var_names_.size(), + platform::errors::PreconditionNotMet( + "The number of input variables and output variables " + "should be equal, but got number of input variables is " + "%d and number of output variables is %d.", + in_var_infos_.size(), out_var_names_.size())); for (size_t i = 0; i < in_var_infos_.size(); ++i) { AddReuseVarPair(in_var_infos_[i], out_var_names_[i]); } @@ -67,32 +77,59 @@ ShareTensorBufferFunctor::ReusedVars() const { void ShareTensorBufferFunctor::AddReuseVarPair( const ir::MemOptVarInfo *in_var_info, const std::string &out_var_name) { - PADDLE_ENFORCE_NOT_NULL(in_var_info, "in_var_info cannot be nullptr"); + 
PADDLE_ENFORCE_NOT_NULL( + in_var_info, + platform::errors::InvalidArgument( + "The input variables to be inplaced should not be NULL.")); PADDLE_ENFORCE_NE(in_var_info->Name(), out_var_name, - "in/out cannot have same name: %s", out_var_name); + platform::errors::InvalidArgument( + "The input variable and output variable to be inplaced " + "cannot have the same name: %s.", + out_var_name)); in_var_infos_.emplace_back(in_var_info); out_var_names_.emplace_back(out_var_name); } void ShareTensorBufferFunctor::CallOnce() { - PADDLE_ENFORCE(in_out_vars_.empty(), "in_out_vars_ must be initialized here"); + PADDLE_ENFORCE(in_out_vars_.empty(), + platform::errors::InvalidArgument( + "The input-output variable pairs to be " + "inplaced should be initialized here.")); for (size_t i = 0; i < in_var_infos_.size(); ++i) { auto *in_var = exec_scope_->FindVar(in_var_infos_[i]->Name()); auto *out_var = exec_scope_->FindVar(out_var_names_[i]); - PADDLE_ENFORCE_NOT_NULL(in_var); - PADDLE_ENFORCE_NOT_NULL(out_var); - PADDLE_ENFORCE_NE(in_var, out_var); + PADDLE_ENFORCE_NOT_NULL( + in_var, platform::errors::NotFound( + "The input variable(%s)to be inplaced should not be NULL.", + in_var_infos_[i]->Name())); + PADDLE_ENFORCE_NOT_NULL( + out_var, + platform::errors::NotFound( + "The output variable(%s) to be inplaced should not be NULL.", + out_var_names_[i])); + PADDLE_ENFORCE_NE( + in_var, out_var, + platform::errors::PreconditionNotMet( + "The input variable and output variable to be inplaced " + "cannot be the same variable(%s).", + out_var_names_[i])); in_out_vars_.emplace_back(in_var, out_var); } } void ShareTensorBufferFunctor::operator()(Scope *exec_scope) { if (!exec_scope_) { - PADDLE_ENFORCE_NOT_NULL(exec_scope); + PADDLE_ENFORCE_NOT_NULL(exec_scope, + platform::errors::InvalidArgument( + "The given execution scope should not be NULL " + "if the cached scope is NULL.")); exec_scope_ = exec_scope; CallOnce(); } else { - PADDLE_ENFORCE(exec_scope_ == exec_scope, "Scope must be 
the same"); + PADDLE_ENFORCE_EQ(exec_scope_, exec_scope, + platform::errors::InvalidArgument( + "The given execution scope and the cached execution " + "scope should be the same.")); } for (size_t i = 0; i < in_var_infos_.size(); ++i) { @@ -115,6 +152,13 @@ void ShareTensorBufferFunctor::operator()(Scope *exec_scope) { } else { out_tensor->ShareBufferWith(in_tensor); + // NOTE(zhiqiu): In the case of inplace addto, if the operator of + // the in_out_vars is skipped during running, we should set the dims of + // output as the same as input. + if (share_dims_) { + out_tensor->Resize(in_tensor.dims()); + } + VLOG(2) << "Share tensor buffer when running " << op_type_ << " : " << in_var_info->Name() << " -> " << out_var_names_[i]; } diff --git a/paddle/fluid/framework/details/share_tensor_buffer_functor.h b/paddle/fluid/framework/details/share_tensor_buffer_functor.h index 774dcd056e59bc8f090a5ceb916e73843c8c9df6..be49d1c432b2ab2b9741d873ba005b400e9f0829 100644 --- a/paddle/fluid/framework/details/share_tensor_buffer_functor.h +++ b/paddle/fluid/framework/details/share_tensor_buffer_functor.h @@ -19,6 +19,7 @@ #include #include #include + #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h" #include "paddle/fluid/framework/scope.h" @@ -40,11 +41,13 @@ class ShareTensorBufferFunctor { ShareTensorBufferFunctor( Scope *scope, size_t scope_idx, const std::string &op_type, const std::vector &in_var_infos, - const std::vector &out_var_names); + const std::vector &out_var_names, bool share_dims = false); void AddReuseVarPair(const ir::MemOptVarInfo *in_var_info, const std::string &out_var_name); + void SetShareDims(bool share_dims) { share_dims_ = share_dims; } + void operator()(Scope *exec_scope); std::unordered_map ReusedVars() const; @@ -66,6 +69,11 @@ class ShareTensorBufferFunctor { std::vector out_var_names_; std::vector> in_out_vars_; + + // NOTE(zhiqiu): In the case of 
inplace addto, if the operator of + // the in_out_vars is skipped during running, we should set the dims of output + // as the same as input. + bool share_dims_{false}; }; } // namespace details diff --git a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc index f06507257f1e9fc8b1783201adb533ec7b032c09..be3f5515a971900258ab5914b579deffe5d5b7d6 100644 --- a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc +++ b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc @@ -13,8 +13,10 @@ // limitations under the License. #include "paddle/fluid/framework/details/share_tensor_buffer_op_handle.h" + #include #include + #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" @@ -32,26 +34,35 @@ ComputationOpHandle *GetUniquePendingComputationOpHandle( for (ir::Node *pending_op : out_var->outputs) { auto &op = pending_op->Wrapper(); auto *compute_op = dynamic_cast(&op); - PADDLE_ENFORCE_NOT_NULL(compute_op); + PADDLE_ENFORCE_NOT_NULL( + compute_op, + platform::errors::PreconditionNotMet( + "The pending OpHandle should be ComputationOpHandle.")); if (result_op == nullptr) { result_op = compute_op; } else { - PADDLE_ENFORCE_EQ(result_op, compute_op); + PADDLE_ENFORCE_EQ( + result_op, compute_op, + platform::errors::PreconditionNotMet( + "The pending OpHandle should be the unique one.")); } } } - PADDLE_ENFORCE_NOT_NULL(result_op); + PADDLE_ENFORCE_NOT_NULL(result_op, + platform::errors::PreconditionNotMet( + "The pending OpHandle should not be NULL.")); return result_op; } ShareTensorBufferOpHandle::ShareTensorBufferOpHandle( ir::Node *node, Scope *scope, size_t scope_idx, const std::string &op_type, const std::vector &in_var_infos, - const std::vector &out_var_names) + const std::vector &out_var_names, bool share_dims) : OpHandleBase(node), - 
functor_(scope, scope_idx, op_type, in_var_infos, out_var_names) {} + functor_(scope, scope_idx, op_type, in_var_infos, out_var_names, + share_dims) {} std::unordered_map ShareTensorBufferOpHandle::ReusedVars() const { @@ -63,6 +74,10 @@ void ShareTensorBufferOpHandle::AddReuseVarPair( functor_.AddReuseVarPair(in_var_info, out_var_name); } +void ShareTensorBufferOpHandle::SetShareDims(bool share_dims) { + functor_.SetShareDims(share_dims); +} + void ShareTensorBufferOpHandle::InitCUDA() { #ifdef PADDLE_WITH_CUDA int dev_id = diff --git a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.h b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.h index b22f5621fe44d887d70d82ce4dc9e26596d23f4e..a02c346485eca813f0d0f0b432b8b647e2fe4414 100644 --- a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.h +++ b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.h @@ -17,6 +17,7 @@ #include #include #include + #include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/details/share_tensor_buffer_functor.h" @@ -31,7 +32,7 @@ class ShareTensorBufferOpHandle : public OpHandleBase { ir::Node *node, Scope *scope, size_t scope_idx, const std::string &op_type, const std::vector &in_vars_infos, - const std::vector &out_var_names); + const std::vector &out_var_names, bool share_dims = false); std::unordered_map ReusedVars() const; @@ -42,6 +43,8 @@ class ShareTensorBufferOpHandle : public OpHandleBase { void AddReuseVarPair(const ir::MemOptVarInfo *in_var_info, const std::string &out_var_name); + void SetShareDims(bool share_dims); + const ShareTensorBufferFunctor &Functor() const { return functor_; } protected: diff --git a/paddle/fluid/framework/details/ssa_graph_executor.cc b/paddle/fluid/framework/details/ssa_graph_executor.cc index 71123f708e3ca149d9fd634f55652cede5a57b50..2723a46dcfae3582a9286bcacba8d2e0a4990ac5 100644 --- 
a/paddle/fluid/framework/details/ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/ssa_graph_executor.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/details/ssa_graph_executor.h" + #include "paddle/fluid/framework/details/fetch_async_op_handle.h" namespace paddle { @@ -27,8 +28,9 @@ void ClearFetchOp(ir::Graph* graph, std::vector* fetch_ops) { PADDLE_ENFORCE_EQ(dynamic_cast(op) != nullptr || dynamic_cast(op) != nullptr, true, - "The input ops of ClearFetchOp function should be " - "FetchOpHandle or FetchAsyncOpHandle."); + platform::errors::PreconditionNotMet( + "The input ops of ClearFetchOp function should be " + "FetchOpHandle or FetchAsyncOpHandle.")); for (auto& out_var : op->Node()->outputs) { graph->RemoveNode(out_var); } diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 92c3a0cd6b9c01497199fece0a9bdafc89f64678..2ed52b3bd94733e329ccf8270054b23b1ad29d87 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" + #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/platform/profiler.h" @@ -138,7 +139,10 @@ inline FetchResultType ThreadedSSAGraphExecutor::RunImpl( } } } - PADDLE_ENFORCE(ready_ops.empty()); + PADDLE_ENFORCE_EQ( + ready_ops.empty(), true, + platform::errors::Fatal("After the execution of computation graph, " + "there are unexecuted operators left.")); } // Wait FetchOps. 
@@ -165,9 +169,8 @@ void ThreadedSSAGraphExecutor::InsertFetchOps( FetchResultType *fetch_data, bool return_merged) { std::unordered_map> fetched_vars; std::unordered_set local_ready_vars; - std::unordered_set fetch_tensor_set(fetch_tensors.begin(), - fetch_tensors.end()); - for (auto &fetch_var_name : fetch_tensor_set) { + + for (auto &fetch_var_name : fetch_tensors) { for (auto &var_map : graph_->Get(details::kGraphVars)) { auto it = var_map.find(fetch_var_name); if (it != var_map.end()) { @@ -231,7 +234,11 @@ void ThreadedSSAGraphExecutor::InsertFetchOps( ready_ops->insert(static_cast(op)); } } - PADDLE_ENFORCE_EQ(local_ready_vars.size(), 0); + PADDLE_ENFORCE_EQ( + local_ready_vars.size(), 0, + platform::errors::Fatal( + "The number of ready variables should be 0, but got %d.", + local_ready_vars.size())); } void ThreadedSSAGraphExecutor::InsertPendingOp( @@ -277,7 +284,9 @@ void ThreadedSSAGraphExecutor::PrepareOpDeps() { } } op_deps_->num_ops_ = ready_ops.size() + pending_ops.size(); - PADDLE_ENFORCE_GT(op_deps_->num_ops_, 0, "The graph doesn't have operators."); + PADDLE_ENFORCE_GT( + op_deps_->num_ops_, 0, + platform::errors::InvalidArgument("The graph doesn't have operators.")); for (auto ready_var : ready_vars) { pending_vars.erase(ready_var); diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index b8b584f27200bd3f89efcc20be2c6a3435274a56..45fa3adbf14080317fe004a7113b58d34145447d 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -14,6 +14,8 @@ #pragma once +#include // ThreadPool in thrird party + #include #include #include @@ -24,8 +26,6 @@ #include #include -#include // ThreadPool in thrird party - #include "paddle/fluid/framework/blocking_queue.h" #include "paddle/fluid/framework/details/exception_holder.h" #include "paddle/fluid/framework/details/execution_strategy.h" diff 
--git a/paddle/fluid/framework/details/var_handle.h b/paddle/fluid/framework/details/var_handle.h index 86428f8b7613760f59a1166189c61f3217d8017d..bb38424d3ae2d74f6f0a48e11df95b60dbf432f3 100644 --- a/paddle/fluid/framework/details/var_handle.h +++ b/paddle/fluid/framework/details/var_handle.h @@ -54,8 +54,10 @@ struct VarHandleBase { void AddOutput(OpHandleBase* out, ir::Node* node) { if (pending_ops_.find(out) == pending_ops_.end()) { - PADDLE_ENFORCE(out != nullptr, "The output of %s should not be nullptr", - this->Node()->Name()); + PADDLE_ENFORCE_NOT_NULL(out, + platform::errors::InvalidArgument( + "The output added to VarHandle %s is NULL.", + this->Node()->Name())); pending_ops_.insert(out); node_->outputs.push_back(node); } @@ -120,7 +122,10 @@ struct VarHandle : public VarHandleBase { bool HasEvent() { return has_event_; } const cudaEvent_t& GetEvent() { - PADDLE_ENFORCE(HasEvent(), "The event is not set."); + PADDLE_ENFORCE_EQ( + HasEvent(), true, + platform::errors::PreconditionNotMet( + "The cuda event is not set, maybe InitCUDA() is not called.")); return event_; } diff --git a/paddle/fluid/framework/details/variable_visitor.cc b/paddle/fluid/framework/details/variable_visitor.cc index 134f759081a0778194c20785e215420d6e2bb622..fba0c1bf463ee0b9a434c350474af4be0c589e30 100644 --- a/paddle/fluid/framework/details/variable_visitor.cc +++ b/paddle/fluid/framework/details/variable_visitor.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/framework/details/variable_visitor.h" + #include "paddle/fluid/framework/selected_rows.h" namespace paddle { namespace framework { @@ -24,7 +25,9 @@ static void VisitVariable(Variable* var, Func* func) { } else if (var->IsType()) { (*func)(var->GetMutable()); } else { - PADDLE_THROW("Not supported type %s", ToTypeName(var->Type())); + PADDLE_THROW(platform::errors::Unimplemented( + "VisitVariable is not supported for type %s.", + ToTypeName(var->Type()))); } } @@ -35,7 +38,8 @@ static void VisitVariable(const Variable& var, Func* func) { } else if (var.IsType()) { (*func)(var.Get()); } else { - PADDLE_THROW("Not supported type %s", ToTypeName(var.Type())); + PADDLE_THROW(platform::errors::Unimplemented( + "VisitVariable is not supported for type %s.", ToTypeName(var.Type()))); } } @@ -50,7 +54,8 @@ struct TensorVisitor { template void operator()() { - PADDLE_THROW("Not Support to get LoDTensor from %s", typeid(T).name()); + PADDLE_THROW(platform::errors::Unimplemented( + "Getting tensor from type %s is not supported.", typeid(T).name())); } }; @@ -78,8 +83,8 @@ struct ShareDimsAndLoDVisitor { template void operator()(const T&) { - PADDLE_ENFORCE("ShareDimsAndLoD is not supported by type %s", - typeid(T).name()); + PADDLE_THROW(platform::errors::Unimplemented( + "ShareDimsAndLoD is not supported for type %s.", typeid(T).name())); } }; @@ -89,42 +94,54 @@ void VariableVisitor::ShareDimsAndLoD(const Variable& src, Variable* trg) { } struct EnforceShapeAndDTypeEQVisitor { - const Variable* trg_; + const Variable* dst_; void operator()(const LoDTensor& src) { - auto& tensor = trg_->Get(); - PADDLE_ENFORCE_EQ( - src.place().which(), tensor.place().which(), - "The Places of the two Variable must be all on CPU or all on GPU."); + auto& tensor = dst_->Get(); + PADDLE_ENFORCE_EQ(src.place().which(), tensor.place().which(), + platform::errors::PreconditionNotMet( + "The place type of the two variables is not equal.")); PADDLE_ENFORCE_EQ(src.type(), 
tensor.type(), - "The dtype of the two Variable is not equal."); - PADDLE_ENFORCE_EQ(src.dims(), tensor.dims(), - "The dims of the two Variable is not equal."); + platform::errors::PreconditionNotMet( + "The dtype of the two variables is not equal.")); + PADDLE_ENFORCE_EQ( + src.dims(), tensor.dims(), + platform::errors::PreconditionNotMet( + "The layout of the two variables' tensors is not equal.")); PADDLE_ENFORCE_EQ(src.lod(), tensor.lod(), - "The lod of the two Variable is not equal."); - PADDLE_ENFORCE_EQ(src.layout(), tensor.layout(), - "The layout of the two Variable's tensor is not equal."); + platform::errors::PreconditionNotMet( + "The lod of the two variable is not equal.")); + PADDLE_ENFORCE_EQ( + src.layout(), tensor.layout(), + platform::errors::PreconditionNotMet( + "The layout of the two variables' tensors tensor is not equal.")); } void operator()(const SelectedRows& src) { - auto& selected_rows = trg_->Get(); - PADDLE_ENFORCE_EQ( - src.place().which(), selected_rows.place().which(), - "The Places of the two Variable must be all on CPU or all on GPU."); + auto& selected_rows = dst_->Get(); + PADDLE_ENFORCE_EQ(src.place().which(), selected_rows.place().which(), + platform::errors::PreconditionNotMet( + "The place type of the two variables is not equal.")); PADDLE_ENFORCE_EQ(src.value().type(), selected_rows.value().type(), - "The dtype of the two Variable is not equal."); - PADDLE_ENFORCE_EQ(src.value().layout(), selected_rows.value().layout(), - "The layout of the two Variable's tensor is not equal."); + platform::errors::PreconditionNotMet( + "The dtype of the two variables is not equal.")); + PADDLE_ENFORCE_EQ( + src.value().layout(), selected_rows.value().layout(), + platform::errors::PreconditionNotMet( + "The layout of the two variables' tensors is not equal.")); PADDLE_ENFORCE_EQ(src.height(), selected_rows.height(), - "The height of the two Variable is not equal."); + platform::errors::PreconditionNotMet( + "The height of the two variables 
is not equal.")); PADDLE_ENFORCE_EQ(src.GetCompleteDims(), selected_rows.GetCompleteDims(), - "The dims of the two Variable is not equal."); + platform::errors::PreconditionNotMet( + "The dims of the two variables is not equal.")); } template void operator()(const T&) { - PADDLE_ENFORCE("EnforceShapeAndDTypeEQ is not supported by type %s", - typeid(T).name()); + PADDLE_THROW(platform::errors::Unimplemented( + "EnforceShapeAndDTypeEQ is not supported for type %s.", + typeid(T).name())); } }; diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h index 04befbe1ca01d4bfec5872a63565f21d110a6c67..efe6fa1b2daffcbbaa2b7945e7139ac83f689bcd 100644 --- a/paddle/fluid/framework/device_worker.h +++ b/paddle/fluid/framework/device_worker.h @@ -441,6 +441,7 @@ class SectionWorker : public DeviceWorker { skip_vars_ = skip_vars; } static void ResetBatchId() { batch_id_ = 0; } + static void ResetThreadCompletedFlag() { threads_completed = false; } static std::atomic cpu_id_; diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto old mode 100755 new mode 100644 index 551d1342edeb335d1cad4782f85ae9f94f8739bd..df482f43346c57cc59af42936b6a7308b76cbd3a --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -36,7 +36,15 @@ message AMPConfig { repeated string custom_black_varnames = 9; } -message LocalSGDConfig { optional int32 k_steps = 1 [ default = 4 ]; } +message LocalSGDConfig { + optional int32 k_steps = 1 [ default = 1 ]; + optional int32 begin_step = 2 [ default = 1 ]; +} + +message AdaptiveLocalSGDConfig { + optional int32 init_k_steps = 1 [ default = 1 ]; + optional int32 begin_step = 2 [ default = 1 ]; +} message GradientMergeConfig { optional int32 k_steps = 1 [ default = 1 ]; @@ -52,6 +60,8 @@ message DGCConfig { message LarsConfig { optional float lars_coeff = 1 [ default = 0.001 ]; optional float lars_weight_decay = 2 
[ default = 0.0005 ]; + optional float epsilon = 3 [ default = 0.0 ]; + repeated string exclude_from_weight_decay = 4; } message LambConfig { @@ -116,6 +126,7 @@ message DistributedStrategy { optional bool cudnn_exhaustive_search = 21 [ default = true ]; optional int32 conv_workspace_size_limit = 22 [ default = 4000 ]; optional bool cudnn_batchnorm_spatial_persistent = 23 [ default = true ]; + optional bool adaptive_localsgd = 24 [ default = false ]; optional RecomputeConfig recompute_configs = 101; optional AMPConfig amp_configs = 102; @@ -126,6 +137,7 @@ message DistributedStrategy { optional AsyncConfig a_sync_configs = 107; optional LarsConfig lars_configs = 108; optional LambConfig lamb_configs = 109; + optional AdaptiveLocalSGDConfig adaptive_localsgd_configs = 110; optional BuildStrategy build_strategy = 201; optional ExecutionStrategy execution_strategy = 202; } diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc index 180b33d0cb72e2c4c9e6e8caff9f0ef5f1b04689..915589b3242b7d5675e630aca7310185fd109ec2 100644 --- a/paddle/fluid/framework/dlpack_tensor.cc +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -23,6 +23,7 @@ template static ::DLDataType GetDLDataTypeCode() { ::DLDataType dtype; if (std::is_same::value || + std::is_same::value || std::is_floating_point::value) { dtype.code = kDLFloat; } else if (std::is_unsigned::value) { diff --git a/paddle/fluid/framework/fleet/gloo_wrapper.cc b/paddle/fluid/framework/fleet/gloo_wrapper.cc index bb958f1ac015bfd1a71b3ccd530406a33e4e37cb..f195dde40843c8c4ee5168d11ad0b8eac8199f4e 100644 --- a/paddle/fluid/framework/fleet/gloo_wrapper.cc +++ b/paddle/fluid/framework/fleet/gloo_wrapper.cc @@ -19,6 +19,8 @@ limitations under the License. 
*/ namespace gloo { namespace rendezvous { +constexpr int kNodeSize = 136; + HdfsStore::HdfsStore(const std::string& path) { path_ = path; wait_sleep_ms_ = 10000; @@ -213,12 +215,14 @@ void ParallelConnectContext::connectFullMesh( storeKey << rank; store.set(storeKey.str(), allBytes); + auto total_add_size = kNodeSize * (size - 1); + std::vector> connect_threads(thread_num_); // Connect every pair for (uint32_t i = 0; i < connect_threads.size(); ++i) { connect_threads[i].reset(new std::thread( - [&store, &transportContext, this](size_t thread_idx, - size_t thread_num) -> void { + [&store, &transportContext, total_add_size, this]( + size_t thread_idx, size_t thread_num) -> void { for (int i = thread_idx; i < size; i += thread_num) { if (i == rank) { continue; @@ -226,8 +230,23 @@ void ParallelConnectContext::connectFullMesh( // Wait for address of other side of this pair to become available std::string key = std::to_string(i); store.wait({key}, getTimeout()); + + std::vector allAddrs; + auto max_retry_times = 5; // Connect to other side of this pair - auto allAddrs = store.get(key); + + while (max_retry_times > 0) { + allAddrs = store.get(key); + + VLOG(3) << "store get all address size: " << allAddrs.size() + << " except: " << total_add_size; + if (allAddrs.size() == static_cast(total_add_size)) { + break; + } + + --max_retry_times; + } + auto addr = extractAddress(allAddrs, i); transportContext->getPair(i)->connect(addr); } diff --git a/paddle/fluid/framework/fleet/nccl_wrapper.cc b/paddle/fluid/framework/fleet/nccl_wrapper.cc index d5a25605cf81147b520bf541e38f4f75e53ae756..33a91388fd8cc97d181df46ab826d384860d38f5 100644 --- a/paddle/fluid/framework/fleet/nccl_wrapper.cc +++ b/paddle/fluid/framework/fleet/nccl_wrapper.cc @@ -25,7 +25,7 @@ bool NCCLWrapper::is_initialized_ = false; void NCCLWrapper::InitNCCL() { #if defined(PADDLE_WITH_NCCL) - PADDLE_ENFORCE(platform::dynload::ncclCommInitRank( + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclCommInitRank( 
&(nccl_info_.comm_), nccl_info_.global_ranks_, nccl_info_.nccl_id_, nccl_info_.my_global_rank_)); #endif @@ -41,7 +41,8 @@ void NCCLWrapper::SetNCCLId(const NCCLInfo& nccl_info) { NCCLInfo NCCLWrapper::GetNCCLId() { #if defined(PADDLE_WITH_NCCL) - PADDLE_ENFORCE(platform::dynload::ncclGetUniqueId(&(nccl_info_.nccl_id_))); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::ncclGetUniqueId(&(nccl_info_.nccl_id_))); #endif return nccl_info_; } @@ -52,8 +53,8 @@ void NCCLWrapper::SetRankInfo(const int local_rank, const int global_rank, nccl_info_.local_rank_ = local_rank; nccl_info_.my_global_rank_ = global_rank; nccl_info_.global_ranks_ = ranks; - PADDLE_ENFORCE(cudaSetDevice(local_rank)); - PADDLE_ENFORCE(cudaStreamCreate(&(nccl_info_.stream_))); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(local_rank)); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&(nccl_info_.stream_))); #endif return; } @@ -65,7 +66,7 @@ void NCCLWrapper::SyncVar(const int root_rank, const Scope& scope, auto var = scope.FindVar(name); LoDTensor* tensor = var->GetMutable(); int32_t total_size = tensor->numel(); - PADDLE_ENFORCE(platform::dynload::ncclBcast( + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast( reinterpret_cast(tensor->data()), total_size, ncclFloat, root_rank, nccl_info_.comm_, nccl_info_.stream_)); cudaStreamSynchronize(nccl_info_.stream_); diff --git a/paddle/fluid/framework/generator.cc b/paddle/fluid/framework/generator.cc index 9bde9e20b19a0b14ce4489b91d9ab3d5273f7f9a..d51e97d98e902a87cd2a44d2019e93e8dfc30fc8 100644 --- a/paddle/fluid/framework/generator.cc +++ b/paddle/fluid/framework/generator.cc @@ -21,10 +21,46 @@ limitations under the License. 
*/ #include #include #include +#include + +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/place.h" namespace paddle { namespace framework { +const std::shared_ptr& GetDefaultCUDAGenerator(int64_t device_id) { +#ifdef PADDLE_WITH_CUDA + + static int64_t num_cuda_devices = -1; + static std::once_flag num_devices_init_flag; + static std::deque cuda_device_flags; + static std::vector> default_cuda_generators; + + std::call_once(num_devices_init_flag, []() { + num_cuda_devices = paddle::platform::GetCUDADeviceCount(); + cuda_device_flags.resize(num_cuda_devices); + default_cuda_generators.resize(num_cuda_devices); + }); + if (device_id < 0) { + PADDLE_THROW(platform::errors::InvalidArgument( + "cuda device id shoule be greater than 0")); + } + + std::call_once(cuda_device_flags[device_id], [device_id]() { + default_cuda_generators[device_id] = + std::make_shared(GetRandomSeed(), device_id); + VLOG(4) << "initial seed: " + << default_cuda_generators[device_id]->GetCurrentSeed(); + }); + return default_cuda_generators[device_id]; +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "getDefaultCUDAGenerator only support in CUDA place")); +#endif +} + const std::shared_ptr& DefaultCPUGenerator() { static auto default_cpu_generator = std::make_shared(GetRandomSeed()); @@ -103,6 +139,7 @@ uint64_t Generator::Seed() { void Generator::SetCurrentSeed(uint64_t seed) { std::lock_guard lock(this->mu_); this->state_.current_seed = seed; + this->state_.thread_offset = 0; std::seed_seq seq({seed}); this->engine_->seed(seq); } @@ -123,6 +160,22 @@ uint64_t Generator::Random64() { return (*engine)(); } +std::pair Generator::IncrementOffset( + uint64_t increament_offset) { + uint64_t cur_offset = this->state_.thread_offset; +#ifdef PADDLE_WITH_CUDA + std::lock_guard lock(this->mu_); + + this->state_.thread_offset += increament_offset; + +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Increment 
Offset only support in CUDA place")); +#endif + return std::make_pair(static_cast(this->state_.current_seed), + cur_offset); +} + void Generator::SetIsInitPy(bool is_init_py) { this->is_init_py_ = is_init_py; VLOG(4) << "SetIsInitPy:" << this->is_init_py_; diff --git a/paddle/fluid/framework/generator.h b/paddle/fluid/framework/generator.h index 82b35f7ad550e770e8d10457ddf6cdf8e6fbd709..a279c2e4e1458293b6579b7b7cb2111e440e5d5e 100644 --- a/paddle/fluid/framework/generator.h +++ b/paddle/fluid/framework/generator.h @@ -38,6 +38,7 @@ static uint64_t GetRandomSeed() { struct GeneratorState { int64_t device = -1; uint64_t current_seed = 34342423252; + uint64_t thread_offset = 0; std::mt19937_64 cpu_engine; }; @@ -49,6 +50,7 @@ struct Generator { this->state_.cpu_engine = *engine; this->state_.device = -1; this->state_.current_seed = seed; + this->state_.thread_offset = 0; this->engine_ = engine; VLOG(4) << "initial seed: " << this->state_.current_seed << ", cpu engine: " << &this->state_.cpu_engine; @@ -59,11 +61,25 @@ struct Generator { this->state_.cpu_engine = *engine; this->state_.device = -1; this->state_.current_seed = seed; + this->state_.thread_offset = 0; this->engine_ = engine; VLOG(4) << "initial seed: " << this->state_.current_seed << ", cpu engine: " << &this->state_.cpu_engine; this->is_init_py_ = true; // TODO(zhiqiu): remove it in future } + Generator(uint64_t seed, uint64_t device_id) { + std::seed_seq seq({seed}); + auto engine = std::make_shared(seq); + this->state_.cpu_engine = *engine; + this->state_.device = device_id; + this->state_.current_seed = seed; + this->state_.thread_offset = 0; + this->engine_ = engine; + VLOG(4) << "initial seed: " << this->state_.current_seed + << ", cpu engine: " << &this->state_.cpu_engine; + this->is_init_py_ = false; // TODO(zhiqiu): remove it in future + } + Generator(const Generator& other) = delete; // get random state @@ -83,8 +99,11 @@ struct Generator { uint64_t Random64(); + std::pair 
IncrementOffset(uint64_t increament_offset); + void SetIsInitPy(bool); bool GetIsInitPy() const; + uint64_t get_device_id() { return this->state_.device; } private: GeneratorState state_; @@ -105,5 +124,8 @@ std::shared_ptr OpDefaultCPUEngine(); std::shared_ptr GetCPURandomEngine(uint64_t); +const std::shared_ptr& GetDefaultCUDAGenerator( + int64_t device_id = -1); + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 8787aa8a94a44c2c36868fea4b88ede5f91b19f4..5bb833f613529a81d5ae4e18fc5ad7cd1136354b 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -102,6 +102,8 @@ if(WITH_MKLDNN) pass_library(conv_concat_relu_mkldnn_fuse_pass inference DIR mkldnn) pass_library(conv_elementwise_add_mkldnn_fuse_pass inference DIR mkldnn) pass_library(scale_matmul_fuse_pass inference DIR mkldnn) + pass_library(cpu_bfloat16_placement_pass inference DIR mkldnn) + pass_library(cpu_bfloat16_pass inference DIR mkldnn) pass_library(fc_mkldnn_pass inference DIR mkldnn) pass_library(cpu_quantize_placement_pass base DIR mkldnn) pass_library(cpu_quantize_pass inference DIR mkldnn) @@ -162,4 +164,6 @@ endif() cc_test(test_cpu_quantize_squash_pass SRCS mkldnn/cpu_quantize_squash_pass_tester.cc DEPS cpu_quantize_squash_pass naive_executor) cc_test(test_reshape_transpose_matmul_mkldnn_fuse_pass SRCS mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass_tester.cc DEPS reshape_transpose_matmul_mkldnn_fuse_pass) cc_test(test_matmul_transpose_reshape_fuse_pass SRCS mkldnn/matmul_transpose_reshape_fuse_pass_tester.cc DEPS matmul_transpose_reshape_fuse_pass) + cc_test(test_cpu_bfloat16_placement_pass SRCS mkldnn/cpu_bfloat16_placement_pass_tester.cc DEPS cpu_bfloat16_placement_pass) + cc_test(test_cpu_bfloat16_pass SRCS mkldnn/cpu_bfloat16_pass_tester.cc DEPS cpu_bfloat16_pass) endif () diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc 
b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc index b50b4f37caecd8d8d5c393ee3a5c5b76c1f406be..fd8b55a6b7deb9bf4685b27f8849a49ab77f64e9 100644 --- a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc @@ -18,6 +18,7 @@ #include #include #include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/math/cpu_vec.h" #include "paddle/fluid/platform/enforce.h" @@ -225,3 +226,14 @@ REGISTER_PASS(conv_affine_channel_fuse_pass, paddle::framework::ir::ConvAffineChannelFusePass); REGISTER_PASS(conv_eltwiseadd_affine_channel_fuse_pass, paddle::framework::ir::ConvEltwiseAddAffineChannelFusePass); +REGISTER_PASS_CAPABILITY(conv_affine_channel_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("conv2d", 0) + .EQ("affine_channel", 0)); +REGISTER_PASS_CAPABILITY(conv_eltwiseadd_affine_channel_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("conv2d", 0) + .EQ("elementwise_add", 0) + .EQ("affine_channel", 0)); diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc index 9d3e0806ac79d838765ca5a4bbf61d0f67ab6ed5..fb787e08814429781bf64efda2f1eb915f185f63 100644 --- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc @@ -18,6 +18,7 @@ #include #include #include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/math/cpu_vec.h" #include "paddle/fluid/platform/enforce.h" @@ -372,3 +373,14 @@ REGISTER_PASS(depthwise_conv_bn_fuse_pass, paddle::framework::ir::DepthwiseConvBNFusePass); REGISTER_PASS(depthwise_conv_eltwiseadd_bn_fuse_pass, paddle::framework::ir::DepthwiseConvEltwiseAddBNFusePass); +REGISTER_PASS_CAPABILITY(conv_bn_fuse_pass) + 
.AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("conv2d", 0) + .EQ("batch_norm", 0)); +REGISTER_PASS_CAPABILITY(conv_eltwiseadd_bn_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("conv2d", 0) + .EQ("elementwise_add", 0) + .EQ("batch_norm", 0)); diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc index 2627da7dc40f19a9df22d2f44a4b1032df5cea01..ad6af69ae02e4f6262ee8760dbda90e0b5833feb 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc @@ -11,9 +11,9 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. - #include "paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h" #include +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace framework { @@ -116,3 +116,10 @@ void ConvElementwiseAdd2ActFusePass::ApplyImpl(ir::Graph* graph) const { REGISTER_PASS(conv_elementwise_add2_act_fuse_pass, paddle::framework::ir::ConvElementwiseAdd2ActFusePass); +REGISTER_PASS_CAPABILITY(conv_elementwise_add2_act_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("conv2d", 0) + .EQ("elementwise_add", 0) + .EQ("relu", 0) + .EQ("identity", 0)); diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc index 0b454a0407e48fcf2693975b00c60ee5448786e4..c5fa47ec55fe9a15caca493a4b0c72c22f2cf5c7 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc @@ -15,6 +15,7 @@ #include 
"paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h" #include #include "paddle/fluid/framework/ir/graph_viz_pass.h" +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace framework { @@ -102,3 +103,10 @@ void ConvElementwiseAddActFusePass::ApplyImpl(ir::Graph* graph) const { REGISTER_PASS(conv_elementwise_add_act_fuse_pass, paddle::framework::ir::ConvElementwiseAddActFusePass); +REGISTER_PASS_CAPABILITY(conv_elementwise_add_act_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("conv2d", 0) + .EQ("elementwise_add", 0) + .EQ("relu", 0) + .EQ("identity", 0)); diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc index 007770cf57d278d155650c00996413e3bc8e7b53..38c0b773ddeddddea68ecefd6c8525449c52d7a6 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include - #include "paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h" +#include #include "paddle/fluid/framework/ir/graph_viz_pass.h" +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace framework { @@ -89,3 +89,8 @@ void ConvElementwiseAddFusePass::ApplyImpl(ir::Graph* graph) const { REGISTER_PASS(conv_elementwise_add_fuse_pass, paddle::framework::ir::ConvElementwiseAddFusePass); +REGISTER_PASS_CAPABILITY(conv_elementwise_add_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("conv2d", 0) + .EQ("elementwise_add", 0)); diff --git a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc index 7612df9ab915011001b57b37e4fd559d393302a7..3f88a460d140f6d7389194a29e37128f5ba5b458 100644 --- a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc @@ -19,6 +19,7 @@ #include #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace framework { @@ -334,3 +335,8 @@ void EmbeddingEltwiseLayerNormFusePass::ApplyImpl(Graph* graph) const { REGISTER_PASS(embedding_eltwise_layernorm_fuse_pass, paddle::framework::ir::EmbeddingEltwiseLayerNormFusePass); +REGISTER_PASS_CAPABILITY(embedding_eltwise_layernorm_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("lookup_table", 0) + .EQ("elementweise_add", 0)); diff --git a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass_tester.cc b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass_tester.cc index 71c9dbae1a46af1ecae0aaff3fde52de8142d4bb..727e42629f9fab9183668ae0cc84ae54eb01982c 100644 --- a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass_tester.cc +++ 
b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass_tester.cc @@ -16,12 +16,13 @@ limitations under the License. */ #include #include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace framework { namespace ir { -TEST(SkipLayerNormFusePass, basic) { +TEST(EmbeddingElewiseLayernormFusePass, basic) { // inputs operator output // -------------------------------------------------------------------- // (x, y) elementwise_add -> elementwise_out @@ -91,6 +92,12 @@ TEST(SkipLayerNormFusePass, basic) { "The number of fusion nodes does not meet expectations after fuse")); } +TEST(EmbeddingElewiseLayernormFusePass, pass_op_version_check) { + ASSERT_TRUE( + paddle::framework::compatible::PassVersionCheckerRegistrar::GetInstance() + .IsPassCompatible("embedding_eltwise_layernorm_fuse_pass")); +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc index c50b7476c6a9616a784646b3ef6a43140ac2d401..02e3e2542f6e8dea47c53fd298c7ae7512a72c36 100644 --- a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc @@ -23,6 +23,8 @@ #include "paddle/fluid/operators/math/cpu_vec.h" #include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/framework/op_version_registry.h" + namespace paddle { namespace framework { namespace ir { @@ -34,7 +36,7 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, // Build pattern PDNode* x = pattern->NewNode(patterns::PDNodeName(name_scope, "x")) - ->assert_is_op_input("lookup_table") + ->assert_is_op_input("lookup_table_v2") ->assert_var_not_persistable(); patterns::Embedding embedding_pattern(pattern, name_scope); // TODO(jczaja): Intermediate can only be for val that are not used anywhere @@ -256,3 +258,11 @@ void 
EmbeddingFCLSTMFusePass::ApplyImpl(ir::Graph* graph) const { REGISTER_PASS(embedding_fc_lstm_fuse_pass, paddle::framework::ir::EmbeddingFCLSTMFusePass); +REGISTER_PASS_CAPABILITY(embedding_fc_lstm_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("lookup_table_v2", 0) + .EQ("mul", 0) + .EQ("elementwise_add", 0) + .EQ("lstm", 0) + .EQ("fused_embedding_fc_lstm", 0)); diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc index 066a8fb975740ad5e45b4840a7404160d086b6f0..d60510a4074997a028cd914ca7a0e76335801c80 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc @@ -18,6 +18,7 @@ #include #include #include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -182,3 +183,10 @@ int FCFusePass::ApplyFCPattern(Graph* graph, bool with_relu) const { REGISTER_PASS(fc_fuse_pass, paddle::framework::ir::FCFusePass) .RequirePassAttr("use_gpu"); +REGISTER_PASS_CAPABILITY(fc_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("mul", 0) + .EQ("elementwise_add", 0) + .EQ("relu", 0) + .EQ("fc", 0)); diff --git a/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc b/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc index a2185cdc5593cc36ed6ceda839fb13c28b45600c..f5fea90ac2fcee8e9c48ca21203b3b60cd7f7166 100644 --- a/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc @@ -16,6 +16,7 @@ #include #include #include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace framework { @@ -125,7 +126,6 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, auto* x_n = subgraph.at(x); GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern); GET_IR_NODE_FROM_SUBGRAPH(mul, mul, 
fc_pattern); - GET_IR_NODE_FROM_SUBGRAPH(fc_out, elementwise_add_out, fc_pattern); GET_IR_NODE_FROM_SUBGRAPH(Weight, Weight, gru_pattern); GET_IR_NODE_FROM_SUBGRAPH(gru, gru, gru_pattern); GET_IR_NODE_FROM_SUBGRAPH(Bias, Bias, gru_pattern); @@ -136,10 +136,17 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, gru_pattern); GET_IR_NODE_FROM_SUBGRAPH(BatchHidden, BatchHidden, gru_pattern); + // TODO(wilber): Support origin_mode=True. + if (gru->Op()->GetAttrIfExists("origin_mode") == true) { + LOG(INFO) << "fc_gru_fuse_pass not supported when origin_mode=True."; + return; + } + if (with_fc_bias) { GET_IR_NODE_FROM_SUBGRAPH(mul_out, mul_out, fc_pattern); GET_IR_NODE_FROM_SUBGRAPH(fc_bias, bias, fc_pattern); GET_IR_NODE_FROM_SUBGRAPH(elementwise_add, elementwise_add, fc_pattern); + GET_IR_NODE_FROM_SUBGRAPH(fc_out, elementwise_add_out, fc_pattern); gru_creater(gru, x_n, w, Weight, Bias, Hidden, fc_bias); // Remove unneeded nodes. @@ -188,3 +195,16 @@ void FCGRUFusePass::ApplyImpl(ir::Graph* graph) const { REGISTER_PASS(mul_gru_fuse_pass, paddle::framework::ir::MulGRUFusePass); REGISTER_PASS(fc_gru_fuse_pass, paddle::framework::ir::FCGRUFusePass); +REGISTER_PASS_CAPABILITY(mul_gru_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("mul", 0) + .EQ("gru", 0) + .EQ("fusion_gru", 0)); +REGISTER_PASS_CAPABILITY(fc_gru_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("mul", 0) + .EQ("elementwise_add", 0) + .EQ("gru", 0) + .EQ("fusion_gru", 0)); diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc index 12c7fc051e23a946ec9049e061499056f009bfa3..a3c57e14e1aedbed1e4cf462d4883cd83bf2fa10 100644 --- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc @@ -16,6 +16,7 @@ #include #include #include "paddle/fluid/framework/lod_tensor.h" +#include 
"paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace framework { @@ -196,3 +197,17 @@ void FCLstmFusePass::ApplyImpl(ir::Graph* graph) const { REGISTER_PASS(mul_lstm_fuse_pass, paddle::framework::ir::MulLstmFusePass); REGISTER_PASS(fc_lstm_fuse_pass, paddle::framework::ir::FCLstmFusePass); + +REGISTER_PASS_CAPABILITY(fc_lstm_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("mul", 0) + .EQ("elementwise_add", 0) + .EQ("lstm", 0) + .EQ("fusion_lstm", 0)); +REGISTER_PASS_CAPABILITY(mul_lstm_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("mul", 0) + .EQ("lstm", 0) + .EQ("fusion_lstm", 0)); diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 3d65fe595373fa98ba237f04134c75d4a60a7242..9c1eaa99a3ca04ddbeecab639d5587d5509e3f00 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -1892,6 +1892,82 @@ PDNode *patterns::QuantizePlacement::operator()( return op; } +PDNode *patterns::Bfloat16Placement::operator()( + const std::unordered_set &bfloat16_enabled_op_types) { + std::unordered_set supported_op_types = + std::unordered_set(); + if (!bfloat16_enabled_op_types.empty()) { + supported_op_types = bfloat16_enabled_op_types; + } + auto *op = pattern->NewNode(op_repr())->assert_is_ops(supported_op_types); + return op; +} + +PDNode *patterns::OrphanedBfloat16::operator()() { + auto *prev_op = pattern->NewNode(prev_op_repr())->assert_is_op(); + prev_op->assert_more([&](Node *node) { + return node->Op()->GetAttrIfExists("mkldnn_data_type") == + "float32"; + }); + auto *prev_out = pattern->NewNode(prev_out_repr())->AsOutput(); + + auto *op = pattern->NewNode(op_repr())->assert_is_op(); + op->assert_more([&](Node *node) { + return node->Op()->GetAttrIfExists("mkldnn_data_type") == + "bfloat16"; + }); + auto 
*op_out = pattern->NewNode(op_out_repr())->AsOutput(); + + auto *next_op = pattern->NewNode(next_op_repr())->assert_is_op(); + next_op->assert_more([&](Node *node) { + return node->Op()->GetAttrIfExists("mkldnn_data_type") == + "float32"; + }); + + prev_op->LinksTo({prev_out}); + op->LinksFrom({prev_out}).LinksTo({op_out}); + next_op->LinksFrom({op_out}); + return next_op; +} + +PDNode *patterns::LastBfloat16Ops::operator()() { + auto *op = pattern->NewNode(op_repr())->assert_is_op(); + op->assert_more([&](Node *node) { + return node->Op()->GetAttrIfExists("mkldnn_data_type") == + "bfloat16"; + }); + auto *op_out = pattern->NewNode(op_out_repr())->AsOutput(); + + auto *next_op = pattern->NewNode(next_op_repr())->assert_is_op(); + next_op->assert_more([&](Node *node) { + return node->Op()->GetAttrIfExists("mkldnn_data_type") != + "bfloat16"; + }); + + op->LinksTo({op_out}); + next_op->LinksFrom({op_out}); + return next_op; +} + +PDNode *patterns::FirstBfloat16Ops::operator()() { + auto *prev_op = pattern->NewNode(prev_op_repr())->assert_is_op(); + prev_op->assert_more([&](Node *node) { + return node->Op()->GetAttrIfExists("mkldnn_data_type") != + "bfloat16"; + }); + auto *op_in = pattern->NewNode(op_in_repr())->AsOutput(); + + auto *op = pattern->NewNode(op_repr())->assert_is_op(); + op->assert_more([&](Node *node) { + return node->Op()->GetAttrIfExists("mkldnn_data_type") == + "bfloat16"; + }); + + prev_op->LinksTo({op_in}); + op->LinksFrom({op_in}); + return op; +} + PDNode *patterns::MKLDNNInPlace::operator()() { const std::unordered_set &supported_op_types = { "abs", diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 0803265884165bc754489b18d07c0d277a4bd92b..053c1fe832b0088d2abdd3f8eb40a0042e5e2dfe 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -1129,6 +1129,47 @@ struct QuantizePlacement : public PatternBase { 
PATTERN_DECL_NODE(op); }; +struct Bfloat16Placement : public PatternBase { + Bfloat16Placement(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "bfloat16_placement") {} + PDNode* operator()( + const std::unordered_set& bfloat16_enabled_op_types); + + PATTERN_DECL_NODE(op); +}; + +struct OrphanedBfloat16 : public PatternBase { + OrphanedBfloat16(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "orphaned_bfloat16") {} + PDNode* operator()(); + + PATTERN_DECL_NODE(prev_op); + PATTERN_DECL_NODE(prev_out); + PATTERN_DECL_NODE(op); + PATTERN_DECL_NODE(op_out); + PATTERN_DECL_NODE(next_op); +}; + +struct LastBfloat16Ops : public PatternBase { + LastBfloat16Ops(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "last_bfloat16_ops") {} + PDNode* operator()(); + + PATTERN_DECL_NODE(op); + PATTERN_DECL_NODE(op_out); + PATTERN_DECL_NODE(next_op); +}; + +struct FirstBfloat16Ops : public PatternBase { + FirstBfloat16Ops(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "first_bfloat16_ops") {} + PDNode* operator()(); + + PATTERN_DECL_NODE(prev_op); + PATTERN_DECL_NODE(op_in); + PATTERN_DECL_NODE(op); +}; + // Pattern used for enforcing inplace computation for in-place computation // supporting DNNL ops. 
softmax, batch_norm and layer_norm struct MKLDNNInPlace : public PatternBase { diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt b/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt index 726a2d90fcf03c3e2023485e983ea64f93231f73..a8c0973cac488ceb96249a898e819af7565c6c7a 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt +++ b/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt @@ -13,4 +13,6 @@ cc_library(memory_reuse_pass SRCS memory_reuse_pass.cc DEPS computation_op_handl cc_library(buffer_shared_inplace_op_pass SRCS buffer_shared_inplace_op_pass.cc DEPS memory_reuse_pass) cc_library(buffer_shared_cross_op_memory_reuse_pass SRCS buffer_shared_cross_op_memory_reuse_pass.cc DEPS memory_reuse_pass) +cc_library(inplace_addto_op_pass SRCS inplace_addto_op_pass.cc DEPS memory_reuse_pass) + cc_test(test_reference_count_pass_last_lived_ops SRCS test_reference_count_pass_last_lived_ops.cc DEPS parallel_executor elementwise_mul_op elementwise_add_op scale_op) diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc index 0b42f2ebd5555a5c73527d9819ff254411a399d4..ce7f27d27559c70cf164f6bb641fa0ee6f02a2a0 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc @@ -16,6 +16,7 @@ #include #include #include + #include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/share_tensor_buffer_op_handle.h" @@ -141,11 +142,12 @@ void BufferSharedInplaceOpPass::Run(Graph *graph) const { VLOG(4) << "Inplace performed in op " << op_type << ": " << in_var_handle_ptr->Name() << " -> " << out_var_handle_ptr->Name() - << ". 
Debug String is: " << op->GetOp()->DebugString(); + << ". Debug String is: " << op->GetOp()->DebugString() + << ". ReuseType: " << ReuseType(); } else { VLOG(3) << "Inplace failed in op " << op_type << ": " << in_var_handle_ptr->Name() << " -> " - << out_var_handle_ptr->Name(); + << out_var_handle_ptr->Name() << ". ReuseType: " << ReuseType(); } } } diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/inplace_addto_op_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/inplace_addto_op_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..81c63f46bda453ec8705cf4bc93dd9e3acf844ec --- /dev/null +++ b/paddle/fluid/framework/ir/memory_optimize_pass/inplace_addto_op_pass.cc @@ -0,0 +1,221 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include + +#include "paddle/fluid/framework/details/computation_op_handle.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/details/share_tensor_buffer_op_handle.h" +#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h" +#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h" +#include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +class InplaceAddToOpPass : public MemoryReusePass { + protected: + std::string ReuseType() const override { return "inplace_addto"; } + + void Run(Graph *graph) const override; + + private: + // 1. Add last living op of in_var, add any last living op of out_var + // 2. Set reference count of in_var to be 2 + void UpdateLastLiveOpOfVar(details::ComputationOpHandle *op, + details::VarHandle *in_var, + details::VarHandle *out_var) const override { + size_t scope_idx = op->GetScopeIdx(); + auto *last_live_ops_of_vars_ = + &Get>(kLastLiveOpsOfVars); + auto *var_infos_ = &(Get(kMemOptVarInfoMapList)); + auto out_var_op_iter = + (*last_live_ops_of_vars_)[scope_idx].find(out_var->Name()); + + // In Reduce mode, some output variable(gradient of parameter) does not have + // last live ops + details::ComputationOpHandle *last_live_op_of_in_var = nullptr; + if (out_var_op_iter == (*last_live_ops_of_vars_)[scope_idx].end()) { + last_live_op_of_in_var = op; + } else { + PADDLE_ENFORCE_EQ( + out_var_op_iter->second.ops().empty(), false, + platform::errors::InvalidArgument( + "Var(%s)'s last live op should not empty.", out_var->Name())); + last_live_op_of_in_var = *(out_var_op_iter->second.ops().begin()); + } + + auto *last_live_ops_of_in_var = + (*last_live_ops_of_vars_)[scope_idx][in_var->Name()].mutable_ops(); + // last_live_ops_of_in_var->clear(); + 
last_live_ops_of_in_var->insert(last_live_op_of_in_var); + + auto in_var_info_iter = (*var_infos_)[scope_idx].find(in_var->Name()); + PADDLE_ENFORCE_NE( + in_var_info_iter, (*var_infos_)[scope_idx].end(), + platform::errors::NotFound("Cannot find variable %s.", in_var->Name())); + + in_var_info_iter->second->SetRefCnt(2); // before inplace, it is 1 + } +}; + +void InplaceAddToOpPass::Run(Graph *graph) const { + const auto &last_live_ops = + Get>(kLastLiveOpsOfVars); + + bool use_cuda = Get(kUseCuda); + + // Currently, only perform InplaceAddToOpPass on cuda place + if (!use_cuda) { + return; + } + + // Step 1: Build a reverse map of last_live_ops + // i.e.: op -> vars + std::unordered_map> + candidate_ops; + for (auto &each_scope_ops : last_live_ops) { + for (auto &pair : each_scope_ops) { + // If variable has more than 1 last lived ops, this variable cannot + // be inplaced. + if (pair.second.ops().size() != 1) { + continue; + } + + auto *op = *(pair.second.ops().begin()); + const std::string &op_type = op->GetOp()->Type(); + const framework::OpDesc *op_desc = op->Node()->Op(); + PADDLE_ENFORCE_NOT_NULL( + op_desc, platform::errors::NotFound("Op(%s) can not find opdesc.", + op->Name())); + + // only grad op should be processed. + if (op_type != "grad_add") { + continue; + } + + const std::string &var_name = pair.first; + auto in_nodes = this->FindNodesByName(var_name, op->Node()->inputs); + if (in_nodes.size() == 1) { + candidate_ops[op][var_name] = *in_nodes.begin(); + } + VLOG(4) << "Find op " << op_type << " with input(" << var_name + << ") that can do inplace add to"; + } + } + + // Step 2: Check which vars can be inplaced indeed + for (auto &op_vars_pair : candidate_ops) { + auto *op = op_vars_pair.first; + + // The original gradient accumulation is g = sum(g_0, g_1,..., g_n), and it + // could be changed as follws if inplace addto is enabled: + // g_sum_0 = g_0 + // g_sum_1 = grad_add(g_sum_0, g_1) + // g_sum_2 = grad_add(g_sum_1, g_2) + // ... 
+ // g_sum_n = grad_add(g_sum_n-1, g_n) + + // here we will add inplace for each grad_add, for example, for the first + // grad_add, g_sum_0 -> g1, g_sum_1 -> g1, and set grad_add as skipped. + + const std::string &op_type = op->GetOp()->Type(); + + PADDLE_ENFORCE_EQ(op->Node()->inputs.size(), 2, + platform::errors::InvalidArgument( + "The size of inputs of %s should be 2, but got %d", + op_type, op->Node()->inputs.size())); + + PADDLE_ENFORCE_EQ(op->Node()->outputs.size(), 1, + platform::errors::InvalidArgument( + "The size of outputs of %s should be 1, but got %d", + op_type, op->Node()->outputs.size())); + + auto *left_var_ptr = dynamic_cast( + &(op->Node()->inputs[0]->Wrapper())); + auto *right_var_ptr = dynamic_cast( + &(op->Node()->inputs[1]->Wrapper())); + auto *out_var_ptr = dynamic_cast( + &(op->Node()->outputs[0]->Wrapper())); + + if (left_var_ptr == nullptr || right_var_ptr == nullptr || + out_var_ptr == nullptr) { + continue; + } + + // auto *left_generated_op = dynamic_cast( + // left_var_ptr->GeneratedOp()); + + auto *right_generated_op = dynamic_cast( + right_var_ptr->GeneratedOp()); + + auto *out_generated_op = dynamic_cast( + out_var_ptr->GeneratedOp()); + + // NOTE(zhiqiu): currently, only conv2d_grad supports addto strategy + if (right_generated_op->Name() != "conv2d_grad") { + continue; + } + + // NOTE(zhiqiu): Normally, if we inplace a->b, we should let a generated + // before b. However, in the situation of inplace addto, we do not care + // the order, since a+b is equal to b+a. Is there any exception for that? + + // AddDependencyVar(right_generated_op, left_generated_op); + // no need, as discussed above. 
+ + // step (a): inplace right_var->left_var of grad_add + + this->AddReuseVar(right_generated_op, left_var_ptr, right_var_ptr); + UpdateLastLiveOpOfVar(right_generated_op, left_var_ptr, right_var_ptr); + VLOG(4) << "Inplace performed in op " << right_generated_op->GetOp()->Type() + << ": " << left_var_ptr->Name() << " -> " << right_var_ptr->Name() + << ". Debug String is: " + << right_generated_op->GetOp()->DebugString() + << ". ReuseType: " << ReuseType(); + + // step (b): inplace out -> right_var of grad_add + + this->AddReuseVar(out_generated_op, right_var_ptr, out_var_ptr, true); + + VLOG(4) << "Inplace performed in op " << op_type << ": " + << left_var_ptr->Name() << " -> " << out_var_ptr->Name() + << ". Debug String is: " << op->GetOp()->DebugString() + << ". ReuseType: " << ReuseType(); + + // step (c): make right_var cannot inplace afterwards. canbe done + // aotomatically since CollectReusedVars is called before any reuse. + + // step (d): make right_var's generated op use addto + right_generated_op->GetOp()->SetAttr("use_addto", true); + + // step (e): make grad_add skip running + op->SetSkipRunning(true); + } +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(inplace_addto_op_pass, paddle::framework::ir::InplaceAddToOpPass) + .RequirePassAttr(paddle::framework::ir::kMemOptVarInfoMapList) + .RequirePassAttr(paddle::framework::ir::kLastLiveOpsOfVars) + .RequirePassAttr(paddle::framework::ir::kUseCuda); diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc index 221b0a76e7ef5b01d87c63fb466a9b980f1e69b4..3e3b9864a7b408267ac73de053c1692628e9a14c 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h" + #include #include #include @@ -73,6 +74,7 @@ bool MemoryReusePass::TryReuseVar(details::VarHandle *in_var, out_var->Name())); if (IsVarPairReusable(*in_var, *out_var)) { AddReuseVar(op, in_var, out_var); + UpdateLastLiveOpOfVar(op, in_var, out_var); return true; } else { return false; @@ -324,7 +326,8 @@ bool MemoryReusePass::IsVarPairReusable( void MemoryReusePass::AddReuseVar(details::ComputationOpHandle *op, details::VarHandle *in_var, - details::VarHandle *out_var) const { + details::VarHandle *out_var, + bool share_dims) const { PADDLE_ENFORCE_GT( (*var_infos_)[op->GetScopeIdx()].count(in_var->Name()), 0, platform::errors::NotFound("Var(%s) does not in mem opt var infos.", @@ -344,13 +347,15 @@ void MemoryReusePass::AddReuseVar(details::ComputationOpHandle *op, share_buffer_op->AddInput(in_var); } + if (share_dims) { + share_buffer_op->SetShareDims(true); + } + share_buffer_op->AddReuseVarPair( (*var_infos_)[op->GetScopeIdx()].at(in_var->Name()).get(), out_var->Name()); reused_in_var_names_[op->GetScopeIdx()].insert(in_var->Name()); reused_out_var_names_[op->GetScopeIdx()].insert(out_var->Name()); - - UpdateLastLiveOpOfVar(op, in_var, out_var); } // 1. 
Set last living op of in_var to be any last living op of out_var diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h b/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h index 822744191847586dc429b6896ff6f490381c5901..1c0c6ae60205b14f97bd15bceeb126d0eb54f654 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h +++ b/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h @@ -18,6 +18,7 @@ #include #include #include + #include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/share_tensor_buffer_op_handle.h" @@ -92,6 +93,12 @@ class MemoryReusePass : public Pass { int64_t GetMemorySize(const details::VarHandle &var) const; + void AddReuseVar(details::ComputationOpHandle *op, details::VarHandle *in_var, + details::VarHandle *out_var, bool share_dims = false) const; + virtual void UpdateLastLiveOpOfVar(details::ComputationOpHandle *op, + details::VarHandle *in_var, + details::VarHandle *out_var) const; + private: VarDesc *GetVarDesc(const details::VarHandle &var) const; @@ -109,13 +116,6 @@ class MemoryReusePass : public Pass { void CollectReusedVars() const; - void AddReuseVar(details::ComputationOpHandle *op, details::VarHandle *in_var, - details::VarHandle *out_var) const; - - void UpdateLastLiveOpOfVar(details::ComputationOpHandle *op, - details::VarHandle *in_var, - details::VarHandle *out_var) const; - private: mutable Graph *graph_; mutable bool use_cuda_; diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc index 82e0af3c198750296032769f2f3b04658871adb7..f7a8e3e3f6c3c77e978c57eeb7515d8cfce86471 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc @@ -17,6 +17,7 @@ #include #include #include 
"paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -84,6 +85,19 @@ void ConvBiasFusePass::ApplyImpl(ir::Graph* graph) const { VLOG(3) << "do not perform " + type() + "+bias fuse"; return; } + if (conv->Op()->HasAttr("dilations")) { + auto dilations = + BOOST_GET_CONST(std::vector, conv->Op()->GetAttr("dilations")); + for (const auto& d : dilations) { + if (d != 1) { + LOG(WARNING) + << "dilation conv not supported in MKLDNN, fuse not apply " + << "and set conv attribute use_mkldnn = false"; + conv->Op()->SetAttr("use_mkldnn", false); + return; + } + } + } auto* eltwise_bias_tensor = scope->FindVar(eltwise_bias->Name())->GetMutable(); @@ -151,3 +165,8 @@ REGISTER_PASS(conv_transpose_bias_mkldnn_fuse_pass, paddle::framework::ir::Conv2DTransposeBiasFusePass); REGISTER_PASS(conv3d_bias_mkldnn_fuse_pass, paddle::framework::ir::Conv3DBiasFusePass); +REGISTER_PASS_CAPABILITY(conv_bias_mkldnn_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("conv2d", 0) + .EQ("elementwise_add", 0)); diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc index 88aac001a93ae836d62fe3bf3fc502960eebe70f..455350d2f703c52a9ef3e5714a60573408310080 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc @@ -18,6 +18,7 @@ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/imperative/type_defs.h" namespace paddle { @@ -149,6 +150,12 @@ TEST(ConvBiasFusePass, conv2d_transpose) { ASSERT_EQ(pass.type(), std::string("conv2d_transpose")); } +TEST(ConvBiasFusePass, pass_op_version_check) { + ASSERT_TRUE( + 
paddle::framework::compatible::PassVersionCheckerRegistrar::GetInstance() + .IsPassCompatible("conv_bias_mkldnn_fuse_pass")); +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc index af2b1308e084ee937f26bf90caf2df6fb44e044b..2fb131aceaad28a365e8202dca35cfe53f8f54da 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc @@ -19,6 +19,7 @@ #include #include #include "paddle/fluid/framework/ir/graph_traits.h" +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace framework { @@ -341,3 +342,8 @@ void ResidualConnectionMKLDNNFusePass::ApplyImpl(graph_ptr graph) const { REGISTER_PASS(conv_elementwise_add_mkldnn_fuse_pass, paddle::framework::ir::ResidualConnectionMKLDNNFusePass); +REGISTER_PASS_CAPABILITY(conv_elementwise_add_mkldnn_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("conv2d", 0) + .EQ("elementwise_add", 0)); diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc index 8a13596cd50087475bf12b6cfa5920b82e24de31..fd4910fc8e95cd98fe9feaba51e70d5a143ad443 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc @@ -17,6 +17,7 @@ #include "paddle/fluid/framework/ir/graph_traits.h" #include "paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h" +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace framework { @@ -267,6 +268,12 @@ TEST(ConvElementwiseAddMKLDNNFusePass, NoFusion) { AssertOpsCount(graph, 
2, 1); } +TEST(ConvElementwiseAddMKLDNNFusePass, pass_op_version_check) { + ASSERT_TRUE( + paddle::framework::compatible::PassVersionCheckerRegistrar::GetInstance() + .IsPassCompatible("conv_elementwise_add_mkldnn_fuse_pass")); +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..df498865245fc8054f9521026e0b5cd6906b136f --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc @@ -0,0 +1,159 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.h" + +#include +#include +#include + +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/string/pretty_log.h" + +namespace paddle { +namespace framework { +namespace ir { + +using string::PrettyLogDetail; + +void UnlinkNodes(ir::Node* a, ir::Node* b) { + a->outputs.erase(std::remove(a->outputs.begin(), a->outputs.end(), b), + a->outputs.end()); + b->inputs.erase(std::remove(b->inputs.begin(), b->inputs.end(), a), + b->inputs.end()); +} + +void CPUBFloat16Pass::SetInputDataType(ir::Graph* graph) const { + GraphPatternDetector gpd; + patterns::FirstBfloat16Ops bfloat16_ops{gpd.mutable_pattern(), + "first_bfloat16_ops"}; + bfloat16_ops(); + int quantize_counter = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(prev_op, prev_op, bfloat16_ops); + GET_IR_NODE_FROM_SUBGRAPH(op_in, op_in, bfloat16_ops); + GET_IR_NODE_FROM_SUBGRAPH(op, op, bfloat16_ops); + + if (op->Op()->Type() != "conv2d" && prev_op->Op()->Type() != "quantize") { + VarDesc quantize_out_desc(patterns::PDNodeName("quantize", "out")); + auto* quantize_out_node = g->CreateVarNode(&quantize_out_desc); + + // create a quantize op node + OpDesc q_desc; + q_desc.SetType("quantize"); + q_desc.SetInput("Input", std::vector({op_in->Name()})); + q_desc.SetOutput("Output", + std::vector({quantize_out_node->Name()})); + q_desc.SetAttr("Scale", 1.f); + q_desc.SetAttr("bfloat16", true); + q_desc.SetAttr("output_format", Has("data_layout") + ? Get("data_layout") + : "NCHW"); + auto quantize_op = g->CreateOpNode(&q_desc); // OpDesc will be copied. 
+ + std::string op_input_name; + for (auto name : op->Op()->InputNames()) { + for (auto input_name : op->Op()->Input(name)) { + if (input_name == op_in->Name()) op_input_name = name; + } + } + + PADDLE_ENFORCE_NE( + op_input_name.empty(), true, + platform::errors::NotFound( + "Operator before operator should have input as op output")); + + op->Op()->SetInput(op_input_name, + std::vector({quantize_out_node->Name()})); + + UnlinkNodes(op_in, op); + IR_NODE_LINK_TO(op_in, quantize_op); + IR_NODE_LINK_TO(quantize_op, quantize_out_node); + IR_NODE_LINK_TO(quantize_out_node, op); + quantize_counter++; + } + }; + gpd(graph, handler); + PrettyLogDetail("--- added %d quantize op before bfloat16 op", + quantize_counter); +} + +void CPUBFloat16Pass::SetOutputDataType(ir::Graph* graph) const { + GraphPatternDetector gpd; + patterns::LastBfloat16Ops bfloat16_ops{gpd.mutable_pattern(), + "last_bfloat16_ops"}; + bfloat16_ops(); + int force_fp32_counter = 0, dequantize_counter = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(op, op, bfloat16_ops); + GET_IR_NODE_FROM_SUBGRAPH(op_out, op_out, bfloat16_ops); + GET_IR_NODE_FROM_SUBGRAPH(next_op, next_op, bfloat16_ops); + + if ((op->Op()->HasAttr("force_fp32_output") || + op->Op()->HasProtoAttr("force_fp32_output")) && + !op->Op()->GetAttrIfExists("fuse_residual_connection")) { + op->Op()->SetAttr("force_fp32_output", true); + force_fp32_counter++; + } else if (op->Op()->Type() != "prior_box") { + // Create dequantize input variable + VarDesc dequantize_in_desc(patterns::PDNodeName("dequantize", "in")); + auto* dequantize_in_node = g->CreateVarNode(&dequantize_in_desc); + + // create a dequantize op node for output. 
+ OpDesc deq_desc; + deq_desc.SetType("dequantize"); + deq_desc.SetInput("Input", + std::vector({dequantize_in_node->Name()})); + deq_desc.SetOutput("Output", std::vector({op_out->Name()})); + deq_desc.SetAttr("Scale", 1.0f); + auto dequantize_op = g->CreateOpNode(&deq_desc); + + std::string op_output_name; + for (auto name : op->Op()->OutputNames()) { + for (auto output_name : op->Op()->Output(name)) { + if (output_name == op_out->Name()) op_output_name = name; + } + } + + PADDLE_ENFORCE_NE( + op_output_name.empty(), true, + platform::errors::NotFound( + "Operator after operator should have input as op output")); + + op->Op()->SetOutput(op_output_name, std::vector( + {dequantize_in_node->Name()})); + + UnlinkNodes(op, op_out); + IR_NODE_LINK_TO(op, dequantize_in_node); + IR_NODE_LINK_TO(dequantize_in_node, dequantize_op); + IR_NODE_LINK_TO(dequantize_op, op_out); + dequantize_counter++; + } + }; + gpd(graph, handler); + PrettyLogDetail("--- added %d dequantize op and used %d force_fp32_output", + dequantize_counter, force_fp32_counter); +} + +void CPUBFloat16Pass::ApplyImpl(ir::Graph* graph) const { + SetInputDataType(graph); + SetOutputDataType(graph); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(cpu_bfloat16_pass, paddle::framework::ir::CPUBFloat16Pass); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..3a7271f7ddc59a2bdcab8457bc34d5c5c6397268 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.h @@ -0,0 +1,34 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +class CPUBFloat16Pass : public Pass { + protected: + void SetInputDataType(ir::Graph* graph) const; + void SetOutputDataType(ir::Graph* graph) const; + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..15109db98321343e73fb0c3839e4f7ddf2490948 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass_tester.cc @@ -0,0 +1,145 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include "paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.h" +#include "paddle/fluid/framework/naive_executor.h" +#include "paddle/fluid/imperative/type_defs.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace framework { +namespace ir { + +void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, + const std::vector& inputs, + const std::vector& outputs, bool use_mkldnn, + const std::string& mkldnn_data_type = "float32", + const bool force_fp32_output = false) { + auto* op = prog->MutableBlock(0)->AppendOp(); + op->SetType(type); + op->SetAttr("use_mkldnn", use_mkldnn); + op->SetAttr("name", name); + + if (type == "conv2d") { + op->SetInput("Input", {inputs[0]}); + op->SetOutput("Output", {outputs[0]}); + op->SetAttr("mkldnn_data_type", mkldnn_data_type); + op->SetAttr("force_fp32_output", force_fp32_output); + } else if (type == "pool2d" || type == "transpose2" || type == "reshape2" || + type == "dropout") { + op->SetInput("X", {inputs[0]}); + op->SetOutput("Out", {outputs[0]}); + op->SetAttr("mkldnn_data_type", mkldnn_data_type); + } else if (type == "fc") { + op->SetInput("Input", {inputs[0]}); + op->SetOutput("Out", {outputs[0]}); + op->SetAttr("mkldnn_data_type", mkldnn_data_type); + } else if (type == "concat") { + op->SetInput("X", inputs); + op->SetOutput("Out", outputs); + op->SetAttr("mkldnn_data_type", mkldnn_data_type); + } else if (type == "matmul" || type == "elementwise_add") { + op->SetInput("X", {inputs[0]}); + if (inputs.size() > 1) op->SetInput("Y", {inputs[1]}); + op->SetOutput("Out", {outputs[0]}); + op->SetAttr("mkldnn_data_type", mkldnn_data_type); + } +} + +void PreparePass(std::unique_ptr* graph, const ProgramDesc& prog, + const std::initializer_list variable_names, + int* original_nodes_num, int* current_nodes_num) { + auto pass = PassRegistry::Instance().Get("cpu_bfloat16_pass"); + + graph->reset(pass->Apply(graph->release())); + + *original_nodes_num = 
(*graph)->Nodes().size(); + (*graph).reset(pass->Apply((*graph).release())); + *current_nodes_num = (*graph)->Nodes().size(); +} + +static const std::initializer_list variable_names{ + "z", "a", "b", "c", "d", "e", "f", "g", "h", "i"}; + +ProgramDesc BuildProgramDesc(bool use_mkldnn) { + ProgramDesc prog; + for (auto& v : variable_names) { + prog.MutableBlock(0)->Var(v); + } + SetOp(&prog, "dropout", "Dropout1", {"z"}, {"a"}, use_mkldnn, "float32"); + SetOp(&prog, "conv2d", "Conv1", {"a"}, {"b"}, use_mkldnn, "bfloat16"); + SetOp(&prog, "pool2d", "Pool1", {"b"}, {"c"}, use_mkldnn, "bfloat16"); + SetOp(&prog, "conv2d", "Conv1", {"c"}, {"d"}, use_mkldnn, "bfloat16"); + SetOp(&prog, "dropout", "Dropout2", {"d"}, {"e"}, use_mkldnn, "float32"); + SetOp(&prog, "transpose2", "Transpose1", {"e"}, {"f"}, use_mkldnn, + "bfloat16"); + SetOp(&prog, "reshape2", "Reshape1", {"f"}, {"g"}, use_mkldnn, "bfloat16"); + SetOp(&prog, "concat", "Concat1", {"g"}, {"h"}, use_mkldnn, "bfloat16"); + SetOp(&prog, "dropout", "Dropout3", {"h"}, {"i"}, use_mkldnn, "float32"); + + return prog; +} + +void MainTest(const ProgramDesc& prog, int conv_count, int pool_count, + int transpose_count, int quant_count, int dequant_count, + int added_nodes_count) { + std::unique_ptr graph(new ir::Graph(prog)); + int original_nodes_num, current_nodes_num; + PreparePass(&graph, prog, variable_names, &original_nodes_num, + ¤t_nodes_num); + + int quantize_nodes_count = 0; + int dequantize_nodes_count = 0; + int conv2d_nodes_count = 0; + int pool2d_nodes_count = 0; + int transpose2_nodes_count = 0; + + for (auto* node : graph->Nodes()) { + if (node->IsOp()) { + auto* op = node->Op(); + if (op->Type() == "conv2d") { + conv2d_nodes_count++; + } else if (op->Type() == "pool2d") { + pool2d_nodes_count++; + } else if (op->Type() == "transpose2") { + transpose2_nodes_count++; + } else if (op->Type() == "quantize") { + quantize_nodes_count++; + } else if (op->Type() == "dequantize") { + dequantize_nodes_count++; + } + } 
+ } + EXPECT_EQ(conv2d_nodes_count, conv_count); + EXPECT_EQ(pool2d_nodes_count, pool_count); + EXPECT_EQ(transpose2_nodes_count, transpose_count); + EXPECT_EQ(quantize_nodes_count, quant_count); + EXPECT_EQ(dequantize_nodes_count, dequant_count); + EXPECT_EQ(original_nodes_num + added_nodes_count, current_nodes_num); +} + +TEST(CpuQuantizePass, quantize) { + bool use_mkldnn = true; + // 1 quantize + 1 dequantize + int added_nodes = 2; + MainTest(BuildProgramDesc(use_mkldnn), 2, 1, 1, 1, 2, added_nodes); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(cpu_bfloat16_pass); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..3d7a9c1107bbaac04a3a478014520a9b340b1d5f --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc @@ -0,0 +1,91 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.h" + +#include +#include + +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/string/pretty_log.h" + +namespace paddle { +namespace framework { +namespace ir { + +using string::PrettyLogDetail; + +void CPUBfloat16PlacementPass::SetMkldnnDataType( + ir::Graph* graph, int* bfloat16_operators) const { + const auto& op_types_list = + Get>("bfloat16_enabled_op_types"); + // set mkldnn_data_type to bfloat16 to all operators that are in + // bfloat16_enabled_op_types vector or they are included to Bfloat16Placement + // pattern + GraphPatternDetector gpd; + patterns::Bfloat16Placement bfloat16_placement_pattern{gpd.mutable_pattern(), + "bfloat16_placement"}; + bfloat16_placement_pattern(op_types_list); + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(op, op, bfloat16_placement_pattern); + + if ((op->Op()->HasAttr("mkldnn_data_type") || + op->Op()->HasProtoAttr("mkldnn_data_type")) && + !platform::HasOpINT8DataType(op->Op())) { + op->Op()->SetAttr("mkldnn_data_type", std::string("bfloat16")); + (*bfloat16_operators)++; + } + }; + gpd(graph, handler); +} + +void CPUBfloat16PlacementPass::RemoveOrhanedOperators( + ir::Graph* graph, int* bfloat16_operators) const { + // find orphaned bfloat16 operator that is between two float32 operators + // revert mkldnn_data_type attr to float32 + GraphPatternDetector gpd; + patterns::OrphanedBfloat16 orphaned_bfloat16_pattern{gpd.mutable_pattern(), + "orphaned_bfloat16"}; + orphaned_bfloat16_pattern(); + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(op, op, orphaned_bfloat16_pattern); + + op->Op()->SetAttr("mkldnn_data_type", std::string("float32")); + bfloat16_operators--; + }; + gpd(graph, handler); +} + +void 
CPUBfloat16PlacementPass::ApplyImpl(ir::Graph* graph) const { + int bfloat16_operators = 0; + SetMkldnnDataType(graph, &bfloat16_operators); + RemoveOrhanedOperators(graph, &bfloat16_operators); + PrettyLogDetail("--- marked %d operators to bfloat16 ", + bfloat16_operators); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(cpu_bfloat16_placement_pass, + paddle::framework::ir::CPUBfloat16PlacementPass) + // a vector of operator type names with bfloat16 support ("conv2d" etc.) + // the second param is the default value for this vector + .DefaultPassAttr("bfloat16_enabled_op_types", + new std::unordered_set()); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..1911b1a3cb32a6a23585e8240c462aa84e8d869b --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.h @@ -0,0 +1,38 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { +/* + * Specifies which operators should be run on bfloat16. 
+ */ +class CPUBfloat16PlacementPass : public Pass { + protected: + void SetMkldnnDataType(ir::Graph* graph, int* bfloat16_operators) const; + + void RemoveOrhanedOperators(ir::Graph* graph, int* bfloat16_operators) const; + + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..b9797a4bfcc0048083e059cb003746e3278a039b --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass_tester.cc @@ -0,0 +1,132 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include "paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.h" +#include "paddle/fluid/platform/mkldnn_helper.h" + +namespace paddle { +namespace framework { +namespace ir { + +void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, + const std::vector& inputs, + const std::vector& outputs, + const std::string& mkldnn_data_type = "float32") { + auto* op = prog->MutableBlock(0)->AppendOp(); + + op->SetType(type); + op->SetAttr("mkldnn_data_type", mkldnn_data_type); + + if (type == "conv2d") { + op->SetAttr("name", name); + op->SetInput("Input", {inputs[0]}); + } else if (type == "relu") { + op->SetInput("X", inputs); + } else if (type == "concat") { + op->SetAttr("axis", 1); + op->SetInput("X", {inputs[0], inputs[1]}); + } else if (type == "pool2d") { + op->SetInput("X", {inputs[0]}); + } else { + FAIL() << "Unexpected operator type."; + } + op->SetOutput("Out", {outputs[0]}); +} + +// operator mkldnn_data_type +// --------------------------------------- +// (a,b)->concat->c float32 +// c->conv->f float32 +// f->relu->g float32 +// g->pool->h float32 +// h->conv->k float32 +// k->pool->l float32 +ProgramDesc BuildProgramDesc() { + ProgramDesc prog; + + for (auto& v : + std::vector({"a", "b", "c", "f", "g", "h", "k", "l"})) { + prog.MutableBlock(0)->Var(v); + } + + SetOp(&prog, "concat", "concat1", {"a", "b"}, {"c"}); + SetOp(&prog, "conv2d", "conv1", {"c"}, {"f"}); + SetOp(&prog, "relu", "relu1", {"f"}, {"g"}); + SetOp(&prog, "pool2d", "pool1", {"g"}, {"h"}); + SetOp(&prog, "conv2d", "conv2", {"h"}, {"k"}); + SetOp(&prog, "pool2d", "pool2", {"k"}, {"l"}); + + return prog; +} + +void MainTest(std::initializer_list bfloat16_enabled_op_types, + unsigned expected_bfloat16_data_type_count) { + auto prog = BuildProgramDesc(); + + std::unique_ptr graph(new ir::Graph(prog)); + + auto pass = PassRegistry::Instance().Get("cpu_bfloat16_placement_pass"); + pass->Set("bfloat16_enabled_op_types", + new 
std::unordered_set(bfloat16_enabled_op_types)); + + graph.reset(pass->Apply(graph.release())); + + unsigned bfloat16_data_type_count = 0; + + for (auto* node : graph->Nodes()) { + if (node->IsOp()) { + if (platform::HasOpBFLOAT16DataType(node->Op())) { + ++bfloat16_data_type_count; + } + } + } + + EXPECT_EQ(bfloat16_data_type_count, expected_bfloat16_data_type_count); +} + +void DefaultAttrTest(unsigned expected_bfloat16_data_type_count) { + auto prog = BuildProgramDesc(); + std::unique_ptr graph(new ir::Graph(prog)); + auto pass = PassRegistry::Instance().Get("cpu_bfloat16_placement_pass"); + graph.reset(pass->Apply(graph.release())); + + unsigned bfloat16_data_type_count = 0; + for (auto* node : graph->Nodes()) { + if (node->IsOp()) { + if (platform::HasOpBFLOAT16DataType(node->Op())) { + ++bfloat16_data_type_count; + } + } + } + EXPECT_EQ(bfloat16_data_type_count, expected_bfloat16_data_type_count); +} + +TEST(Bfloat16PlacementPass, enable_all) { + MainTest({"conv2d", "pool2d", "relu", "concat"}, 6); +} + +TEST(Bfloat16PlacementPass, enabled_conv_and_pool) { + // 2 conv2d + 2 pool2 - 1 orphaned conv2d + MainTest({"conv2d", "pool2d"}, 3); +} + +TEST(Bfloat16PlacementPass, default_attr_value) { DefaultAttrTest(0); } + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(cpu_bfloat16_placement_pass); diff --git a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc index c5965701a53d4312d89f1e09f17840b09f1bd5f5..df5ba3314e637fefe930d4c45f431314dd7d8493 100644 --- a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace framework { @@ -57,3 +58,7 @@ void DepthwiseConvMKLDNNPass::ApplyImpl(ir::Graph* graph) const { REGISTER_PASS(depthwise_conv_mkldnn_pass, paddle::framework::ir::DepthwiseConvMKLDNNPass); +REGISTER_PASS_CAPABILITY(depthwise_conv_mkldnn_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination().EQ( + "depthwise_conv2d", 0)); diff --git a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc index a37565236cd440bd803184d038ad4deb3c0b6150..c6c72ba33d6295d90c502ab88d7d712d76a11aad 100644 --- a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc @@ -16,6 +16,8 @@ #include +#include "paddle/fluid/framework/op_version_registry.h" + namespace paddle { namespace framework { namespace ir { @@ -70,6 +72,12 @@ ProgramDesc BuildProgramDesc() { return prog; } +TEST(DepthwiseConvMKLDNNPass, pass_op_version_check) { + ASSERT_TRUE( + paddle::framework::compatible::PassVersionCheckerRegistrar::GetInstance() + .IsPassCompatible("depthwise_conv_mkldnn_pass")); +} + TEST(DepthwiseConvMKLDNNPass, basic) { auto prog = BuildProgramDesc(); diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc index 40e01c75bb99157aedccd0692d7410b99393c009..9d2b4ebaf8ccf33e175e46c08657e7eeed467055 100644 --- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc @@ -19,6 +19,7 @@ #include #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_version_registry.h" #include 
"paddle/fluid/platform/errors.h" namespace paddle { @@ -615,6 +616,16 @@ static int BuildFusionV2(Graph* graph, const std::string& name_scope, GET_IR_NODE_FROM_SUBGRAPH(transpose2_qkv_out, transpose2_qkv_out, multihead_pattern); + // If weights or biases in qkv's fc are shared by multiple multihead_matmul + // patterns, we do not support this kind of fusion, this pass will not take + // effect. + bool is_fc_params_shared = + mul0_w->outputs.size() > 1 || mul1_w->outputs.size() > 1 || + mul2_w->outputs.size() > 1 || eltadd0_b->outputs.size() > 1 || + eltadd1_b->outputs.size() > 1 || eltadd2_b->outputs.size() > 1; + if (is_fc_params_shared) { + return; + } fuse_creater(input0, mul0, mul1, mul2, mul0_out, mul1_out, mul2_out, mul0_w, mul1_w, mul2_w, eltadd0_b, eltadd1_b, eltadd2_b, eltadd_qk_b, reshape2_0, reshape2_qkv_out, scale, scale_out); @@ -697,3 +708,13 @@ REGISTER_PASS(multihead_matmul_fuse_pass, REGISTER_PASS(multihead_matmul_fuse_pass_v2, paddle::framework::ir::MultiHeadMatmulV2FusePass); +REGISTER_PASS_CAPABILITY(multihead_matmul_fuse_pass_v2) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("mul", 0) + .EQ("elementwise_add", 0) + .EQ("reshape2", 0) + .EQ("transpose2", 0) + .EQ("scale", 0) + .EQ("matmul", 0) + .EQ("softmax", 0)); diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass_tester.cc b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass_tester.cc index d8a06b037bdefbe8776c9b95b36be80afb988393..2eda643d4e53aa061908f02c9d31b765241c318b 100644 --- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass_tester.cc @@ -12,6 +12,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h" // NOLINT #include #include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace framework { @@ -133,6 +134,12 @@ TEST(MultiHeadMatmulFusePass, basic) { num_fused_nodes_after)); } +TEST(MultiHeadMatmulFusePass, pass_op_version_check) { + ASSERT_TRUE( + paddle::framework::compatible::PassVersionCheckerRegistrar::GetInstance() + .IsPassCompatible("multihead_matmul_fuse_pass_v2")); +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc index 2396a7f3c4f84f70c2f350e2121c4044c56b141a..9f6032ffa5b87daece107ad6bd3d5f9444719e44 100644 --- a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc @@ -18,6 +18,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_version_registry.h" #define MAX_NUM_FC 10 @@ -174,6 +175,11 @@ void BuildRepeatedFCReluPattern(PDPattern* pattern, if (x->outputs.size() <= 0 || x->inputs.size() <= 0U) { return false; } + if (x->IsVar() && x->Var() && x->Var()->GetShape().size() > 2) { + VLOG(3) << "repeated fc relu only supports input dims = 2, so it " + "is not applied."; + return false; + } int fc_idx = FindFCIdx(x); if (fc_idx < 0) { return false; @@ -384,3 +390,8 @@ void RepeatedFCReluFusePass::ApplyImpl(ir::Graph* graph) const { REGISTER_PASS(repeated_fc_relu_fuse_pass, paddle::framework::ir::RepeatedFCReluFusePass); +REGISTER_PASS_CAPABILITY(repeated_fc_relu_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("fc", 0) + .EQ("relu", 0)); diff --git a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc 
b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc index 1485a84d001acef8542a9dda5436cfeb57518d69..75ab04f1b9130dccd42cea39dc0e074e2e2838eb 100644 --- a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc @@ -16,6 +16,7 @@ #include #include #include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace framework { @@ -98,3 +99,9 @@ void SeqConvEltAddReluFusePass::ApplyImpl(ir::Graph* graph) const { REGISTER_PASS(seqconv_eltadd_relu_fuse_pass, paddle::framework::ir::SeqConvEltAddReluFusePass); +REGISTER_PASS_CAPABILITY(seqconv_eltadd_relu_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("sequence_conv", 0) + .EQ("elementwise_add", 0) + .EQ("relu", 0)); diff --git a/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc b/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc index d9a65e71592ff464a2e6beaa2219a39103f6cae1..8bdf3940928c768fc7b0a9c7fa3d084d95f60859 100644 --- a/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc +++ b/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc @@ -16,6 +16,7 @@ #include "paddle/fluid/framework/ir/graph_viz_pass.h" #include "paddle/fluid/framework/ir/shuffle_channel_detect_pass.h" +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace framework { @@ -82,6 +83,9 @@ void ShuffleChannelDetectPass::ApplyImpl(ir::Graph* graph) const { // Delete the unneeded nodes. 
GraphSafeRemoveNodes(graph, {reshape1_op, reshape1_out, transpose_op, transpose_out, reshape2_op}); + LOG_FIRST_N(WARNING, 1) + << "There is fluid.layers.shuffle_channel API already, maybe you can " + "use it instead of (reshape + transpose + reshape)"; }; gpd(graph, handler); @@ -93,3 +97,8 @@ void ShuffleChannelDetectPass::ApplyImpl(ir::Graph* graph) const { REGISTER_PASS(shuffle_channel_detect_pass, paddle::framework::ir::ShuffleChannelDetectPass); +REGISTER_PASS_CAPABILITY(shuffle_channel_detect_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("reshape2", 0) + .EQ("transpose2", 0)); diff --git a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc index 9dddc9154f8fc39144b38535824999b933a92106..2e3cd16d5ce49fdd6186f98c72d77c75c4053559 100644 --- a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace framework { @@ -180,3 +181,8 @@ void SkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const { REGISTER_PASS(skip_layernorm_fuse_pass, paddle::framework::ir::SkipLayerNormFusePass); +REGISTER_PASS_CAPABILITY(skip_layernorm_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("elementwise_add", 0) + .EQ("layer_norm", 0)); diff --git a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass_tester.cc b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass_tester.cc index d2d7469872857a070294520a589fee4ca383f065..eff5dcddf54ee49be5b14a7bdfa609079f925036 100644 --- a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass_tester.cc @@ -16,6 +16,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace framework { @@ -54,6 +55,12 @@ TEST(SkipLayerNormFusePass, basic) { "The number of fusion nodes does not meet expectations after fuse")); } +TEST(SkipLayerNormFusePass, pass_op_version_check) { + ASSERT_TRUE( + paddle::framework::compatible::PassVersionCheckerRegistrar::GetInstance() + .IsPassCompatible("skip_layernorm_fuse_pass")); +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc index 035b198bdcc51800be62acce58a538145413e92f..d74843611cdd238f1fb78153e6b946ae8a1c8473 100644 --- a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc +++ b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc @@ -17,6 +17,7 @@ #include #include #include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace framework { @@ -77,7 +78,8 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern, }; auto is_fusion_input_var = [=](Node* x, const std::string& arg_name) { - bool basic = var_is_op_input(x, "matmul", arg_name) && + bool basic = (var_is_op_input(x, "matmul_v2", arg_name) || + var_is_op_input(x, "matmul", arg_name)) && var_is_op_input(x, "square", "X"); if (!basic) { return false; @@ -88,7 +90,8 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern, } auto* squared_x = squared_x_op->outputs[0]; bool next_is_matmul_from_arg = - var_is_op_input(squared_x, "matmul", arg_name) && + (var_is_op_input(squared_x, "matmul_v2", arg_name) || + var_is_op_input(squared_x, "matmul", arg_name)) && squared_x->outputs.size() == 1 && squared_x->outputs[0]->outputs.size() == 1; if (!next_is_matmul_from_arg) { @@ -103,7 +106,8 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern, auto is_fusion_first_mul_out = [=](Node* x) -> 
bool { bool input_is_matmul_op = x && x->inputs.size() == 1 && x->inputs[0]->IsOp() && - x->inputs[0]->Op()->Type() == "matmul"; + (x->inputs[0]->Op()->Type() == "matmul_v2" || + x->inputs[0]->Op()->Type() == "matmul"); if (!input_is_matmul_op) { return false; } @@ -167,7 +171,8 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern, auto* matmul_xy_op = pattern->NewNode( [=](Node* x) { - return x && x->IsOp() && x->Op()->Type() == "matmul" && + return x && x->IsOp() && (x->Op()->Type() == "matmul_v2" || + x->Op()->Type() == "matmul") && is_fusion_first_mul_out(x->outputs[0]); }, name_scope + "/matmul_xy_op"); @@ -189,7 +194,9 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern, auto is_fusion_mat_squared_x_y_op_out = [=](Node* x) -> bool { bool basic = x && x->IsVar() && x->inputs.size() == 1 && - x->inputs[0]->IsOp() && x->inputs[0]->Op()->Type() == "matmul"; + x->inputs[0]->IsOp() && + (x->inputs[0]->Op()->Type() == "matmul_v2" || + x->inputs[0]->Op()->Type() == "matmul"); if (!basic) { return false; } @@ -206,7 +213,8 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern, auto* matmul_squared_x_y_op = pattern->NewNode( [=](Node* x) { - return x && x->IsOp() && x->Op()->Type() == "matmul" && + return x && x->IsOp() && (x->Op()->Type() == "matmul_v2" || + x->Op()->Type() == "matmul") && is_fusion_mat_squared_x_y_op_out(x->outputs[0]); }, name_scope + "/matmul_squared_x_y_op"); @@ -378,3 +386,13 @@ void SquaredMatSubFusePass::ApplyImpl(ir::Graph* graph) const { REGISTER_PASS(squared_mat_sub_fuse_pass, paddle::framework::ir::SquaredMatSubFusePass); +REGISTER_PASS_CAPABILITY(squared_mat_sub_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("matmul", 0) + .EQ("matmul_v2", 0) + .EQ("square", 0) + .EQ("elementwise_mul", 0) + .EQ("elementwise_sub", 0) + .EQ("fill_constant", 0) + .EQ("fusion_squared_mat_sub", 0)); diff --git a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h 
b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h index b6165a512acdb9b6e3bdbf49196692ef83edb58f..56b7ec9b84314bd3634c406c31e20dd421f7fa92 100644 --- a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h +++ b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h @@ -24,7 +24,7 @@ namespace framework { namespace ir { /** - * Fuse ( (A.^2 * B.^2) - (A * B).^2 ) .* scalar + * Fuse ( (A * B).^2 - (A.^2 * B.^2) ) .* scalar */ class SquaredMatSubFusePass : public FusePassBase { public: diff --git a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc index 9a0a5f07a7080593d8f13e07788c703edb92c7ad..405cefa99ebbbe147fc96f63567e13607732780e 100644 --- a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc +++ b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc @@ -20,6 +20,7 @@ #include "paddle/fluid/framework/ir/graph_viz_pass.h" #include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h" +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace framework { @@ -145,3 +146,11 @@ void TransposeFlattenConcatFusePass::ApplyImpl(ir::Graph *graph) const { REGISTER_PASS(transpose_flatten_concat_fuse_pass, paddle::framework::ir::TransposeFlattenConcatFusePass); +REGISTER_PASS_CAPABILITY(transpose_flatten_concat_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("transpose", 0) + .EQ("transpose2", 0) + .EQ("flatten", 0) + .EQ("concat", 0) + .EQ("fusion_transpose_flatten_concat", 0)); diff --git a/paddle/fluid/framework/op_info.h b/paddle/fluid/framework/op_info.h index 171f08390765a64997bff28d80f9360f5da2cd1a..89b499975790060a3a3e3f665c35f8545922e6a7 100644 --- a/paddle/fluid/framework/op_info.h +++ b/paddle/fluid/framework/op_info.h @@ -69,7 +69,8 @@ class OpInfo { const OpCreator& Creator() const { PADDLE_ENFORCE_NOT_NULL(creator_, - 
"Operator's Creator has not been registered"); + platform::errors::NotFound( + "Operator's Creator has not been registered.")); return creator_; } @@ -79,11 +80,12 @@ class OpInfo { std::string type = proto_ ? proto_->type() : "unknown"; PADDLE_ENFORCE_NOT_NULL( grad_op_maker_, - "Operator %s's GradOpMaker has not been " - "registered.\nPlease check whether %s_op has " - "grad_op.\nIf not, please set stop_gradient to True " - "for its input and output variables using var.stop_gradient=True.", - type.c_str(), type.c_str()); + platform::errors::NotFound( + "Operator %s's GradOpMaker has not been " + "registered.\nPlease check whether (%s) operator has " + "gradient operator.\nIf not, please set stop_gradient to be True " + "for its input and output variables using var.stop_gradient=True.", + type.c_str(), type.c_str())); return grad_op_maker_; } @@ -100,11 +102,12 @@ class OpInfo { std::string type = proto_ ? proto_->type() : "unknown"; PADDLE_ENFORCE_NOT_NULL( dygraph_grad_op_maker_, - "Operator %s's DygraphGradOpMaker has not been " - "registered.\nPlease check whether %s_op has " - "grad_op.\nIf not, please set stop_gradient to True " - "for its input and output variables using var.stop_gradient=True.", - type.c_str(), type.c_str()); + platform::errors::NotFound( + "Operator %s's DygraphGradOpMaker has not been " + "registered.\nPlease check whether (%s) operator has " + "gradient operator.\nIf not, please set stop_gradient to be True " + "for its input and output variables using var.stop_gradient=True.", + type.c_str(), type.c_str())); return dygraph_grad_op_maker_; } @@ -130,14 +133,17 @@ class OpInfoMap { } void Insert(const std::string& type, const OpInfo& info) { - PADDLE_ENFORCE(!Has(type), "Operator %s has been registered", type); + PADDLE_ENFORCE_NE(Has(type), true, + platform::errors::AlreadyExists( + "Operator (%s) has been registered.", type)); map_.insert({type, info}); } const OpInfo& Get(const std::string& type) const { auto op_info_ptr = 
GetNullable(type); - PADDLE_ENFORCE_NOT_NULL(op_info_ptr, "Operator %s has not been registered", - type); + PADDLE_ENFORCE_NOT_NULL( + op_info_ptr, + platform::errors::NotFound("Operator (%s) is not registered.", type)); return *op_info_ptr; } diff --git a/paddle/fluid/framework/op_kernel_type.cc b/paddle/fluid/framework/op_kernel_type.cc index 6d4801e4a0eed7083e671e1d49b8628dfb280cf9..e64c3674e7433bb1d9e54f89b89e5f1e2c521648 100644 --- a/paddle/fluid/framework/op_kernel_type.cc +++ b/paddle/fluid/framework/op_kernel_type.cc @@ -33,10 +33,18 @@ size_t OpKernelType::Hash::operator()(const OpKernelType& key) const { cur_loc += OpKernelType::kLibBits; int customized_value = key.customized_type_value_; - PADDLE_ENFORCE(customized_value < (1 << OpKernelType::kCustomizeBits)); + PADDLE_ENFORCE_LT(customized_value, (1 << OpKernelType::kCustomizeBits), + platform::errors::Unavailable( + "Too many custom OpKernel attribute values, expected " + "maximum value is %d, received value is %d.", + (1 << OpKernelType::kCustomizeBits), customized_value)); customized_value = customized_value << cur_loc; cur_loc += OpKernelType::kCustomizeBits; - PADDLE_ENFORCE(cur_loc < 64); + PADDLE_ENFORCE_LT(cur_loc, 64, + platform::errors::Unavailable( + "Too many OpKernel attribute values, expected maximum " + "value is 64, received value is %d.", + cur_loc)); std::hash hasher; return hasher(place + data_type + data_layout + library_type + diff --git a/paddle/fluid/framework/op_proto_maker.cc b/paddle/fluid/framework/op_proto_maker.cc index 3408ab262c16197b92e407d0af6043c8a062b5d4..357c4fb5e57fb5b9172631ca57fbdbfeb19b3143 100644 --- a/paddle/fluid/framework/op_proto_maker.cc +++ b/paddle/fluid/framework/op_proto_maker.cc @@ -43,7 +43,9 @@ OpProtoAndCheckerMaker::VariableBuilder OpProtoAndCheckerMaker::AddOutput( void OpProtoAndCheckerMaker::CheckNoDuplicatedInOutAttrs() { std::unordered_set names; auto checker = [&](const std::string& name) { - PADDLE_ENFORCE(!names.count(name), "[%s] is 
duplicated", name); + PADDLE_ENFORCE_EQ( + names.count(name), 0, + platform::errors::AlreadyExists("Attribute [%s] is duplicated.", name)); names.insert(name); }; for (auto& attr : proto_->attrs()) { diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index d8159d6a5c294b85d8d5ab9bbee3b95a5eba793f..6408fadf90ae32adf048156d1369cf22a76d20ea 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -54,9 +54,10 @@ class Registrar { template struct OperatorRegistrar : public Registrar { explicit OperatorRegistrar(const char* op_type) { - if (OpInfoMap::Instance().Has(op_type)) { - PADDLE_THROW("'%s' is registered more than once.", op_type); - } + PADDLE_ENFORCE_EQ( + OpInfoMap::Instance().Has(op_type), false, + platform::errors::AlreadyExists( + "Operator '%s' is registered more than once.", op_type)); static_assert(sizeof...(ARGS) != 0, "OperatorRegistrar should be invoked at least by OpClass"); OpInfo info; diff --git a/paddle/fluid/framework/op_registry_test.cc b/paddle/fluid/framework/op_registry_test.cc index 21d3454467603c58c9513351eba2c09ef6eeacba..45fe66d7db3b546604b640008e0ab61eaa84390e 100644 --- a/paddle/fluid/framework/op_registry_test.cc +++ b/paddle/fluid/framework/op_registry_test.cc @@ -58,7 +58,8 @@ class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { AddInput("input", "input of cosine op").AsDuplicable(); AddOutput("output", "output of cosine op").AsIntermediate(); auto my_checker = [](int i) { - PADDLE_ENFORCE(i % 2 == 0, "'test_attr' must be even!"); + PADDLE_ENFORCE_EQ(i % 2, 0, platform::errors::InvalidArgument( + "'test_attr' must be even!")); }; AddAttr("test_attr", "a simple test attribute") .AddCustomChecker(my_checker); diff --git a/paddle/fluid/framework/op_version_registry.h b/paddle/fluid/framework/op_version_registry.h index 79b15fc87d0b0a0ade8324710b80af634ff8878f..fea043a0ff311f7b940331b9d392296c331590e9 100644 --- 
a/paddle/fluid/framework/op_version_registry.h +++ b/paddle/fluid/framework/op_version_registry.h @@ -133,6 +133,9 @@ class OpVersion { checkpoints_.push_back(Checkpoint({note, op_version_desc})); return *this; } + uint32_t GetVersionID() const { + return static_cast(checkpoints_.size()); + } private: struct Checkpoint { @@ -149,13 +152,21 @@ class OpVersionRegistrar { return instance; } OpVersion& Register(const std::string& op_type) { - if (op_version_map_.find(op_type) != op_version_map_.end()) { - PADDLE_THROW("'%s' is registered in operator version more than once.", - op_type); - } + PADDLE_ENFORCE_EQ( + op_version_map_.find(op_type), op_version_map_.end(), + platform::errors::AlreadyExists( + "'%s' is registered in operator version more than once.", op_type)); op_version_map_.insert({op_type, OpVersion()}); return op_version_map_[op_type]; } + uint32_t GetVersionID(const std::string& op_type) const { + auto it = op_version_map_.find(op_type); + if (it == op_version_map_.end()) { + return 0; + } + + return it->second.GetVersionID(); + } private: std::unordered_map op_version_map_; @@ -164,6 +175,125 @@ class OpVersionRegistrar { OpVersionRegistrar& operator=(const OpVersionRegistrar&) = delete; }; +class OpVersionComparator { + public: + virtual bool operator()() = 0; + virtual ~OpVersionComparator() = default; +}; + +#define ADD_OP_VERSION_COMPARATOR(cmp_name, cmp_math) \ + class OpVersion##cmp_name##Comparator : public OpVersionComparator { \ + public: \ + explicit OpVersion##cmp_name##Comparator(const std::string op_name, \ + uint32_t target_version) \ + : op_name_(op_name), target_version_(target_version) {} \ + virtual bool operator()() { \ + return OpVersionRegistrar::GetInstance().GetVersionID(op_name_) \ + cmp_math target_version_; \ + } \ + virtual ~OpVersion##cmp_name##Comparator() {} \ + \ + private: \ + std::string op_name_; \ + uint32_t target_version_; \ + }; + +ADD_OP_VERSION_COMPARATOR(LE, <=); +ADD_OP_VERSION_COMPARATOR(EQ, ==); 
+ADD_OP_VERSION_COMPARATOR(GE, >=); +ADD_OP_VERSION_COMPARATOR(NE, !=); + +class OpVersionComparatorCombination { + public: + OpVersionComparatorCombination() {} + + OpVersionComparatorCombination& LE(const std::string& op_name, + int target_version) { + op_version_comparators_.push_back(std::shared_ptr( + new OpVersionLEComparator(op_name, target_version))); + return *this; + } + OpVersionComparatorCombination& EQ(const std::string& op_name, + int target_version) { + op_version_comparators_.push_back(std::shared_ptr( + new OpVersionEQComparator(op_name, target_version))); + return *this; + } + OpVersionComparatorCombination& GE(const std::string& op_name, + int target_version) { + op_version_comparators_.push_back(std::shared_ptr( + new OpVersionGEComparator(op_name, target_version))); + return *this; + } + OpVersionComparatorCombination& NE(const std::string& op_name, + int target_version) { + op_version_comparators_.push_back(std::shared_ptr( + new OpVersionNEComparator(op_name, target_version))); + return *this; + } + + bool IsMatched() const { + for (const auto& cmp : op_version_comparators_) { + if (!(*cmp)()) { + return false; + } + } + return true; + } + + private: + std::vector> op_version_comparators_; +}; + +class PassVersionCheckers { + public: + PassVersionCheckers& AddCombination( + const OpVersionComparatorCombination& combinations) { + pass_version_checkers_.push_back(combinations); + return *this; + } + bool IsPassCompatible() const { + if (pass_version_checkers_.empty()) { + return true; + } + for (const auto& checker : pass_version_checkers_) { + if (checker.IsMatched()) { + return true; + } + } + return false; + } + + private: + std::vector pass_version_checkers_; +}; + +class PassVersionCheckerRegistrar { + public: + static PassVersionCheckerRegistrar& GetInstance() { + static PassVersionCheckerRegistrar instance; + return instance; + } + PassVersionCheckers& Register(const std::string& pass_name) { + return 
pass_version_checkers_map_[pass_name]; + } + bool IsPassCompatible(const std::string& fuse_pass_name) const { + auto iter = pass_version_checkers_map_.find(fuse_pass_name); + if (iter == pass_version_checkers_map_.end()) { + return true; + } + return iter->second.IsPassCompatible(); + } + + private: + std::unordered_map + pass_version_checkers_map_; + + PassVersionCheckerRegistrar() = default; + PassVersionCheckerRegistrar& operator=(const PassVersionCheckerRegistrar&) = + delete; +}; + } // namespace compatible } // namespace framework } // namespace paddle @@ -173,3 +303,9 @@ class OpVersionRegistrar { RegisterOpVersion__##op_type = \ paddle::framework::compatible::OpVersionRegistrar::GetInstance() \ .Register(#op_type) + +#define REGISTER_PASS_CAPABILITY(pass_name) \ + static auto RegisterOpPassVersionChecker__##pass_name = \ + paddle::framework::compatible::PassVersionCheckerRegistrar:: \ + GetInstance() \ + .Register(#pass_name) diff --git a/paddle/fluid/framework/op_version_registry_test.cc b/paddle/fluid/framework/op_version_registry_test.cc index 80ad51ad07b5a84cfabb3ace9b478b1f6ea24f95..239dbc4357854a8962567129b259a64260308b49 100644 --- a/paddle/fluid/framework/op_version_registry_test.cc +++ b/paddle/fluid/framework/op_version_registry_test.cc @@ -55,6 +55,72 @@ TEST(test_operator_version, test_operator_version) { .NewInput("X2", "The second input.") .NewOutput("Y2", "The second output.")); } + +TEST(test_pass_op_version_checker, test_pass_op_version_checker) { + ASSERT_TRUE(PassVersionCheckerRegistrar::GetInstance().IsPassCompatible( + "no_bind_pass")); + + REGISTER_PASS_CAPABILITY(test_pass1) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .LE("mul", 1) + .EQ("fc", 0)); + ASSERT_TRUE(PassVersionCheckerRegistrar::GetInstance().IsPassCompatible( + "test_pass1")); + + REGISTER_PASS_CAPABILITY(test_pass2) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .GE("mul", 0) + .NE("fc", 
0)); + ASSERT_FALSE(PassVersionCheckerRegistrar::GetInstance().IsPassCompatible( + "test_pass2")); + + REGISTER_PASS_CAPABILITY(test_pass3) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .GE("mul", 0) + .NE("fc", 0)) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .LE("mul", 1) + .EQ("fc", 0)); + ASSERT_TRUE(PassVersionCheckerRegistrar::GetInstance().IsPassCompatible( + "test_pass3")); + + REGISTER_PASS_CAPABILITY(test_pass4) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .GE("test__", 5) + .EQ("fc", 0)); + ASSERT_FALSE(PassVersionCheckerRegistrar::GetInstance().IsPassCompatible( + "test_pass4")); + + REGISTER_PASS_CAPABILITY(test_pass5) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .GE("test__", 4) + .EQ("fc", 0)); + ASSERT_TRUE(PassVersionCheckerRegistrar::GetInstance().IsPassCompatible( + "test_pass5")); + + REGISTER_PASS_CAPABILITY(test_pass6) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("test__", 4) + .EQ("fc", 0)); + ASSERT_TRUE(PassVersionCheckerRegistrar::GetInstance().IsPassCompatible( + "test_pass6")); + + REGISTER_PASS_CAPABILITY(test_pass7) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .NE("test__", 4) + .EQ("fc", 0)); + ASSERT_FALSE(PassVersionCheckerRegistrar::GetInstance().IsPassCompatible( + "test_pass7")); +} + } // namespace compatible } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index ca2705f154c4f45dfccd954b23209c71701adce5..21fc293e84179da72be8cc5ee50de46a00fe9a0d 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -164,15 +164,20 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { VLOG(4) << place << " " << DebugStringEx(&scope); if 
(platform::is_gpu_place(place)) { #ifndef PADDLE_WITH_CUDA - PADDLE_THROW("Cannot run operator on place %s", place); + PADDLE_THROW(platform::errors::Unavailable( + "Cannot run operator on place %s, please recompile paddle or " + "reinstall Paddle with CUDA support.", + place)); #else auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; platform::SetDeviceId(dev_id); #endif } else if (platform::is_xpu_place(place)) { #ifndef PADDLE_WITH_XPU - PADDLE_THROW(platform::errors::Unimplemented( - "Cannot run operator on place %s", place)); + PADDLE_THROW(platform::errors::Unavailable( + "Cannot run operator on place %s, please recompile paddle or " + "reinstall Paddle with XPU support.", + place)); #else auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device; platform::SetXPUDeviceId(dev_id); @@ -214,7 +219,7 @@ std::string OperatorBase::Input(const std::string& name) const { auto& ins = Inputs(name); PADDLE_ENFORCE_LE( ins.size(), 1UL, - platform::errors::AlreadyExists( + platform::errors::InvalidArgument( "Operator %s's input %s should contain only one variable.", type_, name)); return ins.empty() ? 
kEmptyVarName : ins[0]; @@ -223,8 +228,10 @@ std::string OperatorBase::Input(const std::string& name) const { const std::vector& OperatorBase::Inputs( const std::string& name) const { auto it = inputs_.find(name); - PADDLE_ENFORCE(it != inputs_.end(), "Operator %s does not have the input %s.", - type_, name); + PADDLE_ENFORCE_NE( + it, inputs_.end(), + platform::errors::NotFound("Operator %s does not have the input %s.", + type_, name)); return it->second; } @@ -238,17 +245,21 @@ bool OperatorBase::HasOutputs(const std::string& name) const { std::string OperatorBase::Output(const std::string& name) const { auto& outs = Outputs(name); - PADDLE_ENFORCE_LE(outs.size(), 1UL, - "Operator %s's output %s should contain only one variable.", - type_, name); + PADDLE_ENFORCE_LE( + outs.size(), 1UL, + platform::errors::InvalidArgument( + "Operator %s's output %s should contain only one variable.", type_, + name)); return outs.empty() ? kEmptyVarName : outs[0]; } const std::vector& OperatorBase::Outputs( const std::string& name) const { auto it = outputs_.find(name); - PADDLE_ENFORCE(it != outputs_.end(), - "Operator %s does not have an output called %s.", type_, name); + PADDLE_ENFORCE_NE( + it, outputs_.end(), + platform::errors::NotFound( + "Operator %s does not have an output called %s.", type_, name)); return it->second; } @@ -391,16 +402,19 @@ void OperatorBase::CheckAllInputOutputSet() const { for (auto& in : info_->Proto().inputs()) { if (!in.dispensable()) { - PADDLE_ENFORCE(inputs_.find(in.name()) != inputs_.end(), - "Operator %s's input, %s, is not set", Type(), in.name()); + PADDLE_ENFORCE_NE( + inputs_.find(in.name()), inputs_.end(), + platform::errors::NotFound("Operator %s's input (%s) is not set.", + Type(), in.name())); } } for (auto& out : info_->Proto().outputs()) { if (!out.dispensable()) { - PADDLE_ENFORCE(outputs_.find(out.name()) != outputs_.end(), - "Operator %s's output, %s, is not set", Type(), - out.name()); + PADDLE_ENFORCE_NE( + 
outputs_.find(out.name()), outputs_.end(), + platform::errors::NotFound("Operator %s's output (%s) is not set.", + Type(), out.name())); } } } @@ -428,8 +442,9 @@ const Tensor* GetLoDTensorOrSelectedRowsValueFromVar(const Variable& var) { } else if (var.IsType()) { return &(var.Get().value()); } else { - PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.", - ToTypeName(var.Type())); + PADDLE_THROW(platform::errors::InvalidArgument( + "Variable type is %s, expect LoDTensor or SelectedRows.", + ToTypeName(var.Type()))); } } @@ -439,8 +454,9 @@ Tensor* GetMutableLoDTensorOrSelectedRowsValueFromVar(Variable* var) { } else if (var->IsType()) { return var->GetMutable()->mutable_value(); } else { - PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.", - ToTypeName(var->Type())); + PADDLE_THROW(platform::errors::InvalidArgument( + "Variable type is %s, expect LoDTensor or SelectedRows.", + ToTypeName(var->Type()))); } } @@ -462,7 +478,7 @@ const Variable* ExecutionContext::InputVar(const std::string& name) const { PADDLE_ENFORCE_LE( it->second.size(), 1UL, - platform::errors::AlreadyExists( + platform::errors::InvalidArgument( "Operator %s's input %s should contain only one variable.", op_.Type(), name)); return it->second.empty() ? nullptr : it->second[0]; @@ -472,9 +488,11 @@ Variable* ExecutionContext::OutputVar(const std::string& name) const { auto it = ctx_.outputs.find(name); if (it == ctx_.outputs.end()) return nullptr; - PADDLE_ENFORCE_LE(it->second.size(), 1UL, - "Operator %s's output %s should contain only one variable.", - op_.Type(), name); + PADDLE_ENFORCE_LE( + it->second.size(), 1UL, + platform::errors::InvalidArgument( + "Operator %s's output %s should contain only one variable.", + op_.Type(), name)); return it->second.empty() ? 
nullptr : it->second[0]; } @@ -497,10 +515,11 @@ const std::vector ExecutionContext::MultiInput( std::transform(vars.begin(), vars.end(), std::back_inserter(res), [&](const Variable* var) -> const Tensor* { if (var == nullptr) return nullptr; - PADDLE_ENFORCE( - var->IsType(), - "should be LoDTensor, but the received type is %s", - ToTypeName(var->Type())); + PADDLE_ENFORCE_EQ(var->IsType(), true, + platform::errors::InvalidArgument( + "Input variable should be LoDTensor, " + "but the received type is %s.", + ToTypeName(var->Type()))); return &(var->Get()); }); return res; @@ -558,8 +577,10 @@ class RuntimeInferShapeContext : public InferShapeContext { } const auto& in = it->second; if (in.size() == 0) return false; - PADDLE_ENFORCE_EQ(in.size(), 1UL, - "Input %s should not have more than one inputs", name); + PADDLE_ENFORCE_EQ( + in.size(), 1UL, + platform::errors::InvalidArgument( + "Input %s should not contain more than one inputs.", name)); return in[0] != nullptr; } @@ -574,8 +595,10 @@ class RuntimeInferShapeContext : public InferShapeContext { if (out.size() == 0) { return false; } - PADDLE_ENFORCE_EQ(out.size(), 1UL, - "Output %s should not have more than one outputs", name); + PADDLE_ENFORCE_EQ( + out.size(), 1UL, + platform::errors::InvalidArgument( + "Output %s should not contain more than one outputs.", name)); return out[0] != nullptr; } @@ -644,16 +667,31 @@ class RuntimeInferShapeContext : public InferShapeContext { size_t j = 0) override { auto in_it = ctx_.inputs.find(in); auto out_it = ctx_.outputs.find(out); - PADDLE_ENFORCE(in_it != ctx_.inputs.end() && in_it->second.size() > i, - "Inputs %s should have %llu argument", in, i); - PADDLE_ENFORCE(out_it != ctx_.outputs.end() && out_it->second.size() > j, - "Outputs %s should have %llu argument", out, j); + PADDLE_ENFORCE_NE( + in_it, ctx_.inputs.end(), + platform::errors::NotFound("Input %s does not exist.", in)); + PADDLE_ENFORCE_NE( + out_it, ctx_.outputs.end(), + 
platform::errors::NotFound("Output %s does not exist.", out)); + PADDLE_ENFORCE_LT(i, in_it->second.size(), + platform::errors::InvalidArgument( + "The index of input dimension is out of range, " + "excepted index less than %zu, but received %zu.", + in_it->second.size(), i)); + PADDLE_ENFORCE_LT(j, out_it->second.size(), + platform::errors::InvalidArgument( + "The index of output dimension is out of range, " + "excepted index less than %zu, but received %zu.", + out_it->second.size(), j)); Variable* in_var = in_it->second[i]; Variable* out_var = out_it->second[j]; - PADDLE_ENFORCE(in_var->Type() == out_var->Type(), - "The type of %s and %s is not the same.", in, out); + PADDLE_ENFORCE_EQ( + in_var->Type(), out_var->Type(), + platform::errors::InvalidArgument( + "The type of input (%s) and output (%s) are inconsistent.", in, + out)); if (in_var->IsType()) { auto& in_sele_rows = in_var->Get(); @@ -666,9 +704,9 @@ class RuntimeInferShapeContext : public InferShapeContext { auto* out_lod_tensor = out_var->GetMutable(); out_lod_tensor->Resize(in_lod_tensor.dims()); } else { - PADDLE_THROW( + PADDLE_THROW(platform::errors::Unimplemented( "Currently, the input type of ShareDim only can be LoDTensor " - "or SelectedRows."); + "or SelectedRows.")); } } @@ -721,16 +759,30 @@ class RuntimeInferShapeContext : public InferShapeContext { size_t j = 0) const override { auto in_it = ctx_.inputs.find(in); auto out_it = ctx_.outputs.find(out); - PADDLE_ENFORCE(in_it != ctx_.inputs.end() && in_it->second.size() > i, - "Inputs %s should have %llu argument", in, i); - PADDLE_ENFORCE(out_it != ctx_.outputs.end() && out_it->second.size() > j, - "Outputs %s should have %llu argument", out, j); + PADDLE_ENFORCE_NE( + in_it, ctx_.inputs.end(), + platform::errors::NotFound("Input %s does not exist.", in)); + PADDLE_ENFORCE_NE( + out_it, ctx_.outputs.end(), + platform::errors::NotFound("Output %s does not exist.", out)); + PADDLE_ENFORCE_LT(i, in_it->second.size(), + 
platform::errors::InvalidArgument( + "The index of input dimension is out of range, " + "excepted index less than %zu, but received %zu.", + in_it->second.size(), i)); + PADDLE_ENFORCE_LT(j, out_it->second.size(), + platform::errors::InvalidArgument( + "The index of output dimension is out of range, " + "excepted index less than %zu, but received %zu.", + out_it->second.size(), j)); Variable* in_var = in_it->second.at(i); if (!in_var->IsType()) return; Variable* out_var = out_it->second.at(j); - PADDLE_ENFORCE(out_var->IsType(), - "The %d-th output of Output(%s) must be LoDTensor.", j, out); + PADDLE_ENFORCE_EQ( + out_var->IsType(), true, + platform::errors::InvalidArgument( + "The %zu-th output of Output(%s) must be LoDTensor.", j, out)); auto& in_tensor = in_var->Get(); auto* out_tensor = out_var->GetMutable(); out_tensor->set_lod(in_tensor.lod()); @@ -757,18 +809,18 @@ class RuntimeInferShapeContext : public InferShapeContext { } int32_t GetLoDLevel(const std::string& in, size_t i = 0) const override { - PADDLE_THROW( + PADDLE_THROW(platform::errors::PreconditionNotMet( "GetLoDLevel is only used in compile time. The calculation of " "output's actual lod is different among operators so that should be " - "set in the runtime kernel."); + "set in the runtime kernel.")); } void SetLoDLevel(const std::string& out, int32_t lod_level, size_t j = 0) const override { - PADDLE_THROW( + PADDLE_THROW(platform::errors::PreconditionNotMet( "SetLoDLevel is only used in compile time. 
The calculation of " "output's actual lod is different among operators so that should be " - "set in the runtime kernel."); + "set in the runtime kernel.")); } bool IsRuntime() const override { return true; } @@ -794,9 +846,11 @@ class RuntimeInferShapeContext : public InferShapeContext { DDim GetInputDim(const std::string& name) const override { const std::vector& vars = InputVars(name); - PADDLE_ENFORCE_EQ(vars.size(), 1UL, - "Input(%s) should hold one element, but now it holds %d", - name, vars.size()); + PADDLE_ENFORCE_EQ( + vars.size(), 1UL, + platform::errors::InvalidArgument( + "Input(%s) should hold one element, but now it holds %zu elements.", + name, vars.size())); return this->GetDim(vars[0]); } @@ -817,9 +871,11 @@ class RuntimeInferShapeContext : public InferShapeContext { void SetOutputDim(const std::string& name, const DDim& dim) override { auto& vars = OutputVars(name); - PADDLE_ENFORCE_EQ(vars.size(), 1UL, - "Output(%s) should hold one element, but now it holds %d", - name, vars.size()); + PADDLE_ENFORCE_EQ( + vars.size(), 1UL, + platform::errors::InvalidArgument("Output(%s) should hold one element, " + "but now it holds %zu elements.", + name, vars.size())); SetDim(vars[0], dim); } @@ -831,16 +887,17 @@ class RuntimeInferShapeContext : public InferShapeContext { protected: DDim GetDim(Variable* var) const { - PADDLE_ENFORCE_NOT_NULL(var); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::InvalidArgument("Input variable is nullptr.")); if (var->IsType()) { return var->Get().dims(); } else if (var->IsType()) { return var->Get().GetCompleteDims(); } else { - PADDLE_THROW( - "Only LoDTensor/SelectedRows support 'GetDim', but Variables " - "type_id is %s.", - ToTypeName(var->Type())); + PADDLE_THROW(platform::errors::InvalidArgument( + "Only LoDTensor or SelectedRows support 'GetDim', but input " + "Variable's type is %s.", + ToTypeName(var->Type()))); } } @@ -853,7 +910,8 @@ class RuntimeInferShapeContext : public InferShapeContext { } std::vector 
GetRepeatedDims(const std::string& name) const override { - PADDLE_THROW("Only compile time support this method"); + PADDLE_THROW(platform::errors::PreconditionNotMet( + "GetRepeatedDims method only ban be used in compile time.")); } void SetDim(Variable* var, const DDim& dim) { @@ -862,15 +920,22 @@ class RuntimeInferShapeContext : public InferShapeContext { } else if (var->IsType()) { var->GetMutable()->set_height(dim[0]); } else { - PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.", - ToTypeName(var->Type())); + PADDLE_THROW(platform::errors::Unimplemented( + "Variable type error, expect LoDTensor or SelectedRows, but received " + "(%s).", + ToTypeName(var->Type()))); } } void SetDims(const std::vector& vars, const std::vector& dims) { size_t length = vars.size(); - PADDLE_ENFORCE_EQ(length, dims.size()); + PADDLE_ENFORCE_EQ(length, dims.size(), + platform::errors::InvalidArgument( + "The number of input variables do not match the " + "number of input dimensions, the number of variables " + "is %zu, the number of dimensions is %zu.", + length, dims.size())); for (size_t i = 0; i < length; ++i) { if (vars[i] == nullptr) { continue; @@ -881,7 +946,8 @@ class RuntimeInferShapeContext : public InferShapeContext { void SetRepeatedDims(const std::string& name, const std::vector& dims) override { - PADDLE_THROW("Only compile time support this method"); + PADDLE_THROW(platform::errors::PreconditionNotMet( + "SetRepeatedDims method only can be used in compile time.")); } std::vector GetVarTypes( @@ -901,16 +967,19 @@ class RuntimeInferShapeContext : public InferShapeContext { private: const std::vector& InputVars(const std::string& name) const { auto it = ctx_.inputs.find(name); - PADDLE_ENFORCE(it != ctx_.inputs.end(), - "Operator %s does not have the input %s.", op_.Type(), name); + PADDLE_ENFORCE_NE( + it, ctx_.inputs.end(), + platform::errors::NotFound( + "Operator (%s) does not have the input (%s).", op_.Type(), name)); return it->second; } const 
std::vector& OutputVars(const std::string& name) const { auto it = ctx_.outputs.find(name); - PADDLE_ENFORCE(it != ctx_.outputs.end(), - "Operator %s does not have the outputs %s.", op_.Type(), - name); + PADDLE_ENFORCE_NE( + it, ctx_.outputs.end(), + platform::errors::NotFound( + "Operator (%s) does not have the outputs (%s).", op_.Type(), name)); return it->second; } @@ -928,10 +997,14 @@ static void CheckTensorNANOrInf(const std::string& op_type, tensor.type() != proto::VarType::FP64) { return; } - PADDLE_ENFORCE(!framework::TensorContainsInf(tensor), - "Operator %s output Tensor %s contains Inf", op_type, name); - PADDLE_ENFORCE(!framework::TensorContainsNAN(tensor), - "Operator %s output Tensor %s contains NAN", op_type, name); + PADDLE_ENFORCE_NE( + framework::TensorContainsInf(tensor), true, + platform::errors::Fatal("Operator %s output Tensor %s contains Inf.", + op_type, name)); + PADDLE_ENFORCE_NE( + framework::TensorContainsNAN(tensor), true, + platform::errors::Fatal("Operator %s output Tensor %s contains NAN.", + op_type, name)); } void OperatorWithKernel::RuntimeInferShape(const Scope& scope, @@ -1074,10 +1147,11 @@ void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx, // check if op[type] has kernel registered. 
auto& all_op_kernels = AllOpKernels(); auto kernels_iter = all_op_kernels.find(type_); - if (kernels_iter == all_op_kernels.end()) { - PADDLE_THROW( - "There are no kernels which are registered in the %s operator.", type_); - } + PADDLE_ENFORCE_NE( + kernels_iter, all_op_kernels.end(), + platform::errors::Unavailable( + "There are no kernels which are registered in the %s operator.", + type_)); OpKernelMap& kernels = kernels_iter->second; @@ -1131,10 +1205,10 @@ void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx, kernel_iter = kernels.find(expected_kernel_key); } #endif - if (kernel_iter == kernels.end()) { - PADDLE_THROW("op %s does not have kernel for %s", type_, - KernelTypeToString(expected_kernel_key)); - } + PADDLE_ENFORCE_NE(kernel_iter, kernels.end(), + platform::errors::NotFound( + "Operator (%s) does not have kernel for %s.", type_, + KernelTypeToString(expected_kernel_key))); std::lock_guard lock(cache_update_mutex_); if (kernel_type_.get() == nullptr || kernel_func_.get() == nullptr) { @@ -1149,13 +1223,14 @@ void OperatorWithKernel::TransferInplaceVarsBack( for (auto& var_name : inplace_vars) { VLOG(3) << "share inplace var " + var_name + " back to it's original scope"; auto* origin_var = scope.FindVar(var_name); - PADDLE_ENFORCE_NOT_NULL(origin_var, "The var[%s] should not be nullptr.", - var_name); + PADDLE_ENFORCE_NOT_NULL(origin_var, + platform::errors::InvalidArgument( + "The variable[%s] is nullptr.", var_name)); auto* original_tensor = GetMutableLoDTensorOrSelectedRowsValueFromVar(origin_var); auto* var = transfer_scope.FindVar(var_name); - PADDLE_ENFORCE_NOT_NULL(var, "The var[%s] should not be nullptr.", - var_name); + PADDLE_ENFORCE_NOT_NULL(var, platform::errors::InvalidArgument( + "The variable[%s] is nullptr.", var_name)); auto* transformed_tensor = GetLoDTensorOrSelectedRowsValueFromVar(*var); auto original_dims = original_tensor->dims(); original_tensor->ShareDataWith(*transformed_tensor); @@ -1380,9 +1455,11 @@ 
proto::VarType::Type OperatorWithKernel::IndicateVarDataType( ParseInputDataType(ctx, name, &data_type); PADDLE_ENFORCE_NE( data_type, dafault_data_type, - "The Input Variable(%s) of %s Op used to determine kernel data type " - "is empty or not LoDTensor or SelectedRows or LoDTensorArray.", - name, Type()); + platform::errors::InvalidArgument( + "The Input Variable(%s) of (%s) Operator used to determine kernel " + "data type is empty or not LoDTensor or SelectedRows or " + "LoDTensorArray.", + name, Type())); return data_type; } diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index ebecbf0498c384a55627e2b5cb31304d098a444c..bd52d7ffef5040f596bfb5ca9521a6e1062bb5aa 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -157,6 +157,14 @@ class OperatorBase { platform::errors::NotFound("(%s) is not found in AttributeMap.", name)); return BOOST_GET_CONST(T, attrs_.at(name)); } + void SetAttr(const std::string& name, const Attribute& v) { + PADDLE_ENFORCE_EQ( + HasAttr(name), true, + platform::errors::NotFound( + "The attribute %s is not found in operator %s", name, Type())); + + attrs_[name] = v; + } const AttributeMap& Attrs() const { return attrs_; } const VariableNameMap& Inputs() const { return inputs_; } diff --git a/paddle/fluid/framework/operator_test.cc b/paddle/fluid/framework/operator_test.cc index c4ce627ff1f940f1625b8650b243d64af2641612..218fc8880bb276a75ed1dd71b04fcd9f387c9a54 100644 --- a/paddle/fluid/framework/operator_test.cc +++ b/paddle/fluid/framework/operator_test.cc @@ -495,9 +495,9 @@ TEST(IndicateVarDataTypeTest, other) { EXPECT_TRUE( ex_msg.find( "The Input Variable(Other) of " - "indicate_other_data_type_test Op used to " + "(indicate_other_data_type_test) Operator used to " "determine kernel data type " - "is empty or not LoDTensor or SelectedRows or LoDTensorArray") != + "is empty or not LoDTensor or SelectedRows or LoDTensorArray.") != std::string::npos); } 
ASSERT_TRUE(caught); diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 12e0f97f1262ca0f6bf8fc70ab5b482fb0bdd305..535ec9cd7d950588fd7877d0913e3e851f8fe8dc 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -13,12 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/parallel_executor.h" + #include #include #include #include #include #include + #include "paddle/fluid/framework/details/async_ssa_graph_executor.h" #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" @@ -108,6 +110,11 @@ class ParallelExecutorPrivate { * them. */ inline void SetSkipMemoryReuse(size_t scope_idx, const std::string &name) { + if (mem_opt_var_infos_.size() == 0) { + VLOG(4) << "The mem_opt_var_infos_ is empty, maybe no memory " + "optimization strategy is enabled"; + return; + } auto iter = mem_opt_var_infos_[scope_idx].find(name); if (iter != mem_opt_var_infos_[scope_idx].end()) { iter->second->SetSkipMemoryReuse(true); @@ -308,6 +315,7 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) { } bool need_mem_opt = build_strategy_.enable_inplace_ || + build_strategy_.enable_addto_ || build_strategy_.memory_optimize_.get() || is_gc_enabled; if (!need_mem_opt) return graph; @@ -320,6 +328,16 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) { graph = ref_cnt_pass->Apply(graph); VLOG(10) << "ReferenceCountPass Applied"; + if (build_strategy_.enable_addto_) { + auto addto_pass = ir::PassRegistry::Instance().Get("inplace_addto_op_pass"); + addto_pass->SetNotOwned(ir::kMemOptVarInfoMapList, &mem_opt_var_infos_); + addto_pass->SetNotOwned(ir::kLastLiveOpsOfVars, &last_live_ops_of_vars); + addto_pass->SetNotOwned(ir::kUseCuda, &use_cuda_); + VLOG(10) 
<< "Start to apply inplace_addto_op_pass"; + graph = addto_pass->Apply(graph); + VLOG(10) << "inplace_addto_op_pass Applied"; + } + if (build_strategy_.enable_inplace_) { auto inplace_pass = ir::PassRegistry::Instance().Get("buffer_shared_inplace_pass"); @@ -1068,3 +1086,4 @@ USE_PASS(reference_count_pass); USE_PASS(eager_deletion_pass); USE_PASS(buffer_shared_inplace_pass); USE_PASS(buffer_shared_cross_op_memory_reuse_pass); +USE_PASS(inplace_addto_op_pass); diff --git a/paddle/fluid/framework/pipeline_trainer.cc b/paddle/fluid/framework/pipeline_trainer.cc index 758b728fd9cffff6867a46a6c22c86e496103b84..d7506edbf4ca74976f067d879dad7df4923f946f 100644 --- a/paddle/fluid/framework/pipeline_trainer.cc +++ b/paddle/fluid/framework/pipeline_trainer.cc @@ -251,6 +251,7 @@ void PipelineTrainer::Finalize() { } root_scope_->DropKids(); SectionWorker::ResetBatchId(); + SectionWorker::ResetThreadCompletedFlag(); } Scope* PipelineTrainer::GetWorkerScope(int thread_id) { diff --git a/paddle/fluid/framework/reader.cc b/paddle/fluid/framework/reader.cc index d3513fb7dbed0413e61796d8a843c38fbbcf93dc..b418339bf32965a454e5b240bb728c4cb41e03ba 100644 --- a/paddle/fluid/framework/reader.cc +++ b/paddle/fluid/framework/reader.cc @@ -20,7 +20,10 @@ namespace framework { void ReaderBase::ReadNext(std::vector *out) { std::lock_guard lock(mu_); - PADDLE_ENFORCE_EQ(status_, ReaderStatus::kRunning); + PADDLE_ENFORCE_EQ(status_, ReaderStatus::kRunning, + platform::errors::Unavailable( + "The current reader has stopped running and cannot " + "continue to read the next batch of data.")); ReadNextImpl(out); } diff --git a/paddle/fluid/framework/rw_lock.h b/paddle/fluid/framework/rw_lock.h index f8aa87519a2fc1a14765887e95c96883d7b4589f..9b74a55304077c6c13a55f36ea8cf3b6dfbe5b9c 100644 --- a/paddle/fluid/framework/rw_lock.h +++ b/paddle/fluid/framework/rw_lock.h @@ -32,17 +32,21 @@ struct RWLock { ~RWLock() { pthread_rwlock_destroy(&lock_); } inline void RDLock() { - 
PADDLE_ENFORCE_EQ(pthread_rwlock_rdlock(&lock_), 0, - "acquire read lock failed"); + PADDLE_ENFORCE_EQ( + pthread_rwlock_rdlock(&lock_), 0, + platform::errors::External("The pthread failed to acquire read lock.")); } inline void WRLock() { PADDLE_ENFORCE_EQ(pthread_rwlock_wrlock(&lock_), 0, - "acquire write lock failed"); + platform::errors::External( + "The pthread failed to acquire write lock.")); } inline void UNLock() { - PADDLE_ENFORCE_EQ(pthread_rwlock_unlock(&lock_), 0, "unlock failed"); + PADDLE_ENFORCE_EQ( + pthread_rwlock_unlock(&lock_), 0, + platform::errors::External("The pthread failed to unlock.")); } private: diff --git a/paddle/fluid/framework/save_load_util.cc b/paddle/fluid/framework/save_load_util.cc index fbbbfd66b3d8c39d0ccaa7d998bb5c5e9860df4e..602b431995cc59ab67e1a32ac09a3557432c3539 100644 --- a/paddle/fluid/framework/save_load_util.cc +++ b/paddle/fluid/framework/save_load_util.cc @@ -33,7 +33,8 @@ void CheckInStreamState(std::istream& istre, size_t length) { VLOG(5) << "Can't read [" << length << "] from file" << "file seems breakem"; - PADDLE_THROW("Model load error, file seems breaken"); + PADDLE_THROW(platform::errors::Unavailable( + "Model load failed, istream state error.")); } } @@ -58,10 +59,11 @@ size_t ReadTensorNumber(std::istream& istre) { sizeof(char) * tensor_number_mark.size()); std::string str_read_tensor_number_mark(tensor_number_mark_buffer, tensor_number_mark.size()); - PADDLE_ENFORCE_EQ( - tensor_number_mark, str_read_tensor_number_mark, - "Tensor number mark not match, expect [%s], but read from file is [%]", - tensor_number_mark, str_read_tensor_number_mark); + PADDLE_ENFORCE_EQ(tensor_number_mark, str_read_tensor_number_mark, + platform::errors::InvalidArgument( + "Tensor number mark does not match, expect mark is " + "[%s], but the mark read from file is [%s].", + tensor_number_mark, str_read_tensor_number_mark)); size_t tensor_number = 0; istre.read(reinterpret_cast(&tensor_number), sizeof(tensor_number)); @@ -79,10 
+81,11 @@ std::string ReadTensorName(std::istream& istre) { std::string str_read_tensor_name_mark(name_mark_buffer, tensor_name_mark.size()); - PADDLE_ENFORCE_EQ( - tensor_name_mark, str_read_tensor_name_mark, - "Tensor name mark not match, expect [%s], but read from file is [%]", - tensor_name_mark, str_read_tensor_name_mark); + PADDLE_ENFORCE_EQ(tensor_name_mark, str_read_tensor_name_mark, + platform::errors::InvalidArgument( + "Tensor name mark does not match, expect mark is [%s], " + "but the mark read from file is [%s].", + tensor_name_mark, str_read_tensor_name_mark)); size_t tensor_name_length = 0; istre.read(reinterpret_cast(&tensor_name_length), @@ -117,16 +120,18 @@ bool SaveStaticNameListToDisk( for (size_t i = 0; i < vec_tensor_name_list.size(); ++i) { auto var_ptr = scope.FindVar(vec_tensor_name_list[i]); - PADDLE_ENFORCE_NE( - var_ptr, nullptr, - "Variable find error, when save model, can't not find vairable [%s], " - "Please make sure you have run StartUpProgram", - vec_tensor_name_list[i]); + PADDLE_ENFORCE_NOT_NULL( + var_ptr, platform::errors::NotFound("Variable (%s) is not found when " + "saving model, please make sure " + "that exe.run(startup_program) has " + "been executed.", + vec_tensor_name_list[i])); Tensor* tensor = var_ptr->GetMutable(); PADDLE_ENFORCE_EQ(tensor->IsInitialized(), true, - "Paramter [%s] not initialzed," - "Please make sure you have run StartUpProgram", - vec_tensor_name_list[i]); + platform::errors::PreconditionNotMet( + "Paramter [%s] is not initialzed, please make sure " + "that exe.run(startup_program) has been executed.", + vec_tensor_name_list[i])); map_tensor[vec_tensor_name_list[i]] = tensor; } @@ -145,9 +150,10 @@ bool SaveDygraphVarBaseListToDisk( Tensor* tensor = var_ptr->GetMutable(); PADDLE_ENFORCE_EQ(tensor->IsInitialized(), true, - "Paramter [%s] not initialzed," - "Please make sure you have run StartUpProgram", - vec_var_base_list[i]->Name()); + platform::errors::PreconditionNotMet( + "Paramter [%s] is not 
initialzed, please make sure " + "that exe.run(startup_program) has been executed.", + vec_var_base_list[i]->Name())); map_tensor[vec_var_base_list[i]->Name()] = tensor; } @@ -185,34 +191,41 @@ bool LoadStaticNameListFromDisk( for (size_t i = 0; i < vec_tensor_name_list.size(); ++i) { auto it = map_load_tensor.find(vec_tensor_name_list[i]); - PADDLE_ENFORCE(it != map_load_tensor.end(), - "Paramete not found in Model file, " - "Can not find [%s] in model file [%s]", - vec_tensor_name_list[i], file_name); + PADDLE_ENFORCE_NE(it, map_load_tensor.end(), + platform::errors::NotFound( + "Parameter (%s) not found in model file (%s).", + vec_tensor_name_list[i], file_name)); auto var_ptr = scope.FindVar(vec_tensor_name_list[i]); - PADDLE_ENFORCE_NE( - var_ptr, nullptr, - "Parameter not created, when load model, can't not find parameter [%s] " - "please make sure you have run StartUpProgram", - vec_tensor_name_list[i]); + PADDLE_ENFORCE_NOT_NULL( + var_ptr, + platform::errors::PreconditionNotMet( + "Parameter (%s) is not created when loading model, " + "please make sure that exe.run(startup_program) has been executed.", + vec_tensor_name_list[i])); Tensor* tensor = var_ptr->GetMutable(); - PADDLE_ENFORCE_NE(tensor, nullptr, - "Paramter [%s] not initialzed " - "please make sure you have run startUpProgram", - vec_tensor_name_list[i]); + PADDLE_ENFORCE_NOT_NULL( + tensor, + platform::errors::PreconditionNotMet( + "Paramter [%s] is not initialzed, " + "please make sure that exe.run(startup_program) has been executed.", + vec_tensor_name_list[i])); PADDLE_ENFORCE_EQ(tensor->IsInitialized(), true, - "Paramter [%s] not initialzed " - "please make sure you have run StartUpProgram", - vec_tensor_name_list[i]); + platform::errors::PreconditionNotMet( + "Paramter [%s] is not initialzed, " + "please make sure that exe.run(startup_program) has " + "been executed.v", + vec_tensor_name_list[i])); PADDLE_ENFORCE_EQ( tensor->dims(), it->second->dims(), - "Shape not matching: the Program 
requires a parameter with a shape of " - "(%s), " - "while the loaded parameter (namely [ %s ]) has a shape of (%s).", - tensor->dims(), vec_tensor_name_list[i], it->second->dims()); + platform::errors::InvalidArgument( + "Shape does not match, the program requires a parameter with a " + "shape of " + "(%s), while the loaded parameter (namely [ %s ]) has a shape of " + "(%s).", + tensor->dims(), vec_tensor_name_list[i], it->second->dims())); TensorCopySync(*(it->second.get()), tensor->place(), tensor); @@ -239,9 +252,9 @@ bool SaveTensorToDisk(const std::string& file_name, MkDirRecursively(DirName(file_name).c_str()); std::ofstream fout(file_name, std::ios::binary); - if (!fout) { - PADDLE_THROW("File open error. Can not open file [%s]", file_name); - } + PADDLE_ENFORCE_EQ( + fout.is_open(), true, + platform::errors::Unavailable("File (%s) open failed.", file_name)); // first 256 byte for reserve for fulture upgrade char* kReserveBuffer = new char[model_file_reserve_size]; @@ -292,9 +305,8 @@ bool SaveTensorToDisk(const std::string& file_name, TensorCopySync(*tensor, platform::CPUPlace(), &temp); data_ptr = temp.data(); #else - PADDLE_THROW( - "Tensor is in CUDA device, but paddle not compile with CUDA, this " - "should not happen"); + PADDLE_THROW(platform::errors::Unavailable( + "Tensor is in CUDA device, but paddle not compiled with CUDA.")); #endif } fout.write(static_cast(data_ptr), @@ -302,8 +314,9 @@ bool SaveTensorToDisk(const std::string& file_name, } if (!fout) { - PADDLE_THROW("Model save failed, data write to model file [%s] error", - file_name); + PADDLE_THROW(platform::errors::Unavailable( + "Model save failed, error when writing data into model file [%s].", + file_name)); } fout.close(); @@ -316,9 +329,9 @@ bool LoadTensorFromDisk( std::map>* map_tensor) { std::ifstream fin(file_name, std::ios::binary); - if (!fin) { - PADDLE_THROW("File open error. 
Can not open model file [%s]", file_name); - } + PADDLE_ENFORCE_EQ( + fin.is_open(), true, + platform::errors::Unavailable("File (%s) open failed.", file_name)); ReadReserveBuffer(fin); @@ -331,7 +344,8 @@ bool LoadTensorFromDisk( uint32_t version; fin.read(reinterpret_cast(&version), sizeof(version)); CheckInStreamState(fin, sizeof(version)); - PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported"); + PADDLE_ENFORCE_EQ(version, 0U, platform::errors::InvalidArgument( + "Only version 0 tensor is supported.")); proto::VarType::TensorDesc desc; { // int32_t size @@ -344,7 +358,7 @@ bool LoadTensorFromDisk( CheckInStreamState(fin, sizeof(size)); PADDLE_ENFORCE_EQ( desc.ParseFromArray(buf.get(), size), true, - platform::errors::InvalidArgument("Cannot parse tensor desc")); + platform::errors::InvalidArgument("Parse tensor desc failed.")); } { // read tensor diff --git a/paddle/fluid/framework/section_worker.cc b/paddle/fluid/framework/section_worker.cc index 03b7afbb8771fadbe07a352497fa69a299928cf7..b9a3cac0ec4c9863f4e0dea102965eff8c47fe8d 100644 --- a/paddle/fluid/framework/section_worker.cc +++ b/paddle/fluid/framework/section_worker.cc @@ -196,7 +196,6 @@ void SectionWorker::TrainFiles() { if (threads_completed) { VLOG(3) << "thread " << thread_id_ << " completed."; lk.unlock(); - threads_completed = false; return; } lk.unlock(); @@ -459,7 +458,6 @@ void SectionWorker::TrainFilesWithProfiler() { << ", mean_time: " << op_total_time[i] / op_count[i]; } VLOG(0) << "================================"; - threads_completed = false; return; } lk.unlock(); diff --git a/paddle/fluid/framework/selected_rows.cc b/paddle/fluid/framework/selected_rows.cc index 54a818250b45e593de4110f56e42a04a9ea65e00..1f402ea9dd33626a43a4d03b96256b2c2841c8b4 100644 --- a/paddle/fluid/framework/selected_rows.cc +++ b/paddle/fluid/framework/selected_rows.cc @@ -113,7 +113,9 @@ void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows, // the 1st field, unit32_t version for 
SelectedRows uint32_t version; is.read(reinterpret_cast(&version), sizeof(version)); - PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported"); + PADDLE_ENFORCE_EQ(version, 0U, + platform::errors::InvalidArgument( + "Only version 0 SelectedRows is supported.")); } { // the 2st field, rows information @@ -155,24 +157,27 @@ int64_t SelectedRows::AutoGrownIndex(int64_t key, bool auto_grown, auto iter = id_to_index_.find(key); if (iter == id_to_index_.end()) { rwlock_->UNLock(); - if (!auto_grown) { - PADDLE_THROW("key %d not found", key); - } + PADDLE_ENFORCE_EQ( + auto_grown, true, + platform::errors::NotFound("Input key(%lld) is not found.", key)); rwlock_->WRLock(); auto map_size = id_to_index_.size(); auto vector_size = rows_.size(); if (map_size != vector_size) { rwlock_->UNLock(); - PADDLE_THROW( - "id_to_index_ size %d should have the same size with rows_ %d", - map_size, vector_size); + PADDLE_THROW(platform::errors::InvalidArgument( + "Row map size(%zu) should be equal to rows size(%zu).", map_size, + vector_size)); } auto write_iter = id_to_index_.find(key); if (write_iter == id_to_index_.end()) { int row_num = rows_.size(); if (row_num == value_->dims()[0]) { rwlock_->UNLock(); - PADDLE_THROW("selected rows is full, then length exceed %d", row_num); + PADDLE_THROW(platform::errors::InvalidArgument( + "Selected rows is full, then length exceed the length of first " + "dimension (%d).", + row_num)); } // key logic to put a key into id_to_index_ rows_.push_back(key); @@ -203,15 +208,20 @@ void SelectedRows::SyncIndex() { void SelectedRows::Get(const framework::Tensor& ids, framework::Tensor* value, bool auto_grown, bool is_test) { - PADDLE_ENFORCE(value->IsInitialized(), - "The value tensor should be initialized."); + PADDLE_ENFORCE_EQ(value->IsInitialized(), true, + platform::errors::InvalidArgument( + "The value tensor is not initialized.")); if (ids.numel() == 0) { VLOG(3) << "keys is empty, please check data!"; } else { int64_t value_width = 
value_->numel() / value_->dims()[0]; - PADDLE_ENFORCE_EQ(value_width, value->numel() / value->dims()[0], - "output tensor should have the same shape with table " - "except the dims[0]."); + PADDLE_ENFORCE_EQ( + value_width, value->numel() / value->dims()[0], + platform::errors::InvalidArgument( + "Output tensor should have the same shape with table " + "except the first dimmension, excepted value width not counting " + "the first dimension is %d, actual value width is %d.", + value_width, value->numel() / value->dims()[0])); for (int i = 0; i < ids.numel(); ++i) { auto id = ids.data()[i]; int64_t index = AutoGrownIndex(id, auto_grown, is_test); diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h index 5f733139419dbc1769d9eb4efe7e793f8fb2752f..285af1d55302a49cae058fccdd5edd13aa28137e 100644 --- a/paddle/fluid/framework/selected_rows.h +++ b/paddle/fluid/framework/selected_rows.h @@ -82,7 +82,8 @@ class SelectedRows { int64_t Index(int64_t key) const { auto it = std::find(rows_.begin(), rows_.end(), key); if (it == rows_.end()) { - PADDLE_THROW("id %s not in table", key); + PADDLE_THROW(platform::errors::NotFound( + "Input id (%lld) is not in current rows table.", key)); } return static_cast(std::distance(rows_.begin(), it)); } diff --git a/paddle/fluid/framework/shape_inference.cc b/paddle/fluid/framework/shape_inference.cc index 4ac872ac3d3bf918678f5294a4c35097c3fb18ab..f5bb3f68007043ad37ea32e7047c5fc546b80931 100644 --- a/paddle/fluid/framework/shape_inference.cc +++ b/paddle/fluid/framework/shape_inference.cc @@ -25,20 +25,22 @@ namespace framework { std::vector InferShapeContext::GetReaderDims( const std::string &name) const { const std::vector &arg_names = Inputs(name); - PADDLE_ENFORCE_EQ( - arg_names.size(), 1UL, - "Reader input '%s' should hold one element, but now it holds %d", name, - arg_names.size()); + PADDLE_ENFORCE_EQ(arg_names.size(), 1UL, + platform::errors::InvalidArgument( + "Reader input '%s' should hold 
one element, but now it " + "holds %d elements.", + name, arg_names.size())); return this->GetRepeatedDims(arg_names[0]); } void InferShapeContext::SetReaderDims(const std::string &name, const std::vector &dims) { const std::vector &arg_names = Outputs(name); - PADDLE_ENFORCE_EQ( - arg_names.size(), 1UL, - "Reader output '%s' should hold one element, but now it holds %d", name, - arg_names.size()); + PADDLE_ENFORCE_EQ(arg_names.size(), 1UL, + platform::errors::InvalidArgument( + "Reader output '%s' should hold one element, but now " + "it holds %d elements.", + name, arg_names.size())); return this->SetRepeatedDims(arg_names[0], dims); } diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc index 544c014eaf98a99b1737809f2cbad39b46fdb276..0b22bab26789a3e2ebd20428adc236faa8b38dee 100644 --- a/paddle/fluid/framework/tensor.cc +++ b/paddle/fluid/framework/tensor.cc @@ -19,13 +19,17 @@ namespace paddle { namespace framework { extern size_t SizeOfType(proto::VarType::Type type); void Tensor::check_memory_size() const { - PADDLE_ENFORCE_NOT_NULL( - holder_, "Tensor holds no memory. Call Tensor::mutable_data first."); + PADDLE_ENFORCE_NOT_NULL(holder_, platform::errors::PreconditionNotMet( + "Tensor holds no memory. " + "Call Tensor::mutable_data firstly.")); PADDLE_ENFORCE_LE( numel() * SizeOfType(type()), memory_size(), - "Tensor's dims_ is out of bound. Call Tensor::mutable_data " - "first to re-allocate memory.\n" - "or maybe the required data-type mismatches the data already stored."); + platform::errors::PreconditionNotMet( + "Tensor's dimension is out of bound." + "Tensor's dimension must be equal or less than the size of its " + "memory." 
+ "But received Tensor's dimension is d%, memory's size is %d.", + numel() * SizeOfType(type()), memory_size())); } Tensor::Tensor(const proto::VarType::Type& dtype) : type_(dtype), offset_(0) {} @@ -37,15 +41,21 @@ size_t Tensor::memory_size() const { void* Tensor::mutable_data(const platform::Place& place, proto::VarType::Type type, size_t requested_size) { type_ = type; - PADDLE_ENFORCE_GE(numel(), 0, - "When calling this method, the Tensor's numel must be " - "equal or larger than zero. " - "Please check Tensor::dims, or Tensor::Resize has been " - "called first. The Tensor's shape is [", - dims(), "] now"); + PADDLE_ENFORCE_GE( + numel(), 0, + platform::errors::PreconditionNotMet( + "The Tensor's element number must be equal or greater than zero. " + "The Tensor's shape is [", + dims(), "] now")); size_t size = numel() * SizeOfType(type); if (requested_size) { - PADDLE_ENFORCE_GE(requested_size, size); + PADDLE_ENFORCE_GE( + requested_size, size, + platform::errors::InvalidArgument( + "The requested memory size is less than the memory size of Tensor. 
" + "But received requested memory size is d%, " + "memory size of Tensor is %d.", + requested_size, size)); size = requested_size; } /* some versions of boost::variant don't have operator!= */ @@ -62,8 +72,8 @@ void* Tensor::mutable_data(const platform::Place& place, void* Tensor::mutable_data(const platform::Place& place, size_t requested_size) { - PADDLE_ENFORCE_NOT_NULL( - this->holder_, "Cannot invoke mutable data if current hold nothing."); + PADDLE_ENFORCE_NOT_NULL(this->holder_, platform::errors::PreconditionNotMet( + "The tensor is not initialized.")); return mutable_data(place, type_, requested_size); } @@ -75,12 +85,20 @@ Tensor& Tensor::ShareDataWith(const Tensor& src) { Tensor Tensor::Slice(int64_t begin_idx, int64_t end_idx) const { check_memory_size(); - PADDLE_ENFORCE_GE(begin_idx, 0, - "The start row index must be greater than 0."); - PADDLE_ENFORCE_LE(end_idx, dims_[0], "The end row index is out of bound."); + PADDLE_ENFORCE_GE( + begin_idx, 0, + platform::errors::OutOfRange("The start row index must be greater than 0." + "But received the start index is d%.", + begin_idx)); + PADDLE_ENFORCE_LE( + end_idx, dims_[0], + platform::errors::OutOfRange("The end row index is out of bound.")); PADDLE_ENFORCE_LT( begin_idx, end_idx, - "The start row index must be lesser than the end row index."); + platform::errors::InvalidArgument( + "The start row index must be less than the end row index." 
+ "But received the start index = %d, the end index = %d.", + begin_idx, end_idx)); if (dims_[0] == 1) { return *this; diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index d9fddc4c77d99efcb119ae37591b26bff0e51c0a..f2ccff2c133a238d02e25c65faf41dd519fdb506 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -131,13 +131,17 @@ class Tensor { const platform::Place& place() const { PADDLE_ENFORCE_NOT_NULL( - holder_, "Tensor not initialized yet when Tensor::place() is called."); + holder_, + platform::errors::PreconditionNotMet( + "Tensor not initialized yet when Tensor::place() is called.")); return holder_->place(); } proto::VarType::Type type() const { PADDLE_ENFORCE_NOT_NULL( - holder_, "Tensor not initialized yet when Tensor::type() is called."); + holder_, + platform::errors::PreconditionNotMet( + "Tensor not initialized yet when Tensor::type() is called.")); return type_; } diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h index f5171b0a8d1efc33521c2d731e88f0f96bdf41c7..986551b935e8811a6b257c2b4a613a493b3b644b 100644 --- a/paddle/fluid/framework/tensor_impl.h +++ b/paddle/fluid/framework/tensor_impl.h @@ -43,9 +43,13 @@ inline T* Tensor::data() { check_memory_size(); bool valid = std::is_same::value || type_ == DataTypeTrait::DataType(); - PADDLE_ENFORCE( - valid, "Tensor holds the wrong type, it holds %s, but desires to be %s", - DataTypeToString(type_), DataTypeToString(DataTypeTrait::DataType())); + PADDLE_ENFORCE_EQ( + valid, true, + platform::errors::InvalidArgument( + "Tensor holds the wrong type, it holds %s, but desires to be %s", + DataTypeToString(type_), + DataTypeToString(DataTypeTrait::DataType()))); + return reinterpret_cast(reinterpret_cast(holder_->ptr()) + offset_); } @@ -69,9 +73,12 @@ inline T* Tensor::mutable_data(const platform::Place& place, inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) { int rank = src.dims().size(); 
PADDLE_ENFORCE_GE( - rank, 2, - "'ReshapeToMatrix()' is only used for flatten high rank " - "tensors to matrixs. Can not be used in reshaping vectors."); + rank, 2, platform::errors::InvalidArgument( + "'ReshapeToMatrix()' is only used for flatten high rank " + "tensors to matrixs. The dimensions of Tensor must be " + "greater or equal than 2. " + "But received dimensions of Tensor is %d", + rank)); if (rank == 2) { return src; } diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc index 84f98d339a29b7d23de6e8be4b069b216be31ab2..cc972dd93d032c19015c86debebc27f7c8c0d155 100644 --- a/paddle/fluid/framework/tensor_test.cc +++ b/paddle/fluid/framework/tensor_test.cc @@ -41,7 +41,7 @@ TEST(Tensor, DataAssert) { std::string ex_msg = err.what(); EXPECT_TRUE(ex_msg.find("holder_ should not be null") != std::string::npos); EXPECT_TRUE(ex_msg.find("Tensor holds no memory. Call " - "Tensor::mutable_data first.") != + "Tensor::mutable_data firstly.") != std::string::npos); } ASSERT_TRUE(caught); @@ -157,7 +157,7 @@ TEST(Tensor, ShareDataWith) { EXPECT_TRUE(ex_msg.find("holder_ should not be null") != std::string::npos); EXPECT_TRUE(ex_msg.find("Tensor holds no memory. 
Call " - "Tensor::mutable_data first.") != + "Tensor::mutable_data firstly.") != std::string::npos); } ASSERT_TRUE(caught); diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index c3626c5c9e0506f12ca77aac5086cb18e272a771..0e3d11b9f0257905cbede334afd0ad84ff15cb5c 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -94,9 +94,17 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, auto src_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, src_place); auto dst_cpu_place = BOOST_GET_CONST(platform::CPUPlace, dst_place); auto ctx_place = ctx.GetPlace(); - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx_place), true); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx_place), true, + platform::errors::PreconditionNotMet( + "Context place error, excepted GPUPlace, but actually %s.", + ctx_place)); auto ctx_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, ctx_place); - PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place); + PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place, + platform::errors::Unavailable( + "Source place and context place do not match, source " + "place is %s, context place is %s.", + src_gpu_place, ctx_gpu_place)); auto stream = reinterpret_cast(ctx).stream(); memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); @@ -106,9 +114,17 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, auto src_cpu_place = BOOST_GET_CONST(platform::CPUPlace, src_place); auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place); auto ctx_place = ctx.GetPlace(); - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx_place), true); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx_place), true, + platform::errors::PreconditionNotMet( + "Context place error, excepted GPUPlace, but actually %s.", + ctx_place)); auto ctx_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, ctx_place); - PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place); + 
PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place, + platform::errors::Unavailable( + "Destination place and context place do not match, " + "destination place is %s, context place is %s.", + dst_gpu_place, ctx_gpu_place)); auto stream = reinterpret_cast(ctx).stream(); memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream); @@ -164,7 +180,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, auto src_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, src_place); auto dst_gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dst_place); auto ctx_place = ctx.GetPlace(); - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx_place), true); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx_place), true, + platform::errors::PreconditionNotMet( + "Context place error, excepted GPUPlace, but actually %s.", + ctx_place)); auto stream = reinterpret_cast(ctx).stream(); if (platform::is_same_place(src_place, dst_place)) { @@ -180,12 +200,14 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); } else { - PADDLE_THROW("ctx is not belong to dst_gpu_place or src_gpu_place."); + PADDLE_THROW(platform::errors::Unavailable( + "Context place dose not match the source and destination place.")); } } } else { // NOLINT - PADDLE_THROW("Copy from %s to %s is not supported.", src_place, dst_place); + PADDLE_THROW(platform::errors::Unimplemented( + "Copying from %s to %s is not supported.", src_place, dst_place)); } #endif } @@ -298,7 +320,8 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, nullptr); } else { // NOLINT - PADDLE_THROW("Copy from %s to %s is not supported.", src_place, dst_place); + PADDLE_THROW(platform::errors::Unimplemented( + "Copy from %s to %s is not supported.", src_place, dst_place)); } #endif } @@ -832,7 +855,9 @@ void TensorFromStream(std::istream& is, Tensor* tensor, void* GetDstPtrByDLDataType(DLDataType type, 
framework::Tensor* dst, const platform::Place& dst_place) { // vector types not currently supported - PADDLE_ENFORCE_LE(type.lanes, 1, "vector types not currently supported"); + PADDLE_ENFORCE_LE(type.lanes, 1, + platform::errors::Unimplemented( + "Vector type is not supported currently.")); switch (type.bits) { case 8: @@ -840,32 +865,37 @@ void* GetDstPtrByDLDataType(DLDataType type, framework::Tensor* dst, return static_cast(dst->mutable_data(dst_place)); if (type.code == kDLUInt) return static_cast(dst->mutable_data(dst_place)); - PADDLE_THROW("There is no this type.code <%d> when type.bits is <%d>.", - type.code, type.bits); + PADDLE_THROW(platform::errors::Unimplemented( + "DLDataType code <%d> is illegal when DLDataType.bits is <%d>.", + type.code, type.bits)); case 16: if (type.code == kDLInt) return static_cast(dst->mutable_data(dst_place)); if (type.code == kDLFloat) return static_cast( dst->mutable_data(dst_place)); - PADDLE_THROW("There is no this type.code <%d> when type.bits is <%d>.", - type.code, type.bits); + PADDLE_THROW(platform::errors::Unimplemented( + "DLDataType code <%d> is illegal when DLDataType.bits is <%d>.", + type.code, type.bits)); case 32: if (type.code == kDLInt) return static_cast(dst->mutable_data(dst_place)); if (type.code == kDLFloat) return static_cast(dst->mutable_data(dst_place)); - PADDLE_THROW("There is no this type.code <%d> when type.bits is <%d>.", - type.code, type.bits); + PADDLE_THROW(platform::errors::Unimplemented( + "DLDataType code <%d> is illegal when DLDataType.bits is <%d>.", + type.code, type.bits)); case 64: if (type.code == kDLInt) return static_cast(dst->mutable_data(dst_place)); if (type.code == kDLFloat) return static_cast(dst->mutable_data(dst_place)); - PADDLE_THROW("There is no this type.code <%d> when type.bits is <%d>.", - type.code, type.bits); + PADDLE_THROW(platform::errors::Unimplemented( + "DLDataType code <%d> is illegal when DLDataType.bits is <%d>.", + type.code, type.bits)); default: - 
PADDLE_THROW("Unsupport type.bits %d", type.bits); + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported DLDataType.bits %d.", type.bits)); } } diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index fce0142b41d3ae9b2a6fcd4f16d38b0492fbd806..a0408dbc3dbb4ffca70ef322d93b662f1b953f7b 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -183,7 +183,11 @@ void TensorToVector(const Tensor& src, std::vector* dst) { dst->resize(src.numel()); auto dst_ptr = static_cast(dst->data()); - PADDLE_ENFORCE_EQ(platform::is_cpu_place(src.place()), true); + PADDLE_ENFORCE_EQ( + platform::is_cpu_place(src.place()), true, + platform::errors::InvalidArgument( + "The input tensor should be CPU device, but actually it is in %s.", + src.place())); memory::Copy(dst_place, dst_ptr, BOOST_GET_CONST(platform::CPUPlace, src.place()), src_ptr, size); diff --git a/paddle/fluid/framework/threadpool.cc b/paddle/fluid/framework/threadpool.cc index 7f7f426d0e28224932fc96a3fefa0df1279e6475..4682bfc264b68997abd0a87233c5ed39e7e50a63 100644 --- a/paddle/fluid/framework/threadpool.cc +++ b/paddle/fluid/framework/threadpool.cc @@ -42,7 +42,8 @@ void ThreadPool::Init() { num_threads = FLAGS_dist_threadpool_size; VLOG(1) << "set dist_threadpool_size to " << num_threads; } - PADDLE_ENFORCE_GT(num_threads, 0); + PADDLE_ENFORCE_GT(num_threads, 0, platform::errors::InvalidArgument( + "The number of threads is 0.")); threadpool_.reset(new ThreadPool(num_threads)); } } @@ -83,7 +84,8 @@ void ThreadPool::TaskLoop() { } if (tasks_.empty()) { - PADDLE_THROW("This thread has no task to Run"); + PADDLE_THROW(platform::errors::Unavailable( + "Current thread has no task to Run.")); } // pop a task from the task queue diff --git a/paddle/fluid/framework/threadpool.h b/paddle/fluid/framework/threadpool.h index 654d81116b280bb6a52af3f83aeec284387f3b63..09528b6fc35bf49ac3110440a62aba3200341e15 100644 --- 
a/paddle/fluid/framework/threadpool.h +++ b/paddle/fluid/framework/threadpool.h @@ -91,7 +91,8 @@ class ThreadPool { { std::unique_lock lock(mutex_); if (!running_) { - PADDLE_THROW("enqueue on stopped ThreadPool"); + PADDLE_THROW(platform::errors::Unavailable( + "Task is enqueued into stopped ThreadPool.")); } tasks_.push(std::move(task)); } diff --git a/paddle/fluid/framework/var_desc.cc b/paddle/fluid/framework/var_desc.cc index f3ea1f624ee836a483c37c2addb4d9766e87c107..2ee0b17b64b6df7a2f66b208f5b5879683db6656 100644 --- a/paddle/fluid/framework/var_desc.cc +++ b/paddle/fluid/framework/var_desc.cc @@ -43,8 +43,9 @@ void VarDesc::SetTensorDescNum(size_t num) { } break; default: PADDLE_THROW( - "Setting 'sub_tensor_number' is not supported by the type of var %s.", - this->Name()); + platform::errors::Unavailable("Setting 'sub_tensor_number' is not " + "supported by the %s type variable.", + this->Name())); } } @@ -55,8 +56,9 @@ size_t VarDesc::GetTensorDescNum() const { break; default: PADDLE_THROW( - "Getting 'sub_tensor_number' is not supported by the type of var %s.", - this->Name()); + platform::errors::Unavailable("Getting 'sub_tensor_number' is not " + "supported by the %s type variable.", + this->Name())); } } @@ -133,9 +135,9 @@ void VarDesc::SetLoDLevel(int32_t lod_level) { desc_.mutable_type()->mutable_tensor_array()->set_lod_level(lod_level); break; default: - PADDLE_THROW( - "Setting 'lod_level' is not supported by the type of var %s.", - this->Name()); + PADDLE_THROW(platform::errors::Unavailable( + "Setting 'lod_level' is not supported by the %s type variable.", + this->Name())); } } @@ -157,9 +159,9 @@ void VarDesc::SetLoDLevels(const std::vector &multiple_lod_level) { } } break; default: - PADDLE_THROW( - "Setting 'lod_levels' is not supported by the type of var %s.", - this->Name()); + PADDLE_THROW(platform::errors::Unavailable( + "Setting 'lod_levels' is not supported by the %s type variable", + this->Name())); } } @@ -170,9 +172,9 @@ int32_t 
VarDesc::GetLoDLevel() const { case proto::VarType::LOD_TENSOR_ARRAY: return desc_.type().tensor_array().lod_level(); default: - PADDLE_THROW( - "Getting 'lod_level' is not supported by the type of var %s.", - this->Name()); + PADDLE_THROW(platform::errors::Unavailable( + "Getting 'lod_level' is not supported by the %s type variable.", + this->Name())); } } @@ -187,15 +189,19 @@ std::vector VarDesc::GetLoDLevels() const { return res; break; default: - PADDLE_THROW( - "Getting 'lod_levels' is not supported by the type of var %s.", - this->Name()); + PADDLE_THROW(platform::errors::Unavailable( + "Getting 'lod_levels' is not supported by the %s type variable.", + this->Name())); } } const proto::VarType::TensorDesc &VarDesc::tensor_desc() const { - PADDLE_ENFORCE(desc_.has_type(), "The var's type hasn't been set."); - PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set."); + PADDLE_ENFORCE_EQ( + desc_.has_type(), true, + platform::errors::NotFound("The variable's type was not be set.")); + PADDLE_ENFORCE_EQ( + desc_.type().has_type(), true, + platform::errors::NotFound("The variable's type was not be set.")); switch (desc_.type().type()) { case proto::VarType::SELECTED_ROWS: return desc_.type().selected_rows(); @@ -204,14 +210,16 @@ const proto::VarType::TensorDesc &VarDesc::tensor_desc() const { case proto::VarType::LOD_TENSOR_ARRAY: return desc_.type().tensor_array().tensor(); default: - PADDLE_THROW( - "Getting 'tensor_desc' is not supported by the type of var %s.", - this->Name()); + PADDLE_THROW(platform::errors::Unavailable( + "Getting 'tensor_desc' is not supported by the %s type variable.", + this->Name())); } } std::vector VarDesc::tensor_descs() const { - PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set."); + PADDLE_ENFORCE_EQ( + desc_.has_type(), true, + platform::errors::NotFound("The variable's type was not be set.")); std::vector res; res.reserve(GetTensorDescNum()); switch (desc_.type().type()) { @@ -221,16 +229,19 @@ 
std::vector VarDesc::tensor_descs() const { } return res; default: - PADDLE_THROW( - "Getting 'tensor_descs' is not supported by the type of var " - "%s.", - this->Name()); + PADDLE_THROW(platform::errors::Unavailable( + "Getting 'tensor_descs' is not supported by the %s type variable.", + this->Name())); } } proto::VarType::TensorDesc *VarDesc::mutable_tensor_desc() { - PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set."); - PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set."); + PADDLE_ENFORCE_EQ( + desc_.has_type(), true, + platform::errors::NotFound("The variable's type was not be set.")); + PADDLE_ENFORCE_EQ( + desc_.type().has_type(), true, + platform::errors::NotFound("The variable's type was not be set.")); switch (desc_.type().type()) { case proto::VarType::SELECTED_ROWS: return desc_.mutable_type()->mutable_selected_rows(); @@ -240,15 +251,19 @@ proto::VarType::TensorDesc *VarDesc::mutable_tensor_desc() { return desc_.mutable_type()->mutable_tensor_array()->mutable_tensor(); default: PADDLE_THROW( - "Getting 'mutable_tensor_desc' is not supported by the type of var " - "%s.", - this->Name()); + platform::errors::Unavailable("Getting 'mutable_tensor_desc' is not " + "supported by the %s type variable.", + this->Name())); } } std::vector VarDesc::mutable_tensor_descs() { - PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set."); - PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set."); + PADDLE_ENFORCE_EQ( + desc_.has_type(), true, + platform::errors::NotFound("The variable's type was not be set.")); + PADDLE_ENFORCE_EQ( + desc_.type().has_type(), true, + platform::errors::NotFound("The variable's type was not be set.")); std::vector res; res.reserve(GetTensorDescNum()); switch (desc_.type().type()) { @@ -259,10 +274,9 @@ std::vector VarDesc::mutable_tensor_descs() { } return res; default: - PADDLE_THROW( - "Getting 'tensor_descs' is not supported by the type of var " - "%s.", - this->Name()); + 
PADDLE_THROW(platform::errors::Unavailable( + "Getting 'tensor_descs' is not supported by the %s type variable.", + this->Name())); } } diff --git a/paddle/fluid/framework/var_type.h b/paddle/fluid/framework/var_type.h index 43e9ed553bea84aaaaa18a46fe81f06a18b124af..8affeda67b3d07d67ceed2b657b285210e1bd076 100644 --- a/paddle/fluid/framework/var_type.h +++ b/paddle/fluid/framework/var_type.h @@ -40,7 +40,8 @@ inline proto::VarType::Type ToVarType(int type) { case proto::VarType::READER: return static_cast(type); default: - PADDLE_THROW("ToVarType:Unsupported type %d", type); + PADDLE_THROW(platform::errors::Unavailable( + "ToVarType method Unsupported type %d.", type)); } } @@ -66,7 +67,8 @@ inline void VisitVarType(const framework::Variable& var, Visitor visitor) { visitor(var.Get()); return; default: - PADDLE_THROW("Not supported visit type, %s", ToTypeName(var.Type())); + PADDLE_THROW(platform::errors::Unavailable("Not supported visit type %s.", + ToTypeName(var.Type()))); } } diff --git a/paddle/fluid/framework/var_type_traits.cc b/paddle/fluid/framework/var_type_traits.cc index 5c90b07149ec5575f9907e41cc65a826421cf3ec..1e5e8d657556059bae8129e7c0b6ea6b57cbf63f 100644 --- a/paddle/fluid/framework/var_type_traits.cc +++ b/paddle/fluid/framework/var_type_traits.cc @@ -46,12 +46,14 @@ struct VarIdToTypeIndexMapInitializerImpl { static_assert(!std::is_same::value, "Type cannot be void"); constexpr int kId = VarTypeTrait::kId; auto type = std::type_index(typeid(Type)); - PADDLE_ENFORCE(id_to_type->count(kId) == 0, - "Registered duplicate type id %d for type %s", kId, - type.name()); - PADDLE_ENFORCE(type_to_id->count(type) == 0, - "Registered duplicate type_index %s for id %d", type.name(), - kId); + PADDLE_ENFORCE_EQ( + id_to_type->count(kId), 0, + platform::errors::AlreadyExists( + "Registered duplicate type id %d for type %s.", kId, type.name())); + PADDLE_ENFORCE_EQ( + type_to_id->count(type), 0, + platform::errors::AlreadyExists( + "Registered duplicate type 
index %s for id %d.", type.name(), kId)); id_to_type->emplace(kId, type); type_to_id->emplace(type, kId); VarIdToTypeIndexMapInitializerImplsecond; } static int ToTypeId(const std::type_index &type) { auto it = Instance().type_to_id_map_.find(type); - PADDLE_ENFORCE(it != Instance().type_to_id_map_.end(), - "VarType %s is not registered.", type.name()); + PADDLE_ENFORCE_NE(it, Instance().type_to_id_map_.end(), + platform::errors::NotFound( + "Variable Type %s is not registered.", type.name())); return it->second; } diff --git a/paddle/fluid/framework/variable_helper.cc b/paddle/fluid/framework/variable_helper.cc index 67e17410a29aff435921f46eeb2691a025d5a9eb..ec42aa30e5abb3dc3d03633cae31d95999d82731 100644 --- a/paddle/fluid/framework/variable_helper.cc +++ b/paddle/fluid/framework/variable_helper.cc @@ -50,11 +50,11 @@ void InitializeVariable(Variable *var, proto::VarType::Type var_type) { } else if (var_type == proto::VarType::RAW) { // GetMutable will be called in operator } else { - PADDLE_THROW( + PADDLE_THROW(platform::errors::Unavailable( "Variable type %d is not in " "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, " - "LOD_RANK_TABLE, PLACE_LIST, READER, RAW]", - var_type); + "LOD_RANK_TABLE, PLACE_LIST, READER, RAW].", + var_type)); } } @@ -76,7 +76,8 @@ void CopyVariable(const Variable &src_var, Variable *dst_var) { auto *dst_t = tmp_grad_slr->mutable_value(); framework::TensorCopy(src_t, cpu_place, dst_t); } else { - PADDLE_THROW("unknown var type to copy"); + PADDLE_THROW( + platform::errors::Unavailable("Unknown variable type to copy.")); } } diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 9dc96fdfe8622e3e78673664637ab50970fe93c6..cf6fcb7b64365b382c648dd83639e0c44670014d 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -44,10 +44,11 @@ add_subdirectory(api) set(STATIC_INFERENCE_API paddle_inference_api analysis_predictor zero_copy_tensor 
reset_tensor_array analysis_config paddle_pass_builder activation_functions ${mkldnn_quantizer_cfg}) -if(WIN32) +# TODO(xingzhaolong, jiweibo): remove this and create_static_lib(paddle_fluid) on windows GPU +if(WIN32 AND WITH_GPU) cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_API}) else() - create_static_lib(paddle_fluid ${fluid_modules} ${STATIC_INFERENCE_API}) + create_static_lib(paddle_fluid ${fluid_modules} ${STATIC_INFERENCE_API}) endif() if(NOT APPLE AND NOT WIN32) diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc index d6d0371edaa78cde603a7f7d77473682be57df31..be7d6ab868022b5e9e1f073aad441decba0dbf00 100644 --- a/paddle/fluid/inference/analysis/analyzer.cc +++ b/paddle/fluid/inference/analysis/analyzer.cc @@ -27,8 +27,9 @@ Analyzer::Analyzer() {} void Analyzer::Run(Argument *argument) { RunAnalysis(argument); } void Analyzer::RunAnalysis(Argument *argument) { - PADDLE_ENFORCE(argument->analysis_passes_valid(), - "analsis_passes is not valid in the argument."); + PADDLE_ENFORCE_EQ(argument->analysis_passes_valid(), true, + platform::errors::InvalidArgument( + "analsis_passes is not valid in the argument.")); const bool disable_logs = argument->disable_logs(); for (auto &pass : argument->analysis_passes()) { if (!disable_logs) { @@ -38,7 +39,8 @@ void Analyzer::RunAnalysis(Argument *argument) { continue; auto *ptr = PassRegistry::Global().Retreive(pass); - PADDLE_ENFORCE_NOT_NULL(ptr, "no analysis pass called %s", pass); + PADDLE_ENFORCE_NOT_NULL(ptr, platform::errors::PreconditionNotMet( + "no analysis pass called %s", pass)); ptr->Run(argument); } } diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index 79784fcb9bf31e8fac972053b1a4ec6180d45afa..135ef6a970621cea6ee1418f751ffc37406628db 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -75,9 +75,14 
@@ void TestWord2vecPrediction(const std::string& model_path) { std::vector outputs; CHECK(predictor->Run(slots, &outputs)); - PADDLE_ENFORCE_EQ(outputs.size(), 1UL); + PADDLE_ENFORCE_EQ(outputs.size(), 1UL, + platform::errors::PreconditionNotMet( + "Output size should be 1, but got %d", outputs.size())); // Check the output buffer size and result of each tid. - PADDLE_ENFORCE_EQ(outputs.front().data.length(), 33168UL); + PADDLE_ENFORCE_EQ(outputs.front().data.length(), 33168UL, + platform::errors::PreconditionNotMet( + "Output's data length should be 33168 but got %d", + outputs.front().data.length())); float result[5] = {0.00129761, 0.00151112, 0.000423564, 0.00108815, 0.000932706}; const size_t num_elements = outputs.front().data.length() / sizeof(float); diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 27bae7a71ea192ac08e4e87cb7bcdb8b84e29dc8..40ca3e85868fbbba19d81336aed1a8cbee58ec54 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -76,53 +76,62 @@ struct Argument { } } -#define DECL_ARGUMENT_FIELD(field__, Field, type__) \ - public: \ - type__& field__() { \ - PADDLE_ENFORCE(Has(#field__), "There is no such field"); \ - return field__##_; \ - } \ - void Set##Field(const type__& x) { \ - field__##_ = x; \ - valid_fields_.insert(#field__); \ - } \ - DECL_ARGUMENT_FIELD_VALID(field__); \ - type__* field__##_ptr() { return &field__##_; } \ - \ - private: \ +#define DECL_ARGUMENT_FIELD(field__, Field, type__) \ + public: \ + type__& field__() { \ + PADDLE_ENFORCE_EQ( \ + Has(#field__), true, \ + platform::errors::PreconditionNotMet("There is no such field")); \ + return field__##_; \ + } \ + void Set##Field(const type__& x) { \ + field__##_ = x; \ + valid_fields_.insert(#field__); \ + } \ + DECL_ARGUMENT_FIELD_VALID(field__); \ + type__* field__##_ptr() { return &field__##_; } \ + \ + private: \ type__ field__##_; #define 
DECL_ARGUMENT_FIELD_VALID(field__) \ bool field__##_valid() { return Has(#field__); } -#define DECL_ARGUMENT_UNIQUE_FIELD(field__, Field, type__) \ - public: \ - type__& field__() { \ - PADDLE_ENFORCE_NOT_NULL(field__##_); \ - PADDLE_ENFORCE(Has(#field__)); \ - return *static_cast(field__##_.get()); \ - } \ - void Set##Field(type__* x) { \ - field__##_ = \ - unique_ptr_t(x, [](void* x) { delete static_cast(x); }); \ - valid_fields_.insert(#field__); \ - } \ - void Set##Field##NotOwned(type__* x) { \ - valid_fields_.insert(#field__); \ - field__##_ = unique_ptr_t(x, [](void* x) {}); \ - } \ - DECL_ARGUMENT_FIELD_VALID(field__); \ - type__* field__##_ptr() { \ - PADDLE_ENFORCE(Has(#field__)); \ - return static_cast(field__##_.get()); \ - } \ - type__* Release##Field() { \ - PADDLE_ENFORCE(Has(#field__)); \ - valid_fields_.erase(#field__); \ - return static_cast(field__##_.release()); \ - } \ - \ - private: \ +#define DECL_ARGUMENT_UNIQUE_FIELD(field__, Field, type__) \ + public: \ + type__& field__() { \ + PADDLE_ENFORCE_NOT_NULL(field__##_, platform::errors::PreconditionNotMet( \ + "filed should not be null.")); \ + PADDLE_ENFORCE_EQ( \ + Has(#field__), true, \ + platform::errors::PreconditionNotMet("There is no such field")); \ + return *static_cast(field__##_.get()); \ + } \ + void Set##Field(type__* x) { \ + field__##_ = \ + unique_ptr_t(x, [](void* x) { delete static_cast(x); }); \ + valid_fields_.insert(#field__); \ + } \ + void Set##Field##NotOwned(type__* x) { \ + valid_fields_.insert(#field__); \ + field__##_ = unique_ptr_t(x, [](void* x) {}); \ + } \ + DECL_ARGUMENT_FIELD_VALID(field__); \ + type__* field__##_ptr() { \ + PADDLE_ENFORCE_EQ( \ + Has(#field__), true, \ + platform::errors::PreconditionNotMet("There is no such field")); \ + return static_cast(field__##_.get()); \ + } \ + type__* Release##Field() { \ + PADDLE_ENFORCE_EQ( \ + Has(#field__), true, \ + platform::errors::PreconditionNotMet("There is no such field")); \ + 
valid_fields_.erase(#field__); \ + return static_cast(field__##_.release()); \ + } \ + \ + private: \ unique_ptr_t field__##_; DECL_ARGUMENT_FIELD(predictor_id, PredictorID, int); @@ -218,13 +227,19 @@ struct Argument { DECL_ARGUMENT_FIELD(fusion_statis, FusionStatis, fusion_statis_t); + // Only used in paddle-lite subgraph. + DECL_ARGUMENT_FIELD(cpu_math_library_num_threads, CpuMathLibraryNumThreads, + int); + private: std::unordered_set valid_fields_; }; #define ARGUMENT_CHECK_FIELD(argument__, fieldname__) \ - PADDLE_ENFORCE(argument__->Has(#fieldname__), \ - "the argument field [%s] should be set", #fieldname__); + PADDLE_ENFORCE_EQ( \ + argument__->Has(#fieldname__), true, \ + platform::errors::PreconditionNotMet( \ + "the argument field [%s] should be set", #fieldname__)); } // namespace analysis } // namespace inference diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h index a48058400241b030f17557156a4d973fca92fd8d..730fe35853a96a3427c26f1fa5662118a638f731 100644 --- a/paddle/fluid/inference/analysis/helper.h +++ b/paddle/fluid/inference/analysis/helper.h @@ -73,12 +73,15 @@ struct DataTypeNamer { template const std::string &repr() const { auto x = std::type_index(typeid(T)); - PADDLE_ENFORCE(dic_.count(x), "unknown type for representation"); + PADDLE_ENFORCE_GT(dic_.count(x), 0, platform::errors::PreconditionNotMet( + "unknown type for representation")); return dic_.at(x); } const std::string &repr(const std::type_index &type) const { // NOLINT - PADDLE_ENFORCE(dic_.count(type), "unknown type for representation"); + PADDLE_ENFORCE_GT(dic_.count(type), 0, + platform::errors::PreconditionNotMet( + "unknown type for representation")); return dic_.at(type); } @@ -116,7 +119,9 @@ template class OrderedRegistry { public: T *Register(const std::string &name, T *x) { - PADDLE_ENFORCE(!dic_.count(name), "duplicate key [%s]", name); + PADDLE_ENFORCE_EQ(dic_.count(name), 0, + platform::errors::PreconditionNotMet( + "There 
exists duplicate key [%s]", name)); dic_[name] = elements_.size(); elements_.emplace_back(std::unique_ptr(x)); return elements_.back().get(); @@ -136,14 +141,20 @@ class OrderedRegistry { template T &GetFromScope(const framework::Scope &scope, const std::string &name) { framework::Variable *var = scope.FindVar(name); - PADDLE_ENFORCE(var != nullptr); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::PreconditionNotMet( + "The var which name is %s should not be nullptr.", name)); return *var->GetMutable(); } static framework::proto::ProgramDesc LoadProgramDesc( const std::string &model_path) { std::ifstream fin(model_path, std::ios::in | std::ios::binary); - PADDLE_ENFORCE(fin.is_open(), "Cannot open file %s", model_path); + PADDLE_ENFORCE_EQ( + fin.is_open(), true, + platform::errors::NotFound( + "Cannot open file %s, please confirm whether the file exists", + model_path)); fin.seekg(0, std::ios::end); std::string buffer(fin.tellg(), ' '); fin.seekg(0, std::ios::beg); @@ -188,10 +199,12 @@ static std::string GetDirRoot(const std::string &path) { static std::string GetOrCreateModelOptCacheDir(const std::string &model_root) { std::string opt_cache_dir = model_root + "/_opt_cache/"; if (!PathExists(opt_cache_dir)) { - PADDLE_ENFORCE(MKDIR(opt_cache_dir.c_str()) != -1, - "Can not create optimize cache directory: %s, Make sure you " - "have permission to write", - opt_cache_dir); + PADDLE_ENFORCE_NE( + MKDIR(opt_cache_dir.c_str()), -1, + platform::errors::PreconditionNotMet( + "Can not create optimize cache directory: %s, Make sure you " + "have permission to write", + opt_cache_dir)); } return opt_cache_dir; } diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index cd8d86d72938417112e17e86e5cc6dd12254a8d1..d136f5033e7e3783ec6c44bbacb94047c718b935 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -38,7 +38,9 @@ 
IRPassManager::IRPassManager(Argument *argument) { graph_ = std::unique_ptr(new Graph(argument->main_program())); if (argument->Has("scope")) { auto *scope_ptr = argument->scope_ptr(); - PADDLE_ENFORCE(scope_ptr); + PADDLE_ENFORCE_NOT_NULL(scope_ptr, + platform::errors::PreconditionNotMet( + "The scope ptr should not be nullptr.")); graph_->SetNotOwned(framework::ir::kParamScopeAttr, scope_ptr); } @@ -101,13 +103,17 @@ void IRPassManager::CreatePasses(Argument *argument, std::string optim_cache_dir = argument->optim_cache_dir(); bool int8_valid = !(model_from_memory && optim_cache_dir.empty() && enable_int8); - PADDLE_ENFORCE(int8_valid, - "When you are in TRT INT8 mode, and load model from " - "memory, you should set optim_cache_dir using " - "config.SetOptimCacheDir()"); - PADDLE_ENFORCE(!(model_from_memory && use_static_engine), - "When you are using Paddle-TRT, and also using load model " - "from memory, you should set the use_static to false."); + PADDLE_ENFORCE_EQ( + int8_valid, true, + platform::errors::PreconditionNotMet( + "When you are in TRT INT8 mode, and load model from " + "memory, you should set optim_cache_dir using " + "config.SetOptimCacheDir()")); + PADDLE_ENFORCE_EQ( + !(model_from_memory && use_static_engine), true, + platform::errors::PreconditionNotMet( + "When you are using Paddle-TRT, and also using load model " + "from memory, you should set the use_static to false.")); if (!optim_cache_dir.empty()) { pass->Set("model_opt_cache_dir", new std::string(optim_cache_dir)); @@ -150,6 +156,8 @@ void IRPassManager::CreatePasses(Argument *argument, pass->Set("use_xpu", new bool(argument->use_xpu())); pass->Set("xpu_l3_workspace_size", new int(argument->xpu_l3_workspace_size())); + pass->Set("cpu_math_library_num_threads", + new int(argument->cpu_math_library_num_threads())); } disable_logs_ = argument->disable_logs(); if (pass_name == "fc_fuse_pass") { diff --git a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc 
b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc index 6b16a481ddedbad0956d1358de95842ea9a3a101..e78d5ef017b7f8451556d388bf3b8c0a55276a59 100644 --- a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc @@ -244,6 +244,7 @@ void LiteSubgraphPass::SetUpEngine( bool enable_int8 = Get("enable_int8"); bool use_xpu = Get("use_xpu"); int xpu_l3_workspace_size = Get("xpu_l3_workspace_size"); + int cpu_math_library_num_threads = Get("cpu_math_library_num_threads"); lite_api::TargetType target_type; if (use_gpu) { @@ -263,11 +264,12 @@ void LiteSubgraphPass::SetUpEngine( // Notice: The ordering here determines the device where the // input tensor of the Lite engine is located, and then affects // whether tensor sharing is feasible. - paddle::lite::Place({target_type, precision_type}), - paddle::lite::Place({target_type, PRECISION(kInt64)}), - paddle::lite::Place({target_type, PRECISION(kFloat)}), - paddle::lite::Place({TARGET(kHost), PRECISION(kFloat)}), + paddle::lite_api::Place({target_type, precision_type}), + paddle::lite_api::Place({target_type, PRECISION(kInt64)}), + paddle::lite_api::Place({target_type, PRECISION(kFloat)}), + paddle::lite_api::Place({TARGET(kHost), PRECISION(kFloat)}), }; + config.cpu_math_library_num_threads = cpu_math_library_num_threads; config.xpu_l3_workspace_size = xpu_l3_workspace_size; if (dump_model) { lite::StrToBinaryFile("./model.bin", config.model); diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc index b3bfafb0a116018fe2d624f390f355b348e3f847..ebb19fd486cc89c69d70de3fa98954b9ee415f1a 100644 --- a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc +++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc @@ -123,7 +123,9 @@ void RenameAndGetOutputs( auto add_block_var = [&](const std::string &graph_arg, const std::string &block_arg) { auto 
arg_var_node = graph_var_map.find(graph_arg); - PADDLE_ENFORCE(arg_var_node != graph_var_map.end()); + PADDLE_ENFORCE_NE(arg_var_node, graph_var_map.end(), + platform::errors::InvalidArgument( + "Can not find %s in graph_var_map", graph_arg)); auto *var_t = block_desc->Var(block_arg); var_t->SetShape(arg_var_node->second->Var()->GetShape()); var_t->SetDataType(arg_var_node->second->Var()->GetDataType()); @@ -133,7 +135,10 @@ void RenameAndGetOutputs( framework::proto::OpDesc *op = block_desc->Op(index)->Proto(); framework::OpDesc op_desc(*op, nullptr); auto correspond_node = subgraph_nodes[index]; - PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type()); + PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type(), + platform::errors::PreconditionNotMet( + "We should get %s, but get %s", op->type(), + correspond_node->Name())); std::unordered_map var2id; std::unordered_map in_vars; diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 7ef072277fb7f1f13c14b38d64cea6d1f4584b76..46612c1c5b7065a1f87e09117818df8a15e2bd8b 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -97,7 +97,9 @@ void TensorRtSubgraphPass::CreateTensorRTOp( std::vector *repetitive_params) const { auto *op_desc = node->Op(); auto &subgraph = *framework::ir::Agent(node).subgraph(); - PADDLE_ENFORCE(!subgraph.empty()); + PADDLE_ENFORCE_EQ(subgraph.empty(), false, + platform::errors::PreconditionNotMet( + "The subgraph should not be empty.")); framework::ProgramDesc *program_desc = Get("program"); @@ -194,12 +196,17 @@ void TensorRtSubgraphPass::CreateTensorRTOp( // to Tensor. 
std::vector output_mapping; for (auto name : output_names) { - PADDLE_ENFORCE(output_name_map.count(name) != 0); + PADDLE_ENFORCE_NE(output_name_map.count(name), 0, + platform::errors::PreconditionNotMet( + "The output_name_map should have %s", name)); output_mapping.push_back(output_name_map[name]); } - PADDLE_ENFORCE(!output_mapping.empty()); - PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(), - "the block has no var-desc"); + PADDLE_ENFORCE_EQ(output_mapping.empty(), false, + platform::errors::PreconditionNotMet( + "The output_mapping should not be empty.")); + PADDLE_ENFORCE_EQ( + !block_desc.Proto()->vars().empty(), true, + platform::errors::PreconditionNotMet("the block has no var-desc")); // Set attrs op_desc->SetType("tensorrt_engine"); diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc index d986811a827b6ed477b30bc43d26f52a71e8f178..34192965297a6b88c7905a2b1d7b1857d842f06a 100644 --- a/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc @@ -13,6 +13,8 @@ // limitations under the License. #include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h" +#include +#include #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/inference/analysis/ir_pass_manager.h" @@ -31,7 +33,10 @@ void IrAnalysisPass::RunImpl(Argument* argument) { // Apply passes. 
IRPassManager the_ir_manager(argument); graph = the_ir_manager.Apply(std::move(graph)); - PADDLE_ENFORCE_GT(graph->Nodes().size(), 0); + PADDLE_ENFORCE_GT( + graph->Nodes().size(), 0, + platform::errors::PreconditionNotMet( + "The graph nodes size should be greater than 0, but got 0")); argument->SetMainGraph(graph.release()); CollectFusionStatis(argument); } diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc index 970ecdbbeb0c4c12ce6ba928a74a14ca1ae183ca..188b2ff851d96fa76edd666c696d98ddb1dcb948 100644 --- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc @@ -31,7 +31,9 @@ void IrGraphBuildPass::RunImpl(Argument *argument) { if (!argument->scope_valid()) { argument->SetScope(new framework::Scope); } - PADDLE_ENFORCE(argument->use_gpu_valid()); + PADDLE_ENFORCE_EQ(argument->use_gpu_valid(), true, + platform::errors::PreconditionNotMet( + "The use_gpu field should be valid")); // The load program should run on the same device with the inference program, // so that the parameters will on the same device, or they will keep copying @@ -51,14 +53,17 @@ void IrGraphBuildPass::RunImpl(Argument *argument) { argument->model_from_memory_valid() && argument->model_from_memory()); argument->SetMainProgram(program.release()); } else { - PADDLE_THROW( - "either model_dir or (program path and parameter path) should be set."); + PADDLE_THROW(platform::errors::PreconditionNotMet( + "either model_dir or (program path and parameter path) should be " + "set.")); } auto graph = std::unique_ptr(new Graph(argument->main_program())); argument->SetMainGraph(graph.release()); auto *scope_ptr = argument->scope_ptr(); - PADDLE_ENFORCE(scope_ptr); + PADDLE_ENFORCE_NOT_NULL(scope_ptr, + platform::errors::PreconditionNotMet( + "The scope ptr should not be nullptr.")); 
argument->main_graph().SetNotOwned(framework::ir::kParamScopeAttr, scope_ptr); } diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.cc index 1f888a28da0416b41a87b551208fbe109f54d844..c30aa2a1629c3638b1e7714f7d646c924e7156d7 100644 --- a/paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.cc @@ -31,7 +31,8 @@ void IrInferCleanGraphPass::RunImpl(Argument* argument) { std::unordered_set invalid_nodes; int valid_op = 0; for (auto* node : graph.Nodes()) { - PADDLE_ENFORCE_NOT_NULL(node); + PADDLE_ENFORCE_NOT_NULL(node, platform::errors::PreconditionNotMet( + "The node should not be nullptr.")); if (is_valid_node(node)) { invalid_nodes.insert(node); } else if (node->IsOp()) { diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc index fedee3ff95f0ffe7af730c7113dbe6ea33c118e5..f127478b5f2bf4bbc3157c3d825d9b042275d957 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc @@ -23,8 +23,12 @@ namespace inference { namespace analysis { void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) { - PADDLE_ENFORCE(argument->scope_valid()); - PADDLE_ENFORCE(argument->use_gpu_valid()); + PADDLE_ENFORCE_EQ( + argument->scope_valid(), true, + platform::errors::PreconditionNotMet("The scope field should be valid")); + PADDLE_ENFORCE_EQ(argument->use_gpu_valid(), true, + platform::errors::PreconditionNotMet( + "The use_gpu field should be valid")); platform::Place place; @@ -40,7 +44,9 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) { LOG(INFO) << "Sync params from CPU to GPU"; - PADDLE_ENFORCE(argument->gpu_device_id_valid()); + PADDLE_ENFORCE_EQ(argument->gpu_device_id_valid(), 
true, + platform::errors::PreconditionNotMet( + "The gpu_device_id field should be valid")); place = platform::CUDAPlace(argument->gpu_device_id()); auto *scope = argument->scope_ptr(); @@ -56,7 +62,8 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) { continue; } auto *var = scope->FindLocalVar(var_name); - PADDLE_ENFORCE(var != nullptr); + PADDLE_ENFORCE_NOT_NULL(var, platform::errors::PreconditionNotMet( + "The var should not be nullptr")); if (var->IsType() || var->IsType()) { auto *t = var->GetMutable(); diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc index 6fbf880356c541e72cae6f3b03efe017042254ff..f432188131eddc402e696091ab3723697216aadf 100644 --- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc +++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc @@ -90,32 +90,6 @@ void MemoryOptimizePass::CollectLifeCycle( } } -// TODO(Superjomn) Make this a general help method. 
-int DataTypeToSpace(framework::proto::VarType_Type type) { - switch (type) { - case framework::proto::VarType_Type_BOOL: - return sizeof(bool); - case framework::proto::VarType_Type_FP32: - return sizeof(float); - case framework::proto::VarType_Type_INT32: - return sizeof(int32_t); - case framework::proto::VarType_Type_INT64: - return sizeof(int64_t); - case framework::proto::VarType_Type_INT16: - return sizeof(int16_t); - case framework::proto::VarType_Type_FP16: - return sizeof(int16_t); - case framework::proto::VarType_Type_FP64: - return sizeof(double); - case framework::proto::VarType_Type_UINT8: - return sizeof(unsigned char); - case framework::proto::VarType_Type_INT8: - return sizeof(int8_t); - default: - PADDLE_THROW("Unknown data type"); - } -} - void MemoryOptimizePass::CollectVarMemorySize( space_table_t* space_table) const { const int fake_batch_size = 1; @@ -163,7 +137,7 @@ void MemoryOptimizePass::CollectVarMemorySize( int size = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); (*space_table)[node->Var()->Name()] = - size * DataTypeToSpace(node->Var()->GetDataType()); + size * paddle::framework::SizeOfType(node->Var()->GetDataType()); } } } @@ -250,7 +224,9 @@ void UpdateOpDescsByReuse( // modify the graph for (auto input_node : node->inputs) { - PADDLE_ENFORCE(input_node->IsVar()); + PADDLE_ENFORCE_EQ(input_node->IsVar(), true, + platform::errors::PreconditionNotMet( + "The input node should be a variable.")); std::string input_node_name = input_node->Name(); if (reuse_table.count(input_node_name) && reuse_table.at(input_node_name) != input_node_name) { @@ -272,7 +248,9 @@ void UpdateOpDescsByReuse( // modify the graph for (auto out_node : node->outputs) { - PADDLE_ENFORCE(out_node->IsVar()); + PADDLE_ENFORCE_EQ(out_node->IsVar(), true, + platform::errors::PreconditionNotMet( + "The output node should be a variable.")); std::string out_node_name = out_node->Name(); if (reuse_table.count(out_node_name) && 
reuse_table.at(out_node_name) != out_node_name) { diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index fb0ad31a3e612201de32813a65970c73b73b611b..c0d3b14e0e43e10332d18ddd217a8a50245ab5ed 100755 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -53,12 +53,10 @@ if(WITH_TESTING) inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS paddle_fluid_shared ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book) set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification) - set_tests_properties(test_api_impl PROPERTIES LABELS "RUN_TYPE=DIST") elseif(WIN32) inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS ${inference_deps} ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book) set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification) - set_tests_properties(test_api_impl PROPERTIES LABELS "RUN_TYPE=DIST") endif() endif() diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index bb01e7009a56ca0fc36177704547a5ecdadbd9fb..2184574aa1fe3c66728b41f221c1b0bf5fd464e7 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -217,9 +217,21 @@ void AnalysisConfig::EnableMkldnnQuantizer() { Update(); } +void AnalysisConfig::EnableMkldnnBfloat16() { +#ifdef PADDLE_WITH_MKLDNN + use_mkldnn_bfloat16_ = true; +#else + LOG(ERROR) << "Please compile with MKLDNN first to use MkldnnBfloat16"; + use_mkldnn_bfloat16_ = false; +#endif + + Update(); +} + MkldnnQuantizerConfig *AnalysisConfig::mkldnn_quantizer_config() const { PADDLE_ENFORCE_NOT_NULL(mkldnn_quantizer_config_, - "MkldnnQuantizer was not enabled yet."); + platform::errors::PreconditionNotMet( + "MkldnnQuantizer was not enabled yet.")); return mkldnn_quantizer_config_.get(); } @@ -330,6 +342,12 @@ 
void AnalysisConfig::Update() { #endif } + if (use_mkldnn_bfloat16_) { +#ifdef PADDLE_WITH_MKLDNN + pass_builder()->EnableMkldnnBfloat16(); +#endif + } + #ifdef PADDLE_WITH_MKLDNN // Do not optimize when mkldnn is on if (enable_memory_optim_ && !use_mkldnn_) { @@ -398,6 +416,7 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << ";"; ss << use_mkldnn_quantizer_; + ss << use_mkldnn_bfloat16_; ss << model_from_memory_; ss << with_profile_; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index bdde116dd3e046e03d200909b77c340aebe8db2c..6e26533c94e36f111e83295f4ba4355beff6bc03 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -169,7 +169,8 @@ bool AnalysisPredictor::PrepareScope( if (parent_scope) { PADDLE_ENFORCE_NOT_NULL( parent_scope, - "Both program and parent_scope should be set in Clone mode."); + platform::errors::PreconditionNotMet( + "Both program and parent_scope should be set in Clone mode.")); scope_ = parent_scope; status_is_cloned_ = true; } else { @@ -235,7 +236,9 @@ bool AnalysisPredictor::PrepareExecutor() { executor_->Prepare(sub_scope_, *inference_program_, 0, config_.use_feed_fetch_ops_); - PADDLE_ENFORCE_NOT_NULL(sub_scope_); + PADDLE_ENFORCE_NOT_NULL(sub_scope_, + platform::errors::PreconditionNotMet( + "The sub_scope should not be nullptr.")); return true; } @@ -297,7 +300,8 @@ bool AnalysisPredictor::Run(const std::vector &inputs, timer.tic(); // set feed variable framework::Scope *scope = sub_scope_ ? 
sub_scope_ : scope_.get(); - PADDLE_ENFORCE_NOT_NULL(scope, "The scope should not be nullptr."); + PADDLE_ENFORCE_NOT_NULL(scope, platform::errors::PreconditionNotMet( + "The scope should not be nullptr.")); if (!SetFeed(inputs, scope)) { LOG(ERROR) << "fail to set feed"; return false; @@ -399,7 +403,11 @@ bool AnalysisPredictor::GetFetch(std::vector *outputs, outputs->resize(fetches_.size()); for (size_t i = 0; i < fetches_.size(); ++i) { int idx = BOOST_GET_CONST(int, fetches_[i]->GetAttr("col")); - PADDLE_ENFORCE((size_t)idx == i); + PADDLE_ENFORCE_EQ( + static_cast(idx), i, + platform::errors::InvalidArgument( + "Fetch op's col attr(%d) should be equal to the index(%d)", idx, + i)); framework::FetchType &fetch_var = framework::GetFetchVariable(*scope, "fetch", idx); auto &fetch = BOOST_GET(framework::LoDTensor, fetch_var); @@ -435,10 +443,12 @@ void AnalysisPredictor::PrepareArgument() { if (!config_.model_dir().empty()) { argument_.SetModelDir(config_.model_dir()); } else { - PADDLE_ENFORCE( - !config_.params_file().empty(), - "Either model_dir or (param_file, prog_file) should be set."); - PADDLE_ENFORCE(!config_.prog_file().empty()); + PADDLE_ENFORCE_EQ(config_.params_file().empty(), false, + platform::errors::PreconditionNotMet( + "Either model_dir or param_file should be set.")); + PADDLE_ENFORCE_EQ(config_.prog_file().empty(), false, + platform::errors::PreconditionNotMet( + "Either model_dir or prog_file should be set.")); std::string dir = inference::analysis::GetDirRoot(config_.prog_file()); argument_.SetModelProgramPath(config_.prog_file()); @@ -461,6 +471,8 @@ void AnalysisPredictor::PrepareArgument() { } if (config_.lite_engine_enabled()) { + argument_.SetCpuMathLibraryNumThreads( + config_.cpu_math_library_num_threads()); argument_.SetLitePrecisionMode(config_.lite_precision_mode_); argument_.SetLitePassesFilter(config_.lite_passes_filter_); argument_.SetLiteOpsFilter(config_.lite_ops_filter_); @@ -501,7 +513,9 @@ void 
AnalysisPredictor::OptimizeInferenceProgram() { PrepareArgument(); Analyzer().Run(&argument_); - PADDLE_ENFORCE(argument_.scope_valid()); + PADDLE_ENFORCE_EQ( + argument_.scope_valid(), true, + platform::errors::InvalidArgument("The argument scope should be valid.")); VLOG(5) << "to prepare executor"; ARGUMENT_CHECK_FIELD((&argument_), ir_analyzed_program); inference_program_.reset( @@ -523,8 +537,10 @@ std::unique_ptr CreatePaddlePredictor< FLAGS_minloglevel = 2; // GLOG_ERROR } VLOG(3) << "create AnalysisConfig"; - PADDLE_ENFORCE(config.is_valid(), - "Note: Each config can only be used for one predictor."); + PADDLE_ENFORCE_EQ( + config.is_valid(), true, + platform::errors::InvalidArgument( + "Note: Each config can only be used for one predictor.")); if (config.use_gpu()) { static std::once_flag gflags_initialized; @@ -621,7 +637,9 @@ bool AnalysisPredictor::MkldnnQuantize() { } void AnalysisPredictor::PrepareFeedFetch() { - PADDLE_ENFORCE_NOT_NULL(sub_scope_); + PADDLE_ENFORCE_NOT_NULL(sub_scope_, + platform::errors::InvalidArgument( + "The sub_scope should not be nullptr.")); CreateFeedFetchVar(sub_scope_); for (auto *op : inference_program_->Block(0).AllOps()) { if (op->Type() == "feed") { @@ -644,7 +662,8 @@ void AnalysisPredictor::PrepareFeedFetch() { } void AnalysisPredictor::CreateFeedFetchVar(framework::Scope *scope) { - PADDLE_ENFORCE_NOT_NULL(scope); + PADDLE_ENFORCE_NOT_NULL(scope, platform::errors::InvalidArgument( + "The scope should not be nullptr.")); auto *var = scope->Var("feed"); var->GetMutable(); var = scope->Var("fetch"); @@ -665,7 +684,8 @@ AnalysisPredictor::GetInputTensorShape() { std::vector names = GetInputNames(); for (std::string name : names) { auto *var = inference_program_->Block(0).FindVar(name); - PADDLE_ENFORCE_NOT_NULL(var, "input %s does not exist.", name); + PADDLE_ENFORCE_NOT_NULL(var, platform::errors::PreconditionNotMet( + "Input %s does not exist.", name)); input_shapes[name] = var->GetShape(); } return input_shapes; @@ 
-681,7 +701,11 @@ std::vector AnalysisPredictor::GetOutputNames() { std::unique_ptr AnalysisPredictor::GetInputTensor( const std::string &name) { - PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name); + PADDLE_ENFORCE_NOT_NULL( + executor_->scope()->FindVar(name), + platform::errors::PreconditionNotMet( + "The variable named %s is not found in the scope of the exector.", + name)); std::unique_ptr res( new ZeroCopyTensor(static_cast(executor_->scope()))); res->input_or_output_ = true; @@ -698,7 +722,11 @@ std::unique_ptr AnalysisPredictor::GetInputTensor( std::unique_ptr AnalysisPredictor::GetOutputTensor( const std::string &name) { - PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name); + PADDLE_ENFORCE_NOT_NULL( + executor_->scope()->FindVar(name), + platform::errors::PreconditionNotMet( + "he variable named %s is not found in the scope of the exector.", + name)); std::unique_ptr res( new ZeroCopyTensor(static_cast(executor_->scope()))); res->input_or_output_ = false; @@ -759,8 +787,11 @@ bool AnalysisPredictor::LoadProgramDesc() { std::string pb_content; // Read binary std::ifstream fin(filename, std::ios::in | std::ios::binary); - PADDLE_ENFORCE(static_cast(fin.is_open()), "Cannot open file %s", - filename); + PADDLE_ENFORCE_EQ( + static_cast(fin.is_open()), true, + platform::errors::NotFound( + "Cannot open file %s, please confirm whether the file is normal.", + filename)); fin.seekg(0, std::ios::end); pb_content.resize(fin.tellg()); fin.seekg(0, std::ios::beg); @@ -777,7 +808,8 @@ bool AnalysisPredictor::LoadProgramDesc() { bool AnalysisPredictor::LoadParameters() { PADDLE_ENFORCE_NOT_NULL(inference_program_.get(), - "The inference program should be loaded first."); + platform::errors::PreconditionNotMet( + "The inference program should be loaded first.")); const auto &global_block = inference_program_->MutableBlock(0); @@ -853,8 +885,9 @@ void AnalysisPredictor::ClearIntermediateTensor() { #if 
PADDLE_WITH_TENSORRT bool AnalysisPredictor::SaveTrtCalibToDisk() { - PADDLE_ENFORCE(config_.tensorrt_engine_enabled(), - "This func can be invoked only in trt mode"); + PADDLE_ENFORCE_EQ(config_.tensorrt_engine_enabled(), true, + platform::errors::PreconditionNotMet( + "This func can be invoked only in trt mode")); auto &block = inference_program_->Block(0); for (auto &op_desc : block.AllOps()) { if (op_desc->Type() == "tensorrt_engine") { @@ -1015,6 +1048,7 @@ void AnalysisPredictor::SaveOptimModel(const std::string &dir) { template <> std::unique_ptr CreatePaddlePredictor( const AnalysisConfig &config) { + LOG(WARNING) << "Deprecated. Please use CreatePredictor instead."; return CreatePaddlePredictor( config); } diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index dea448f9b03468eabda16d4375ea60348a09efb2..5766919f08e68832886b88b867bc48afa288a955 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -485,4 +485,25 @@ TEST_F(MkldnnQuantizerTest, kl_scaling_factor_unsigned) { } #endif +#ifdef PADDLE_WITH_CUDA +TEST(AnalysisPredictor, bf16_gpu_pass_strategy) { + AnalysisConfig config; + config.SetModel(FLAGS_dirname); + config.SwitchIrOptim(true); + config.EnableUseGpu(100, 0); + config.EnableMkldnnBfloat16(); +#ifdef PADDLE_WITH_MKLDNN + ASSERT_EQ(config.mkldnn_bfloat16_enabled(), true); +#else + ASSERT_EQ(config.mkldnn_bfloat16_enabled(), false); +#endif +} +#endif + +TEST(AnalysisPredictor, bf16_pass_strategy) { + std::vector passes; + PassStrategy passStrategy(passes); + passStrategy.EnableMkldnnBfloat16(); +} + } // namespace paddle diff --git a/paddle/fluid/inference/api/api.cc b/paddle/fluid/inference/api/api.cc index 2f608da531f25e1a5665744f7e9a2968cc9d0d64..840541246aff4d6f5dec1d8b3f8e5892bdcb6e9d 100644 --- a/paddle/fluid/inference/api/api.cc +++ b/paddle/fluid/inference/api/api.cc @@ -62,9 +62,9 @@ 
PaddleBuf &PaddleBuf::operator=(const PaddleBuf &other) { if (other.length() && other.data()) memcpy(data_, other.data(), other.length()); else if (other.length()) - PADDLE_THROW( + PADDLE_THROW(platform::errors::InvalidArgument( "Invalid argument, null pointer data with length %u is passed", - other.length()); + other.length())); length_ = other.length(); memory_owned_ = true; @@ -92,7 +92,8 @@ void PaddleBuf::Resize(size_t length) { length_ = length; memory_owned_ = true; } else { - PADDLE_THROW("The memory is allocated externally, can not Resized"); + PADDLE_THROW(platform::errors::PreconditionNotMet( + "The memory is allocated externally, can not Resized")); } } @@ -105,7 +106,11 @@ void PaddleBuf::Reset(void *data, size_t length) { void PaddleBuf::Free() { if (memory_owned_ && data_) { - PADDLE_ENFORCE_GT(length_, 0UL); + PADDLE_ENFORCE_GT( + length_, 0UL, + platform::errors::PreconditionNotMet( + "The memory used in PaddleBuf %d should be greater than 0", + length_)); delete[] static_cast(data_); data_ = nullptr; length_ = 0; diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 07d6dcf86e9814e5bfc932d8320b549d55fe88ae..c78cdf24dec561f5fd5643cb50ee243a58b3ab6a 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -87,7 +87,9 @@ bool NativePaddlePredictor::Init( if (parent_scope) { scope_ = parent_scope; sub_scope_ = &(parent_scope->NewScope()); - PADDLE_ENFORCE_NOT_NULL(sub_scope_, "create sub scope fail"); + PADDLE_ENFORCE_NOT_NULL(sub_scope_, + platform::errors::PreconditionNotMet( + "The sub_scope should not be nullptr.")); } else { paddle::framework::InitDevices(false); scope_.reset(new paddle::framework::Scope()); @@ -182,7 +184,10 @@ std::unique_ptr NativePaddlePredictor::Clone() { std::unique_ptr cls(new NativePaddlePredictor(config_)); // Hot fix the bug that result diff in multi-thread. // TODO(Superjomn) re-implement a real clone here. 
- PADDLE_ENFORCE_NOT_NULL(dynamic_cast(cls.get())); + PADDLE_ENFORCE_NOT_NULL( + dynamic_cast(cls.get()), + platform::errors::PreconditionNotMet( + "Dynamic_cast from PaddlePredictor to NativePaddlePredictor failed")); if (!dynamic_cast(cls.get())->Init(nullptr)) { LOG(ERROR) << "fail to call Init"; return nullptr; @@ -224,8 +229,13 @@ bool NativePaddlePredictor::SetFeed(const std::vector &inputs, return false; } - PADDLE_ENFORCE_NOT_NULL(input_ptr); - PADDLE_ENFORCE_NOT_NULL(inputs[i].data.data()); + PADDLE_ENFORCE_NOT_NULL(input_ptr, + platform::errors::InvalidArgument( + "The input_ptr should not be nullptr.")); + PADDLE_ENFORCE_NOT_NULL( + inputs[i].data.data(), + platform::errors::InvalidArgument( + "The data of input tensor should not be null.")); if (platform::is_cpu_place(place_)) { // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy. std::memcpy(static_cast(input_ptr), inputs[i].data.data(), @@ -241,7 +251,8 @@ bool NativePaddlePredictor::SetFeed(const std::vector &inputs, platform::CPUPlace(), inputs[i].data.data(), inputs[i].data.length(), dev_ctx->stream()); #else - PADDLE_THROW("Not compile with CUDA, should not reach here."); + PADDLE_THROW(platform::errors::Unavailable( + "Not compile with CUDA, should not reach here.")); #endif } @@ -287,7 +298,11 @@ bool NativePaddlePredictor::GetFetch(std::vector *outputs, outputs->resize(fetchs_.size()); for (size_t i = 0; i < fetchs_.size(); ++i) { int idx = BOOST_GET_CONST(int, fetchs_[i]->GetAttr("col")); - PADDLE_ENFORCE((size_t)idx == i); + PADDLE_ENFORCE_EQ( + static_cast(idx), i, + platform::errors::InvalidArgument( + "Fetch op's col attr(%d) should be equal to the index(%d)", idx, + i)); framework::FetchType &fetch_var = framework::GetFetchVariable(*scope, "fetch", idx); auto fetch = BOOST_GET_CONST(framework::LoDTensor, fetch_var); @@ -318,10 +333,15 @@ std::unique_ptr CreatePaddlePredictor< VLOG(3) << "create NativePaddlePredictor"; if (config.use_gpu) { // 1. 
GPU memory - PADDLE_ENFORCE_GE( - config.fraction_of_gpu_memory, 0.f, - "fraction_of_gpu_memory in the config should be set to range (0., 1.]"); - PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device); + PADDLE_ENFORCE_GE(config.fraction_of_gpu_memory, 0.f, + platform::errors::InvalidArgument( + "fraction_of_gpu_memory in the config should be set " + "to range (0., 1.]")); + PADDLE_ENFORCE_GE(config.device, 0, + platform::errors::PreconditionNotMet( + "Invalid device id %d, the device id should be " + "greater than or equal to 0.", + config.device)); std::vector flags; if (config.fraction_of_gpu_memory >= 0.0f || config.fraction_of_gpu_memory <= 0.95f) { @@ -336,7 +356,9 @@ std::unique_ptr CreatePaddlePredictor< std::unique_ptr predictor(new NativePaddlePredictor(config)); PADDLE_ENFORCE_NOT_NULL( - dynamic_cast(predictor.get())); + dynamic_cast(predictor.get()), + platform::errors::PreconditionNotMet( + "Dynamic_cast from PaddlePredictor to NativePaddlePredictor failed")); if (!dynamic_cast(predictor.get())->Init(nullptr)) { return nullptr; } @@ -351,6 +373,7 @@ std::unique_ptr CreatePaddlePredictor< template <> std::unique_ptr CreatePaddlePredictor( const NativeConfig &config) { + LOG(WARNING) << "Deprecated. 
Please use CreatePredictor instead."; return CreatePaddlePredictor(config); } diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt index 08a1a5428193c2d506f511112e4a26d73c382ff1..6a3760e1f749b2b4875df00b01def57c979b3c93 100644 --- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt +++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -51,8 +51,8 @@ if (WIN32) set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT") set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") + safe_set_static_flag() if (WITH_STATIC_LIB) - safe_set_static_flag() add_definitions(-DSTATIC_LIB) endif() endif() @@ -136,7 +136,7 @@ else() set(DEPS ${DEPS} ${MATH_LIB} ${MKLDNN_LIB} glog gflags_static libprotobuf xxhash ${EXTERNAL_LIB}) - set(DEPS ${DEPS} libcmt shlwapi.lib) + set(DEPS ${DEPS} shlwapi.lib) endif(NOT WIN32) if(WITH_GPU) diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh index d8d9e2187815dcad78ad4ea6be10ad677940bf39..a3e7bec398af7e193a75395ad40175336f5f7503 100755 --- a/paddle/fluid/inference/api/demo_ci/run.sh +++ b/paddle/fluid/inference/api/demo_ci/run.sh @@ -6,8 +6,8 @@ TEST_GPU_CPU=$3 # test both GPU/CPU mode or only CPU mode DATA_DIR=$4 # dataset TENSORRT_INCLUDE_DIR=$5 # TensorRT header file dir, default to /usr/local/TensorRT/include TENSORRT_LIB_DIR=$6 # TensorRT lib file dir, default to /usr/local/TensorRT/lib - -inference_install_dir=${PADDLE_ROOT}/build/fluid_inference_install_dir +MSVC_STATIC_CRT=$7 +inference_install_dir=${PADDLE_ROOT}/build/paddle_inference_install_dir cd `dirname $0` current_dir=`pwd` @@ -66,43 +66,54 @@ mkdir -p build cd build rm -rf * -if [ $(echo `uname` | grep "Win") != "" ]; then - # -----simple_on_word2vec on windows----- - cmake .. 
-G "Visual Studio 14 2015" -A x64 -DPADDLE_LIB=${inference_install_dir} \ - -DWITH_MKL=$TURN_ON_MKL \ - -DDEMO_NAME=simple_on_word2vec \ - -DWITH_GPU=$TEST_GPU_CPU \ - -DWITH_STATIC_LIB=OFF - msbuild /maxcpucount /property:Configuration=Release cpp_inference_demo.sln - Release/simple_on_word2vec.exe \ - --dirname=$DATA_DIR/word2vec/word2vec.inference.model \ - --use_gpu=False - if [ $? -ne 0 ]; then - echo "simple_on_word2vec demo runs fail." - exit 1 - fi - - # -----vis_demo on windows----- - rm -rf * - cmake .. -G "Visual Studio 14 2015" -A x64 -DPADDLE_LIB=${inference_install_dir} \ - -DWITH_MKL=$TURN_ON_MKL \ - -DDEMO_NAME=vis_demo \ - -DWITH_GPU=$TEST_GPU_CPU \ - -DWITH_STATIC_LIB=OFF - msbuild /maxcpucount /property:Configuration=Release cpp_inference_demo.sln - for vis_demo_name in $vis_demo_list; do - Release/vis_demo.exe \ - --modeldir=$DATA_DIR/$vis_demo_name/model \ - --data=$DATA_DIR/$vis_demo_name/data.txt \ - --refer=$DATA_DIR/$vis_demo_name/result.txt \ - --use_gpu=False - if [ $? -ne 0 ]; then - echo "vis demo $vis_demo_name runs fail." - exit 1 +for WITH_STATIC_LIB in ON OFF; do + if [ $(echo `uname` | grep "Win") != "" ]; then + # TODO(xingzhaolong, jiweibo): remove this if windows GPU library is ready. + if [ $TEST_GPU_CPU == ON] && [ $WITH_STATIC_LIB ==ON ]; then + return 0 fi - done -else - for WITH_STATIC_LIB in ON OFF; do + + # -----simple_on_word2vec on windows----- + cmake .. -G "Visual Studio 14 2015" -A x64 -DPADDLE_LIB=${inference_install_dir} \ + -DWITH_MKL=$TURN_ON_MKL \ + -DDEMO_NAME=simple_on_word2vec \ + -DWITH_GPU=$TEST_GPU_CPU \ + -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ + -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT + msbuild /maxcpucount /property:Configuration=Release cpp_inference_demo.sln + for use_gpu in $use_gpu_list; do + Release/simple_on_word2vec.exe \ + --dirname=$DATA_DIR/word2vec/word2vec.inference.model \ + --use_gpu=$use_gpu + if [ $? -ne 0 ]; then + echo "simple_on_word2vec demo runs fail." 
+ exit 1 + fi + done + + # -----vis_demo on windows----- + rm -rf * + cmake .. -G "Visual Studio 14 2015" -A x64 -DPADDLE_LIB=${inference_install_dir} \ + -DWITH_MKL=$TURN_ON_MKL \ + -DDEMO_NAME=vis_demo \ + -DWITH_GPU=$TEST_GPU_CPU \ + -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ + -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT + msbuild /maxcpucount /property:Configuration=Release cpp_inference_demo.sln + for use_gpu in $use_gpu_list; do + for vis_demo_name in $vis_demo_list; do + Release/vis_demo.exe \ + --modeldir=$DATA_DIR/$vis_demo_name/model \ + --data=$DATA_DIR/$vis_demo_name/data.txt \ + --refer=$DATA_DIR/$vis_demo_name/result.txt \ + --use_gpu=$use_gpu + if [ $? -ne 0 ]; then + echo "vis demo $vis_demo_name runs fail." + exit 1 + fi + done + done + else # -----simple_on_word2vec on linux/mac----- rm -rf * cmake .. -DPADDLE_LIB=${inference_install_dir} \ @@ -123,7 +134,6 @@ else fi done fi - # ---------vis_demo on linux/mac--------- rm -rf * cmake .. -DPADDLE_LIB=${inference_install_dir} \ @@ -145,7 +155,6 @@ else fi done done - # --------tensorrt mobilenet on linux/mac------ if [ $USE_TENSORRT == ON -a $TEST_GPU_CPU == ON ]; then rm -rf * @@ -167,6 +176,6 @@ else exit 1 fi fi - done -fi + fi +done set +x diff --git a/paddle/fluid/inference/api/demo_ci/run_windows_demo.bat b/paddle/fluid/inference/api/demo_ci/run_windows_demo.bat index 5199b83413af87eacba6f26f4fc0a9acb6a39808..523dafa6649b9faa019edc1c1926b5fa408e03d5 100644 --- a/paddle/fluid/inference/api/demo_ci/run_windows_demo.bat +++ b/paddle/fluid/inference/api/demo_ci/run_windows_demo.bat @@ -21,7 +21,7 @@ if /i "%use_mkl%"=="N" ( ) :set_paddle_infernece_lib -SET /P paddle_infernece_lib="Please input the path of paddle inference library, such as D:\fluid_inference_install_dir =======>" +SET /P paddle_infernece_lib="Please input the path of paddle inference library, such as D:\paddle_inference_install_dir =======>" set tmp_var=!paddle_infernece_lib! call:remove_space set paddle_infernece_lib=!tmp_var! 
diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index a5a0405ac88ad8e94a65d728557ab9298eae56dc..46755eeda660ae8f4c54d318f6450fbf1d48b1f7 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -21,15 +21,21 @@ namespace paddle { void ZeroCopyTensor::Reshape(const std::vector &shape) { - PADDLE_ENFORCE(!name_.empty(), - "Need to SetName first, so that the corresponding tensor can " - "be retrieved."); - PADDLE_ENFORCE(input_or_output_, - "Can't reshape the output tensor, it is readonly"); - PADDLE_ENFORCE(scope_); + PADDLE_ENFORCE_EQ( + name_.empty(), false, + platform::errors::PreconditionNotMet( + "Need to SetName first, so that the corresponding tensor can " + "be retrieved.")); + PADDLE_ENFORCE_EQ(input_or_output_, true, + platform::errors::PermissionDenied( + "Can't reshape the output tensor, it is readonly")); + PADDLE_ENFORCE_NOT_NULL(scope_, platform::errors::PreconditionNotMet( + "The scope should not be nullptr.")); auto *scope = static_cast(scope_); auto *var = scope->FindVar(name_); - PADDLE_ENFORCE(var, "No tensor called [%s] in the runtime scope", name_); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::PreconditionNotMet( + "No tensor called [%s] in the runtime scope", name_)); auto *tensor = var->GetMutable(); tensor->Resize(framework::make_ddim(shape)); } @@ -45,8 +51,10 @@ T *ZeroCopyTensor::mutable_data(PaddlePlace place) { EAGER_GET_TENSOR; PADDLE_ENFORCE_GT( tensor->numel(), 0, - "You should call ZeroCopyTensor::Reshape(const std::vector &shape)" - "function before retrieving mutable_data from input tensor."); + platform::errors::PreconditionNotMet( + "You should call ZeroCopyTensor::Reshape(const std::vector " + "&shape)" + "function before retrieving mutable_data from input tensor.")); switch (static_cast(place)) { case static_cast(PaddlePlace::kCPU): { return 
tensor->mutable_data(platform::CPUPlace()); @@ -55,7 +63,8 @@ T *ZeroCopyTensor::mutable_data(PaddlePlace place) { return tensor->mutable_data(platform::CUDAPlace(device_)); } default: - PADDLE_THROW("Unsupported place: %d", static_cast(place)); + PADDLE_THROW(platform::errors::Unavailable("Unsupported place: %d", + static_cast(place))); break; } return nullptr; @@ -96,10 +105,11 @@ PaddleDType ZeroCopyTensor::type() const { template void ZeroCopyTensor::copy_from_cpu(const T *data) { EAGER_GET_TENSOR; - PADDLE_ENFORCE_GE( - tensor->numel(), 0, - "You should call ZeroCopyTensor::Reshape(const std::vector &shape)" - "function before copying data from cpu."); + PADDLE_ENFORCE_GE(tensor->numel(), 0, + platform::errors::PreconditionNotMet( + "You should call ZeroCopyTensor::Reshape(const " + "std::vector &shape)" + "function before copying data from cpu.")); size_t ele_size = tensor->numel() * sizeof(T); if (place_ == PaddlePlace::kCPU) { @@ -116,7 +126,8 @@ void ZeroCopyTensor::copy_from_cpu(const T *data) { memory::Copy(gpu_place, static_cast(t_data), platform::CPUPlace(), data, ele_size, dev_ctx->stream()); #else - PADDLE_THROW("Not compiled with CUDA, should not reach here."); + PADDLE_THROW(platform::errors::Unavailable( + "Not compiled with CUDA, should not reach here.")); #endif } } @@ -141,7 +152,8 @@ void ZeroCopyTensor::copy_to_cpu(T *data) { cudaStreamSynchronize(dev_ctx->stream()); #else - PADDLE_THROW("Not compile with CUDA, should not reach here."); + PADDLE_THROW(platform::errors::Unavailable( + "Not compile with CUDA, should not reach here.")); #endif } } @@ -176,20 +188,27 @@ template PD_INFER_DECL uint8_t *ZeroCopyTensor::mutable_data( PaddlePlace place); void *ZeroCopyTensor::FindTensor() const { - PADDLE_ENFORCE(!name_.empty(), - "Need to SetName first, so that the corresponding tensor can " - "be retrieved."); - PADDLE_ENFORCE(scope_); + PADDLE_ENFORCE_EQ( + name_.empty(), false, + platform::errors::PreconditionNotMet( + "Need to SetName first, so 
that the corresponding tensor can " + "be retrieved.")); + PADDLE_ENFORCE_NOT_NULL(scope_, platform::errors::PreconditionNotMet( + "The scope should not be nullptr.")); auto *scope = static_cast(scope_); auto *var = scope->FindVar(name_); - PADDLE_ENFORCE(var, "No tensor called [%s] in the runtime scope", name_); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::PreconditionNotMet( + "No tensor called [%s] in the runtime scope", name_)); auto *tensor = var->GetMutable(); return tensor; } std::vector ZeroCopyTensor::shape() const { EAGER_GET_TENSOR; - PADDLE_ENFORCE(tensor_, "not found tensor called %s in the scope", name_); + PADDLE_ENFORCE_NOT_NULL( + tensor_, platform::errors::PreconditionNotMet( + "Not found tensor called %s in the scope", name_)); return framework::vectorize(tensor->dims()); } diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index cddb0c8daf97b2b8142fcc3b207be2c56a43988a..014985661fd927debb48c699a157c0e05265842c 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -112,16 +112,19 @@ static T convert(const std::string &item, std::string message = "invalid_argument exception when try to convert : " + item; LOG(ERROR) << message; - PADDLE_THROW(message); + PADDLE_THROW(platform::errors::InvalidArgument( + "invalid_argument exception when try to convert %s.", item)); } catch (std::out_of_range &e) { std::string message = "out_of_range exception when try to convert : " + item; LOG(ERROR) << message; - PADDLE_THROW(message); + PADDLE_THROW(platform::errors::InvalidArgument( + "out_of_range exception when try to convert %s.", item)); } catch (...) 
{ std::string message = "unexpected exception when try to convert " + item; LOG(ERROR) << message; - PADDLE_THROW(message); + PADDLE_THROW(platform::errors::InvalidArgument( + "unexpected exception when try to convert %s.", item)); } return res; } @@ -353,7 +356,8 @@ static void PrintTime(int batch_size, int repeat, int num_threads, int tid, double batch_latency, int epoch = 1, const framework::proto::VarType::Type data_type = framework::proto::VarType::FP32) { - PADDLE_ENFORCE_GT(batch_size, 0, "Non-positive batch size."); + PADDLE_ENFORCE_GT(batch_size, 0, platform::errors::InvalidArgument( + "Non-positive batch size.")); double sample_latency = batch_latency / batch_size; LOG(INFO) << "====== threads: " << num_threads << ", thread id: " << tid << " ======"; diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.cc b/paddle/fluid/inference/api/mkldnn_quantizer.cc index 9be12ff309acff681da75f7f13e317a408a9552a..793fc53d90b768050572a3dd0a080a5d30e959a2 100644 --- a/paddle/fluid/inference/api/mkldnn_quantizer.cc +++ b/paddle/fluid/inference/api/mkldnn_quantizer.cc @@ -62,9 +62,12 @@ bool AnalysisPredictor::MkldnnQuantizer::CalculateScales() { if (scales_.find(var_name) != scales_.end()) continue; auto* var = predictor_.sub_scope_->FindVar(var_name); - PADDLE_ENFORCE(var, "%s is not in the scope", var_name); - PADDLE_ENFORCE(var->IsType(), - "Only support lod tensor now."); + PADDLE_ENFORCE_NOT_NULL(var, + platform::errors::PreconditionNotMet( + "%s is not in the scope", var_name)); + PADDLE_ENFORCE_EQ(var->IsType(), true, + platform::errors::PreconditionNotMet( + "Only support lod tensor now.")); LoDTensor* var_tensor = var->GetMutable(); // force unsigned type if already know it @@ -82,9 +85,11 @@ bool AnalysisPredictor::MkldnnQuantizer::CalculateScales() { } else if (op->Type() == "transpose2" || op->Type() == "reshape2" || op->Type() == "pool2d") { auto input_var_name = op->Input("X")[0]; - PADDLE_ENFORCE(scales_.find(input_var_name) != scales_.end(), - "Input 
scales must be calculated before the " - "output scales to infer if output is unsigned."); + PADDLE_ENFORCE_NE( + scales_.find(input_var_name), scales_.end(), + platform::errors::PreconditionNotMet( + "Input scales must be calculated before the " + "output scales to infer if output is unsigned.")); if (scales_.find(input_var_name) != scales_.end()) { scales_[var_name] = scales_[input_var_name]; } @@ -94,10 +99,11 @@ bool AnalysisPredictor::MkldnnQuantizer::CalculateScales() { is_unsigned = true; double min_scale = std::numeric_limits::max(); for (auto input_var_name : op->Input("X")) { - PADDLE_ENFORCE( - scales_.find(input_var_name) != scales_.end(), - "Input scales must be calculated before the " - "output scales to infer if output is unsigned."); + PADDLE_ENFORCE_NE( + scales_.find(input_var_name), scales_.end(), + platform::errors::PreconditionNotMet( + "Input scales must be calculated before the " + "output scales to infer if output is unsigned.")); is_unsigned = is_unsigned && scales_[input_var_name].first; min_scale = std::min( min_scale, @@ -132,11 +138,12 @@ void AnalysisPredictor::MkldnnQuantizer::CalculateSingleScale( auto rule = qconfig_->scale_algo(op_type_name, conn_name); if (rule == ScaleAlgo::NONE) return; - PADDLE_ENFORCE( - var_tensor.numel() > 0, - "MkldnnQuantizer: LoDTensor of variable %s for quantization of op " - "%s of connection %s should not be empty.", - var_name, op_type_name, conn_name); + PADDLE_ENFORCE_GT( + var_tensor.numel(), 0, + platform::errors::InvalidArgument( + "MkldnnQuantizer: LoDTensor of variable %s for quantization of op " + "%s of connection %s should not be empty.", + var_name, op_type_name, conn_name)); switch (rule) { case ScaleAlgo::MAX: @@ -205,10 +212,11 @@ AnalysisPredictor::MkldnnQuantizer::GetKLScalingFactor( float min_val = eigen_tensor.minCoeff(); bool is_positive = min_val >= 0.0f; if (is_unsigned) - PADDLE_ENFORCE( - is_positive, - "Tensor is claimed to be unsigned, but its min value (%f) is < 0.0", - 
min_val); + PADDLE_ENFORCE_EQ( + is_positive, true, + platform::errors::InvalidArgument( + "Tensor is claimed to be unsigned, but its min value (%f) is < 0.0", + min_val)); int num_quantized_bins = 255; @@ -316,10 +324,11 @@ AnalysisPredictor::MkldnnQuantizer::GetMaxScalingFactor( float max_abs = eigen_tensor.abs().maxCoeff(); float min_val = eigen_tensor.minCoeff(); if (is_unsigned) - PADDLE_ENFORCE( - min_val >= 0.0f, - "Tensor is claimed to be unsigned, but its min value (%f) is < 0.0", - min_val); + PADDLE_ENFORCE_GE( + min_val, 0.0f, + platform::errors::InvalidArgument( + "Tensor is claimed to be unsigned, but its min value (%f) is < 0.0", + min_val)); LoDTensor scale_tensor = CreateScaleTensor(); scale_tensor.data()[0] = 1.0 / max_abs; @@ -330,16 +339,19 @@ AnalysisPredictor::MkldnnQuantizer::GetMaxScalingFactor( std::pair AnalysisPredictor::MkldnnQuantizer::GetMaxChScalingFactor( const LoDTensor& var_tensor, bool is_unsigned, bool is_transposed) const { - PADDLE_ENFORCE(var_tensor.dims().size() > 0, "Tensor dimension is empty."); + PADDLE_ENFORCE_GT( + var_tensor.dims().size(), 0, + platform::errors::InvalidArgument("Tensor dimension is empty.")); ConstEigenVectorArrayMap eigen_tensor{var_tensor.data(), var_tensor.numel(), 1}; float min_val = eigen_tensor.minCoeff(); if (is_unsigned) - PADDLE_ENFORCE( - min_val >= 0.0f, - "Tensor is claimed to be unsigned, but its min value (%f) is < 0.0", - min_val); + PADDLE_ENFORCE_GE( + min_val, 0.0f, + platform::errors::InvalidArgument( + "Tensor is claimed to be unsigned, but its min value (%f) is < 0.0", + min_val)); auto dims = var_tensor.dims(); constexpr int num_col_dims = 1; @@ -367,17 +379,19 @@ AnalysisPredictor::MkldnnQuantizer::Histogram( const framework::LoDTensor& var_tensor, float min_val, float max_val, size_t num_bins) const { PADDLE_ENFORCE_GT(num_bins, 0, - "MkldnnQuantizer: To calculate Histogram, num_bins (" + - std::to_string(num_bins) + ") must be positive."); - PADDLE_ENFORCE_GT( - 
var_tensor.numel(), 0, - "MkldnnQuantizer: To calculate Histogram, the tensor must not be empty."); - PADDLE_ENFORCE(max_val >= min_val, - "MkldnnQuantizer: To calculate Histogram, max_val (" + - std::to_string(max_val) + - ") must be greater or equal" - "to min_val (" + - std::to_string(min_val) + ")."); + platform::errors::InvalidArgument( + "MkldnnQuantizer: To calculate Histogram, num_bins (" + + std::to_string(num_bins) + ") must be positive.")); + PADDLE_ENFORCE_GT(var_tensor.numel(), 0, + platform::errors::InvalidArgument( + "MkldnnQuantizer: To calculate Histogram, the tensor " + "must not be empty.")); + PADDLE_ENFORCE_GE(max_val, min_val, + platform::errors::InvalidArgument( + "MkldnnQuantizer: To calculate Histogram, max_val (" + + std::to_string(max_val) + ") must be greater or equal" + "to min_val (" + + std::to_string(min_val) + ").")); ConstEigenVectorArrayMap eigen_tensor{var_tensor.data(), var_tensor.numel(), 1}; auto bin_width = std::abs(max_val - min_val) / num_bins; @@ -407,7 +421,8 @@ void AnalysisPredictor::MkldnnQuantizer::PrepareArgument() const { auto graph = std::unique_ptr(new Graph(arg.main_program())); arg.SetMainGraph(graph.release()); auto* scope_ptr = arg.scope_ptr(); - PADDLE_ENFORCE(scope_ptr); + PADDLE_ENFORCE_NOT_NULL(scope_ptr, platform::errors::PreconditionNotMet( + "The scope should not be nullptr.")); arg.main_graph().SetNotOwned(framework::ir::kParamScopeAttr, scope_ptr); auto* builder = predictor_.config_.pass_builder(); @@ -441,7 +456,9 @@ bool AnalysisPredictor::MkldnnQuantizer::RunQuantizePasses() const { PrepareArgument(); auto& arg = predictor_.argument_; Analyzer().Run(&arg); - PADDLE_ENFORCE(arg.scope_valid()); + PADDLE_ENFORCE_EQ( + arg.scope_valid(), true, + platform::errors::PreconditionNotMet("The scope should be valid.")); VLOG(5) << "to prepare executor"; ARGUMENT_CHECK_FIELD((&arg), ir_analyzed_program); predictor_.inference_program_.reset( @@ -456,7 +473,8 @@ bool 
AnalysisPredictor::MkldnnQuantizer::RunWarmup() const { VLOG(3) << "Predictor: run a quantization warmup iteration"; auto warmup_data = qconfig_->warmup_data(); PADDLE_ENFORCE_NOT_NULL(warmup_data, - "Warmup data cannot be NULL in the config."); + platform::errors::PreconditionNotMet( + "Warmup data cannot be NULL in the config.")); PrettyLogH1("--- Running warmup iteration for quantization"); // Run the inference program @@ -469,7 +487,10 @@ bool AnalysisPredictor::MkldnnQuantizer::RunWarmup() const { float AnalysisPredictor::MkldnnQuantizer::SafeEntropy( std::vector reference_distr_P, int P_sum, std::vector candidate_distr_Q, int Q_sum) const { - PADDLE_ENFORCE_EQ(reference_distr_P.size(), candidate_distr_Q.size()); + PADDLE_ENFORCE_EQ(reference_distr_P.size(), candidate_distr_Q.size(), + platform::errors::InvalidArgument( + "The P size %d should be equal to Q size %d", + reference_distr_P.size(), candidate_distr_Q.size())); float tmp_sum1 = 0; float tmp_sum2 = 0; for (size_t idx = 0; idx < reference_distr_P.size(); idx++) { @@ -479,10 +500,11 @@ float AnalysisPredictor::MkldnnQuantizer::SafeEntropy( tmp_sum1 += 0; tmp_sum2 += 0; } else { - PADDLE_ENFORCE(q_idx != 0, "MkldnnQuantizer: Fatal error!, idx = " + - std::to_string(idx) + - " qindex = 0! p_idx = " + - std::to_string(p_idx)); + PADDLE_ENFORCE_NE( + q_idx, 0, + platform::errors::PreconditionNotMet( + "MkldnnQuantizer: Fatal error!, idx = " + std::to_string(idx) + + " qindex = 0! 
p_idx = " + std::to_string(p_idx))); } tmp_sum1 += p_idx * (log(Q_sum * p_idx)); tmp_sum2 += p_idx * (log(P_sum * q_idx)); diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 6a31ff281c68e3675d35c14059a453455ef398df..b1244e4e3dfdd5e6a627054250e6def2a7c35a89 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -401,6 +401,19 @@ struct PD_INFER_DECL AnalysisConfig { /// void EnableMkldnnQuantizer(); + /// + /// \brief Turn on MKLDNN bfloat16. + /// + /// + void EnableMkldnnBfloat16(); + + /// + /// \brief A boolean state telling whether to use the MKLDNN Bfloat16. + /// + /// \return bool Whether to use the MKLDNN Bfloat16. + /// + bool mkldnn_bfloat16_enabled() const { return use_mkldnn_bfloat16_; } + /// /// \brief A boolean state telling whether the thread local CUDA stream is /// enabled. @@ -592,6 +605,7 @@ struct PD_INFER_DECL AnalysisConfig { int mkldnn_cache_capacity_{0}; bool use_mkldnn_quantizer_{false}; std::shared_ptr mkldnn_quantizer_config_; + bool use_mkldnn_bfloat16_{false}; // If the config is already used on a predictor, it becomes invalid. // Any config can only be used with one predictor. 
diff --git a/paddle/fluid/inference/api/paddle_infer_declare.h b/paddle/fluid/inference/api/paddle_infer_declare.h index 39c9653f16cefb71a9f2a0ddcc08723d189d411c..e8525f440fe7f2d54d045eedb79aed228513e550 100644 --- a/paddle/fluid/inference/api/paddle_infer_declare.h +++ b/paddle/fluid/inference/api/paddle_infer_declare.h @@ -17,11 +17,7 @@ #if defined(_WIN32) #ifndef PD_INFER_DECL #ifdef PADDLE_DLL_INFERENCE -#ifndef PADDLE_ON_INFERENCE -#define PD_INFER_DECL -#else #define PD_INFER_DECL __declspec(dllexport) -#endif // PADDLE_ON_INFERENCE #else #define PD_INFER_DECL __declspec(dllimport) #endif // PADDLE_DLL_INFERENCE diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index da5d7411693c92eaa2066c7f76d56970f8939bc7..5dc4430fde4715fe11c19ce8adc7397f77391fc3 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -31,12 +31,30 @@ limitations under the License. */ #include "paddle_analysis_config.h" // NOLINT #include "paddle_api.h" // NOLINT +/// +/// \file paddle_inference_api.h +/// +/// \brief Paddle Inference API +/// +/// \author paddle-infer@baidu.com +/// \date 2020-09-01 +/// \since 2.0.0-beta +/// + namespace paddle_infer { using DataType = paddle::PaddleDType; using PlaceType = paddle::PaddlePlace; using PrecisionType = paddle::AnalysisConfig::Precision; using Config = paddle::AnalysisConfig; +/// +/// \class Tensor +/// +/// \brief Represents an n-dimensional array of values. +/// The Tensor is used to store the input or output of the network. +/// It is obtained through Predictor::GetinputHandle() +/// and Predictor::GetOutputHandle() interface. 
+/// class PD_INFER_DECL Tensor { public: // Can only be created by predictor->GetInputHandle(cosnt std::string& name) @@ -44,60 +62,186 @@ class PD_INFER_DECL Tensor { Tensor() = delete; explicit Tensor(std::unique_ptr&& tensor) : tensor_(std::move(tensor)) {} + + /// + /// \brief Reset the shape of the tensor. + /// Generally it's only used for the input tensor. + /// Reshape must be called before calling mutable_data() or CopyFromCpu() + /// \param shape The shape to set. + /// void Reshape(const std::vector& shape); + /// + /// \brief Copy the host memory to tensor data. + /// It's usually used to set the input tensor data. + /// \param data The pointer of the data, from which the tensor will copy. + /// template void CopyFromCpu(const T* data); - // should add the place + /// + /// \brief Get the memory pointer in CPU or GPU with specific data type. + /// Please Reshape the tensor first before call this. + /// It's usually used to get input data pointer. + /// \param place The place of the tensor. + /// \return The tensor data buffer pointer. + /// template T* mutable_data(PlaceType place); + /// + /// \brief Copy the tensor data to the host memory. + /// It's usually used to get the output tensor data. + /// \param[out] data The tensor will copy the data to the address. + /// template void CopyToCpu(T* data); + /// + /// \brief Get the memory pointer directly. + /// It's usually used to get the output data pointer. + /// \param[out] place To get the device type of the tensor. + /// \param[out] size To get the data size of the tensor. + /// \return The tensor data buffer pointer. + /// template T* data(PlaceType* place, int* size) const; + /// + /// \brief Set lod info of the tensor. + /// More about LOD can be seen here: + /// https://www.paddlepaddle.org.cn/documentation/docs/zh/beginners_guide/basic_concept/lod_tensor.html#lodtensor + /// \param x the lod info. + /// void SetLoD(const std::vector>& x); + + /// \brief Return the lod info of the tensor. 
std::vector> lod() const; + /// \brief Return the data type of the tensor. + /// It's usually used to get the output tensor data type. + /// \return The data type of the tensor. DataType type() const; + /// \brief Return the shape of the Tensor. std::vector shape() const; + + /// \brief Return the name of the tensor. const std::string& name() const; private: std::unique_ptr tensor_; }; +/// +/// \class Predictor +/// +/// \brief Predictor is the interface for model prediction. +/// +/// The predictor has the following typical uses: +/// +/// Get predictor +/// \code{cpp} +/// auto predictor = CreatePredictor(config); +/// \endcode +/// +/// Get input or output names +/// \code{cpp} +/// auto input_names = predictor->GetInputNames(); +/// auto output_names = predictor->GetOutputNames(); +/// \endcode +/// +/// Get input or output handle +/// \code{cpp} +/// auto input_t = predictor->GetInputHandle(input_names[0]); +/// auto output_t = predictor->GetOutputHandle(output_names[0]); +/// \endcode +/// +/// Run predictor +/// \code{cpp} +/// predictor->Run(); +/// \endcode +/// class PD_INFER_DECL Predictor { public: - Predictor() = default; + Predictor() = delete; ~Predictor() {} // Use for clone explicit Predictor(std::unique_ptr&& pred) : predictor_(std::move(pred)) {} + /// + /// \brief Construct a new Predictor object + /// + /// \param[in] Config config + /// explicit Predictor(const Config& config); + /// + /// \brief Get the input names + /// + /// \return input names + /// std::vector GetInputNames(); + + /// + /// \brief Get the Input Tensor object + /// + /// \param[in] name input name + /// \return input tensor + /// std::unique_ptr GetInputHandle(const std::string& name); + /// + /// \brief Run the prediction engine + /// + /// \return Whether the function executed successfully + /// bool Run(); + /// + /// \brief Get the output names + /// + /// \return output names + /// std::vector GetOutputNames(); + + /// + /// \brief Get the Output Tensor object + /// 
+ /// \param[in] name otuput name + /// \return output tensor + /// std::unique_ptr GetOutputHandle(const std::string& name); + /// + /// \brief Clone to get the new predictor. thread safe. + /// + /// \return get a new predictor + /// std::unique_ptr Clone(); + + /// \brief Clear the intermediate tensors of the predictor void ClearIntermediateTensor(); private: std::unique_ptr predictor_; }; +/// +/// \brief A factory to help create predictors. +/// +/// Usage: +/// +/// \code{.cpp} +/// Config config; +/// ... // change the configs. +/// auto predictor = CreatePredictor(config); +/// \endcode +/// PD_INFER_DECL std::shared_ptr CreatePredictor( const Config& config); // NOLINT + PD_INFER_DECL int GetNumBytesOfDataType(DataType dtype); PD_INFER_DECL std::string GetVersion(); @@ -128,13 +272,24 @@ T* Tensor::data(PlaceType* place, int* size) const { namespace paddle_infer { namespace services { +/// +/// \class PredictorPool +/// +/// \brief PredictorPool is a simple encapsulation of Predictor, suitable for +/// use in multi-threaded situations. According to the thread id, the +/// corresponding Predictor is taken out from PredictorPool to complete the +/// prediction. +/// class PD_INFER_DECL PredictorPool { public: PredictorPool() = delete; PredictorPool(const PredictorPool&) = delete; PredictorPool& operator=(const PredictorPool&) = delete; + /// \brief Construct the predictor pool with \param size predictor instances. explicit PredictorPool(const Config& config, size_t size = 1); + + /// \brief Get \param id-th predictor. 
Predictor* Retrive(size_t idx); private: diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 43d0a2a9d0ad3ebf88feaae8df6208dc109f0b41..19f52422b441faf45204f47adbcf4e6aae30f6f1 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -143,6 +143,10 @@ void GpuPassStrategy::EnableMkldnnQuantizer() { LOG(ERROR) << "GPU not support MKL-DNN quantization"; } +void GpuPassStrategy::EnableMkldnnBfloat16() { + LOG(ERROR) << "GPU not support MKL-DNN bfloat16"; +} + CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) { // NOTE the large fusions should be located in the front, so that they will // not be damaged by smaller ones. @@ -152,7 +156,8 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) { // "seqpool_concat_fuse_pass", // "seqpool_cvm_concat_fuse_pass", // // "embedding_fc_lstm_fuse_pass", // - "fc_lstm_fuse_pass", // + // TODO(wilber): fix correctness problem. + // "fc_lstm_fuse_pass", // "mul_lstm_fuse_pass", // "fc_gru_fuse_pass", // "mul_gru_fuse_pass", // @@ -225,4 +230,16 @@ void CpuPassStrategy::EnableMkldnnQuantizer() { #endif } +void CpuPassStrategy::EnableMkldnnBfloat16() { +#ifdef PADDLE_WITH_MKLDNN + if (!use_mkldnn_bfloat16_) { + passes_.push_back("cpu_bfloat16_placement_pass"); + passes_.push_back("cpu_bfloat16_pass"); + } + use_mkldnn_bfloat16_ = true; +#else + use_mkldnn_bfloat16_ = false; +#endif +} + } // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index c5a4a5f754d031a8e8f88a96dd16c89fbe1b0fbb..9073253520466a3711089bc7b7da04a9191e0a42 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -132,6 +132,9 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder { /// \brief Enable MKLDNN quantize optimization. 
virtual void EnableMkldnnQuantizer() {} + /// \brief Enable MKLDNN bfloat16. + virtual void EnableMkldnnBfloat16() {} + /// \brief Check if we are using gpu. /// \return A bool variable implying whether we are in gpu mode. bool use_gpu() const { return use_gpu_; } @@ -161,6 +164,7 @@ class PD_INFER_DECL CpuPassStrategy : public PassStrategy { use_gpu_ = other.use_gpu_; use_mkldnn_ = other.use_mkldnn_; use_mkldnn_quantizer_ = other.use_mkldnn_quantizer_; + use_mkldnn_bfloat16_ = other.use_mkldnn_bfloat16_; } /// \brief Default destructor. virtual ~CpuPassStrategy() = default; @@ -174,9 +178,13 @@ class PD_INFER_DECL CpuPassStrategy : public PassStrategy { /// \brief Enable MKLDNN quantize optimization. void EnableMkldnnQuantizer() override; + /// \brief Enable MKLDNN bfloat16. + void EnableMkldnnBfloat16() override; + protected: /// \cond Protected bool use_mkldnn_quantizer_{false}; + bool use_mkldnn_bfloat16_{false}; /// \endcond }; @@ -205,6 +213,9 @@ class PD_INFER_DECL GpuPassStrategy : public PassStrategy { /// \brief Not supported in GPU mode yet. void EnableMkldnnQuantizer() override; + /// \brief Not supported in GPU mode yet. + void EnableMkldnnBfloat16() override; + /// \brief Default destructor. 
virtual ~GpuPassStrategy() = default; diff --git a/paddle/fluid/inference/capi/c_api.cc b/paddle/fluid/inference/capi/c_api.cc index 821dff2f036c1892570a8ade5b40363251c7f531..07493c742c4fa906e7c4817e328e7d4f81afbffa 100644 --- a/paddle/fluid/inference/capi/c_api.cc +++ b/paddle/fluid/inference/capi/c_api.cc @@ -16,6 +16,7 @@ #include #include "paddle/fluid/inference/capi/c_api_internal.h" #include "paddle/fluid/inference/capi/paddle_c_api.h" +#include "paddle/fluid/platform/enforce.h" using paddle::ConvertToACPrecision; using paddle::ConvertToPaddleDType; @@ -34,27 +35,37 @@ void PD_DeletePaddleBuf(PD_PaddleBuf* buf) { } void PD_PaddleBufResize(PD_PaddleBuf* buf, size_t length) { - PADDLE_ENFORCE_NOT_NULL(buf); + PADDLE_ENFORCE_NOT_NULL(buf, + paddle::platform::errors::InvalidArgument( + "The pointer of Buffer shouldn't be nullptr")); buf->buf.Resize(length); } void PD_PaddleBufReset(PD_PaddleBuf* buf, void* data, size_t length) { - PADDLE_ENFORCE_NOT_NULL(buf); + PADDLE_ENFORCE_NOT_NULL(buf, + paddle::platform::errors::InvalidArgument( + "The pointer of Buffer shouldn't be nullptr")); buf->buf.Reset(data, length); } bool PD_PaddleBufEmpty(PD_PaddleBuf* buf) { - PADDLE_ENFORCE_NOT_NULL(buf); + PADDLE_ENFORCE_NOT_NULL(buf, + paddle::platform::errors::InvalidArgument( + "The pointer of Buffer shouldn't be nullptr")); return buf->buf.empty(); } void* PD_PaddleBufData(PD_PaddleBuf* buf) { - PADDLE_ENFORCE_NOT_NULL(buf); + PADDLE_ENFORCE_NOT_NULL(buf, + paddle::platform::errors::InvalidArgument( + "The pointer of Buffer shouldn't be nullptr")); return buf->buf.data(); } size_t PD_PaddleBufLength(PD_PaddleBuf* buf) { - PADDLE_ENFORCE_NOT_NULL(buf); + PADDLE_ENFORCE_NOT_NULL(buf, + paddle::platform::errors::InvalidArgument( + "The pointer of Buffer shouldn't be nullptr")); return buf->buf.length(); } diff --git a/paddle/fluid/inference/capi/c_api_internal.h b/paddle/fluid/inference/capi/c_api_internal.h index 
2dd827229779d34384df2b3ba5f398c77db8369a..7e69b7210768e5af9e8f4150883a608a1517a13c 100644 --- a/paddle/fluid/inference/capi/c_api_internal.h +++ b/paddle/fluid/inference/capi/c_api_internal.h @@ -18,7 +18,6 @@ #include "paddle/fluid/inference/api/paddle_analysis_config.h" #include "paddle/fluid/inference/api/paddle_api.h" #include "paddle/fluid/inference/capi/paddle_c_api.h" -#include "paddle/fluid/platform/enforce.h" using PD_PaddleDType = paddle::PaddleDType; using PD_ACPrecision = paddle::AnalysisConfig::Precision; diff --git a/paddle/fluid/inference/capi/paddle_c_api.h b/paddle/fluid/inference/capi/paddle_c_api.h index 4be6b48fb1820dc3271de164e87387c73ee67da9..32129890d02a2a0e0b357a6e0402d07b56bc6509 100644 --- a/paddle/fluid/inference/capi/paddle_c_api.h +++ b/paddle/fluid/inference/capi/paddle_c_api.h @@ -235,6 +235,12 @@ PADDLE_CAPI_EXPORT extern void PD_EnableMkldnnQuantizer( PADDLE_CAPI_EXPORT extern bool PD_MkldnnQuantizerEnabled( const PD_AnalysisConfig* config); +PADDLE_CAPI_EXPORT extern void PD_EnableMkldnnBfloat16( + PD_AnalysisConfig* config); + +PADDLE_CAPI_EXPORT extern bool PD_MkldnnBfloat16Enabled( + const PD_AnalysisConfig* config); + PADDLE_CAPI_EXPORT extern void PD_SetModelBuffer(PD_AnalysisConfig* config, const char* prog_buffer, size_t prog_buffer_size, diff --git a/paddle/fluid/inference/capi/pd_config.cc b/paddle/fluid/inference/capi/pd_config.cc index f5445dd5a3f9b6499045361a36fd6363a79ef560..af8d4a69ecf24862ca5f282655b72ef37307c1c8 100644 --- a/paddle/fluid/inference/capi/pd_config.cc +++ b/paddle/fluid/inference/capi/pd_config.cc @@ -20,6 +20,7 @@ #include #include "paddle/fluid/inference/capi/c_api_internal.h" #include "paddle/fluid/inference/capi/paddle_c_api.h" +#include "paddle/fluid/platform/enforce.h" using paddle::ConvertToACPrecision; using paddle::ConvertToPaddleDType; @@ -40,7 +41,10 @@ void PD_DeleteAnalysisConfig(PD_AnalysisConfig* config) { void PD_SetModel(PD_AnalysisConfig* config, const char* model_dir, const char* 
params_path) { LOG(INFO) << model_dir; - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); LOG(INFO) << std::string(model_dir); if (!params_path) { config->config.SetModel(std::string(model_dir)); @@ -50,104 +54,164 @@ void PD_SetModel(PD_AnalysisConfig* config, const char* model_dir, } void PD_SetProgFile(PD_AnalysisConfig* config, const char* x) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.SetProgFile(std::string(x)); } void PD_SetParamsFile(PD_AnalysisConfig* config, const char* x) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.SetParamsFile(std::string(x)); } void PD_SetOptimCacheDir(PD_AnalysisConfig* config, const char* opt_cache_dir) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.SetOptimCacheDir(std::string(opt_cache_dir)); } const char* PD_ModelDir(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.model_dir().c_str(); } const char* PD_ProgFile(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.prog_file().c_str(); } const char* PD_ParamsFile(const PD_AnalysisConfig* config) { - 
PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.params_file().c_str(); } void PD_EnableUseGpu(PD_AnalysisConfig* config, int memory_pool_init_size_mb, int device_id) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.EnableUseGpu(static_cast(memory_pool_init_size_mb), device_id); } void PD_DisableGpu(PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.DisableGpu(); } bool PD_UseGpu(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.use_gpu(); } int PD_GpuDeviceId(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.gpu_device_id(); } int PD_MemoryPoolInitSizeMb(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.memory_pool_init_size_mb(); } float PD_FractionOfGpuMemoryForPool(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return 
config->config.fraction_of_gpu_memory_for_pool(); } void PD_EnableCUDNN(PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.EnableCUDNN(); } bool PD_CudnnEnabled(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.cudnn_enabled(); } void PD_SwitchIrOptim(PD_AnalysisConfig* config, bool x) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.SwitchIrOptim(x); } bool PD_IrOptim(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.ir_optim(); } void PD_SwitchUseFeedFetchOps(PD_AnalysisConfig* config, bool x) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.SwitchUseFeedFetchOps(x); } bool PD_UseFeedFetchOpsEnabled(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.use_feed_fetch_ops_enabled(); } void PD_SwitchSpecifyInputNames(PD_AnalysisConfig* config, bool x) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); 
config->config.SwitchSpecifyInputNames(x); } bool PD_SpecifyInputName(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.specify_input_name(); } @@ -155,98 +219,168 @@ void PD_EnableTensorRtEngine(PD_AnalysisConfig* config, int workspace_size, int max_batch_size, int min_subgraph_size, Precision precision, bool use_static, bool use_calib_mode) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.EnableTensorRtEngine( workspace_size, max_batch_size, min_subgraph_size, paddle::ConvertToACPrecision(precision), use_static, use_calib_mode); } bool PD_TensorrtEngineEnabled(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.tensorrt_engine_enabled(); } void PD_SwitchIrDebug(PD_AnalysisConfig* config, bool x) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.SwitchIrDebug(x); } void PD_EnableMKLDNN(PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.EnableMKLDNN(); } void PD_SetMkldnnCacheCapacity(PD_AnalysisConfig* config, int capacity) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); 
config->config.SetMkldnnCacheCapacity(capacity); } bool PD_MkldnnEnabled(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.mkldnn_enabled(); } void PD_SetCpuMathLibraryNumThreads(PD_AnalysisConfig* config, int cpu_math_library_num_threads) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.SetCpuMathLibraryNumThreads(cpu_math_library_num_threads); } int PD_CpuMathLibraryNumThreads(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.cpu_math_library_num_threads(); } void PD_EnableMkldnnQuantizer(PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.EnableMkldnnQuantizer(); } bool PD_MkldnnQuantizerEnabled(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.mkldnn_quantizer_enabled(); } +void PD_EnableMkldnnBfloat16(PD_AnalysisConfig* config) { + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); + config->config.EnableMkldnnBfloat16(); +} + +bool PD_MkldnnBfloat16Enabled(const PD_AnalysisConfig* config) { + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer 
of analysis configuration shouldn't be nullptr")); + return config->config.mkldnn_bfloat16_enabled(); +} + void PD_SetModelBuffer(PD_AnalysisConfig* config, const char* prog_buffer, size_t prog_buffer_size, const char* params_buffer, size_t params_buffer_size) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.SetModelBuffer(prog_buffer, prog_buffer_size, params_buffer, params_buffer_size); } bool PD_ModelFromMemory(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.model_from_memory(); } void PD_EnableMemoryOptim(PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.EnableMemoryOptim(); } bool PD_MemoryOptimEnabled(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.enable_memory_optim(); } void PD_EnableProfile(PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.EnableProfile(); } bool PD_ProfileEnabled(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.profile_enabled(); } void PD_SetInValid(PD_AnalysisConfig* config) { 
- PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); config->config.SetInValid(); } bool PD_IsValid(const PD_AnalysisConfig* config) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); return config->config.is_valid(); } diff --git a/paddle/fluid/inference/capi/pd_predictor.cc b/paddle/fluid/inference/capi/pd_predictor.cc index 8aa1e2a7b7f9b99a1636ca2e7396089ab2ae7e15..c1bf4c974fac8c80c3e8e31fbd247332a325e2aa 100644 --- a/paddle/fluid/inference/capi/pd_predictor.cc +++ b/paddle/fluid/inference/capi/pd_predictor.cc @@ -22,6 +22,7 @@ #include "paddle/fluid/inference/api/paddle_api.h" #include "paddle/fluid/inference/capi/c_api_internal.h" #include "paddle/fluid/inference/capi/paddle_c_api.h" +#include "paddle/fluid/platform/enforce.h" using paddle::ConvertToACPrecision; using paddle::ConvertToPaddleDType; @@ -81,7 +82,10 @@ extern "C" { bool PD_PredictorRun(const PD_AnalysisConfig* config, PD_Tensor* inputs, int in_size, PD_Tensor** output_data, int* out_size, int batch_size) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); VLOG(3) << "Predoctor: PD_PredictorRun. 
"; static std::map> predictors; @@ -111,7 +115,10 @@ bool PD_PredictorRun(const PD_AnalysisConfig* config, PD_Tensor* inputs, bool PD_PredictorZeroCopyRun(const PD_AnalysisConfig* config, PD_ZeroCopyData* inputs, int in_size, PD_ZeroCopyData** output, int* out_size) { - PADDLE_ENFORCE_NOT_NULL(config); + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); static std::map> predictors; if (!predictors.count(config->config.model_dir())) { @@ -123,7 +130,10 @@ bool PD_PredictorZeroCopyRun(const PD_AnalysisConfig* config, VLOG(3) << "The inputs' size is " << input_names.size(); PADDLE_ENFORCE_EQ( input_names.size(), in_size, - "The number of input and the number of model's input must match. "); + paddle::platform::errors::InvalidArgument( + "The number of input and the number of model's input must match. The " + "number of input is %d, the number of model's input is %d.", + input_names.size(), in_size)); for (int i = 0; i < in_size; ++i) { auto input_t = predictor->GetInputTensor(inputs[i].name); std::vector tensor_shape; @@ -144,7 +154,8 @@ bool PD_PredictorZeroCopyRun(const PD_AnalysisConfig* config, input_t->copy_from_cpu(static_cast(inputs[i].data)); break; default: - CHECK(false) << "Unsupport data type."; + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "Unsupported data type.")); break; } } @@ -227,7 +238,8 @@ void PD_SetZeroCopyInput(PD_Predictor* predictor, input->copy_from_cpu(static_cast(tensor->data.data)); break; default: - CHECK(false) << "Unsupport data type."; + PADDLE_THROW( + paddle::platform::errors::InvalidArgument("Unsupported data type.")); break; } @@ -294,7 +306,8 @@ void PD_GetZeroCopyOutput(PD_Predictor* predictor, PD_ZeroCopyTensor* tensor) { output->copy_to_cpu(reinterpret_cast(tensor->data.data)); break; default: - CHECK(false) << "Unsupport data type."; + PADDLE_THROW( + paddle::platform::errors::InvalidArgument("Unsupported data 
type.")); break; } } diff --git a/paddle/fluid/inference/capi/pd_tensor.cc b/paddle/fluid/inference/capi/pd_tensor.cc index b4811f1d6ff192659fa12b33008fe5ac07e6a6c5..9b1eedd7c5a8106a6f6b7be3f682913e2431a3e5 100644 --- a/paddle/fluid/inference/capi/pd_tensor.cc +++ b/paddle/fluid/inference/capi/pd_tensor.cc @@ -19,6 +19,7 @@ #include #include "paddle/fluid/inference/capi/c_api_internal.h" #include "paddle/fluid/inference/capi/paddle_c_api.h" +#include "paddle/fluid/platform/enforce.h" using paddle::ConvertToACPrecision; using paddle::ConvertToPaddleDType; @@ -37,44 +38,60 @@ void PD_DeletePaddleTensor(PD_Tensor* tensor) { } void PD_SetPaddleTensorName(PD_Tensor* tensor, char* name) { - PADDLE_ENFORCE_NOT_NULL(tensor); + PADDLE_ENFORCE_NOT_NULL(tensor, + paddle::platform::errors::InvalidArgument( + "The pointer of tensor shouldn't be nullptr")); tensor->tensor.name = std::string(name); } void PD_SetPaddleTensorDType(PD_Tensor* tensor, PD_DataType dtype) { - PADDLE_ENFORCE_NOT_NULL(tensor); + PADDLE_ENFORCE_NOT_NULL(tensor, + paddle::platform::errors::InvalidArgument( + "The pointer of tensor shouldn't be nullptr")); tensor->tensor.dtype = paddle::ConvertToPaddleDType(dtype); } void PD_SetPaddleTensorData(PD_Tensor* tensor, PD_PaddleBuf* buf) { - PADDLE_ENFORCE_NOT_NULL(tensor); + PADDLE_ENFORCE_NOT_NULL(tensor, + paddle::platform::errors::InvalidArgument( + "The pointer of tensor shouldn't be nullptr")); tensor->tensor.data = buf->buf; } void PD_SetPaddleTensorShape(PD_Tensor* tensor, int* shape, int size) { - PADDLE_ENFORCE_NOT_NULL(tensor); + PADDLE_ENFORCE_NOT_NULL(tensor, + paddle::platform::errors::InvalidArgument( + "The pointer of tensor shouldn't be nullptr")); tensor->tensor.shape.assign(shape, shape + size); } const char* PD_GetPaddleTensorName(const PD_Tensor* tensor) { - PADDLE_ENFORCE_NOT_NULL(tensor); + PADDLE_ENFORCE_NOT_NULL(tensor, + paddle::platform::errors::InvalidArgument( + "The pointer of tensor shouldn't be nullptr")); return 
tensor->tensor.name.c_str(); } PD_DataType PD_GetPaddleTensorDType(const PD_Tensor* tensor) { - PADDLE_ENFORCE_NOT_NULL(tensor); + PADDLE_ENFORCE_NOT_NULL(tensor, + paddle::platform::errors::InvalidArgument( + "The pointer of tensor shouldn't be nullptr")); return ConvertToPDDataType(tensor->tensor.dtype); } PD_PaddleBuf* PD_GetPaddleTensorData(const PD_Tensor* tensor) { - PADDLE_ENFORCE_NOT_NULL(tensor); + PADDLE_ENFORCE_NOT_NULL(tensor, + paddle::platform::errors::InvalidArgument( + "The pointer of tensor shouldn't be nullptr")); PD_PaddleBuf* ret = PD_NewPaddleBuf(); ret->buf = tensor->tensor.data; return ret; } const int* PD_GetPaddleTensorShape(const PD_Tensor* tensor, int* size) { - PADDLE_ENFORCE_NOT_NULL(tensor); + PADDLE_ENFORCE_NOT_NULL(tensor, + paddle::platform::errors::InvalidArgument( + "The pointer of tensor shouldn't be nullptr")); const std::vector& shape = tensor->tensor.shape; *size = shape.size(); return shape.data(); diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc index c497ab384b5fac74b5241d61517485fd8f2b40c4..84e011c6505a8fe974effbecf54101e0e51d29fa 100644 --- a/paddle/fluid/inference/io.cc +++ b/paddle/fluid/inference/io.cc @@ -47,7 +47,9 @@ void Init(const std::vector argv) { void ReadBinaryFile(const std::string& filename, std::string* contents) { std::ifstream fin(filename, std::ios::in | std::ios::binary); - PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s", filename); + PADDLE_ENFORCE_EQ( + fin.is_open(), true, + platform::errors::Unavailable("Failed to open file %s.", filename)); fin.seekg(0, std::ios::end); contents->clear(); contents->resize(fin.tellg()); @@ -133,9 +135,10 @@ std::unique_ptr Load(framework::Executor* executor, std::unique_ptr main_program( new framework::ProgramDesc(program_desc_str)); - PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version()), - "model version %ld is not supported.", - main_program->Version()); + PADDLE_ENFORCE_EQ( + 
framework::IsProgramVersionSupported(main_program->Version()), true, + platform::errors::Unavailable("Model version %ld is not supported.", + main_program->Version())); // model_from_memory is false in separate parameters. LoadPersistables(executor, scope, *main_program, dirname, "", @@ -151,9 +154,10 @@ std::unique_ptr Load( std::unique_ptr main_program( new framework::ProgramDesc(program_desc_str)); - PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version()), - "model version %ld is not supported.", - main_program->Version()); + PADDLE_ENFORCE_EQ( + framework::IsProgramVersionSupported(main_program->Version()), true, + platform::errors::Unavailable("Model version %ld is not supported.", + main_program->Version())); LoadPersistables(executor, scope, *main_program, "", param_filename, false /* model_from_memory */); @@ -165,9 +169,10 @@ std::unique_ptr LoadFromMemory( const std::string& prog_buffer, const std::string& param_buffer) { std::unique_ptr main_program( new framework::ProgramDesc(prog_buffer)); - PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version()), - "model version %ld is not supported.", - main_program->Version()); + PADDLE_ENFORCE_EQ( + framework::IsProgramVersionSupported(main_program->Version()), true, + platform::errors::Unavailable("Model version %ld is not supported.", + main_program->Version())); LoadPersistables(executor, scope, *main_program, "", param_buffer, true /* model_filename */); diff --git a/paddle/fluid/inference/lite/engine.cc b/paddle/fluid/inference/lite/engine.cc index 8e88c94493952ff257ef69bf73f8edebb6ba2eee..5f24ef00bce59e5886d8448cf3f8356e9aeba481 100644 --- a/paddle/fluid/inference/lite/engine.cc +++ b/paddle/fluid/inference/lite/engine.cc @@ -20,8 +20,12 @@ #define LITE_WITH_XPU 1 #endif +#ifndef PADDLE_WITH_ARM +#define LITE_WITH_X86 1 +#endif + #include "paddle/fluid/inference/lite/engine.h" -#include "lite/api/paddle_use_passes.h" +#include namespace paddle { namespace inference { 
@@ -36,32 +40,40 @@ bool EngineManager::Has(const std::string& name) const { return engines_.at(name).get() != nullptr; } -paddle::lite::Predictor* EngineManager::Get(const std::string& name) const { +paddle::lite_api::PaddlePredictor* EngineManager::Get( + const std::string& name) const { return engines_.at(name).get(); } -paddle::lite::Predictor* EngineManager::Create(const std::string& name, - const EngineConfig& cfg) { - if (cfg.valid_places.front().target == TARGET(kCUDA)) { -#ifdef PADDLE_WITH_CUDA - paddle::lite::Env::Init(); +paddle::lite_api::PaddlePredictor* EngineManager::Create( + const std::string& name, const EngineConfig& cfg) { + // config info for predictor. + paddle::lite_api::CxxConfig lite_cxx_config; + lite_cxx_config.set_model_buffer(cfg.model.c_str(), cfg.model.size(), + cfg.param.c_str(), cfg.param.size()); + lite_cxx_config.set_valid_places(cfg.valid_places); +#ifdef PADDLE_WITH_ARM + set_threads.set_threads(cfg.cpu_math_library_num_threads); +#else + lite_cxx_config.set_x86_math_library_num_threads( + cfg.cpu_math_library_num_threads); #endif - } else if (cfg.valid_places.front().target == TARGET(kXPU)) { + #ifdef PADDLE_WITH_XPU - paddle::lite::TargetWrapper::workspace_l3_size_per_thread = - cfg.xpu_l3_workspace_size; + lite_cxx_config.set_xpu_workspace_l3_size_per_thread( + cfg.xpu_l3_workspace_size); #endif - } - auto* p = new paddle::lite::Predictor(); - p->Build("", cfg.model, cfg.param, cfg.valid_places, cfg.neglected_passes, - cfg.model_type, cfg.model_from_memory); - engines_[name].reset(p); - return p; + + // create predictor + std::shared_ptr p = + paddle::lite_api::CreatePaddlePredictor(lite_cxx_config); + engines_[name] = std::move(p); + return engines_[name].get(); } void EngineManager::DeleteAll() { for (auto& item : engines_) { - item.second.reset(nullptr); + item.second.reset(); } } diff --git a/paddle/fluid/inference/lite/engine.h b/paddle/fluid/inference/lite/engine.h index 
345eb682e9fe81d4ec67a31082c1d347a694fd96..5ba487cc24d7d58cd87853a58fc12f1a82c3610d 100644 --- a/paddle/fluid/inference/lite/engine.h +++ b/paddle/fluid/inference/lite/engine.h @@ -23,12 +23,9 @@ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wall" #include "lite/api/cxx_api.h" +#include "lite/api/paddle_api.h" #include "lite/api/paddle_place.h" -#include "lite/core/context.h" -#include "lite/core/device_info.h" -#include "lite/core/memory.h" -#include "lite/core/op_registry.h" -#include "lite/core/tensor.h" +#include "lite/api/paddle_use_passes.h" #pragma GCC diagnostic pop namespace paddle { @@ -38,25 +35,33 @@ namespace lite { struct EngineConfig { std::string model; std::string param; - paddle::lite::Place prefer_place; - std::vector valid_places; + std::vector valid_places; std::vector neglected_passes; lite_api::LiteModelType model_type{lite_api::LiteModelType::kProtobuf}; bool model_from_memory{true}; + + // for xpu size_t xpu_l3_workspace_size; + + // for x86 or arm + int cpu_math_library_num_threads{1}; + + // for cuda + bool use_multi_stream{false}; }; class EngineManager { public: bool Empty() const; bool Has(const std::string& name) const; - paddle::lite::Predictor* Get(const std::string& name) const; - paddle::lite::Predictor* Create(const std::string& name, - const EngineConfig& cfg); + paddle::lite_api::PaddlePredictor* Get(const std::string& name) const; + paddle::lite_api::PaddlePredictor* Create(const std::string& name, + const EngineConfig& cfg); void DeleteAll(); private: - std::unordered_map> + std::unordered_map> engines_; }; diff --git a/paddle/fluid/inference/lite/tensor_utils.cc b/paddle/fluid/inference/lite/tensor_utils.cc index d79a041ccf8a1611247b65b034c03940eabfcccd..33661594b926f284052c85c6a816a17dfff1ce20 100644 --- a/paddle/fluid/inference/lite/tensor_utils.cc +++ b/paddle/fluid/inference/lite/tensor_utils.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/inference/lite/tensor_utils.h" +#include #include #include #include "paddle/fluid/framework/data_type.h" @@ -144,16 +145,55 @@ void MemoryCopyAsync(const platform::Place& dst_place, void* dst_data, } } -void InitDstTensor(paddle::lite::Tensor* dst, const framework::LoDTensor& src) { +void* GetLiteTensorDataPtr(paddle::lite_api::Tensor* src, + PrecisionType precision_type, + TargetType target_type) { + void* res{nullptr}; + switch (precision_type) { + case PrecisionType::kFloat: + res = static_cast(src->mutable_data(target_type)); + break; + case PrecisionType::kInt8: + res = static_cast(src->mutable_data(target_type)); + break; + case PrecisionType::kInt32: + res = static_cast(src->mutable_data(target_type)); + break; + case PrecisionType::kInt64: + res = static_cast(src->mutable_data(target_type)); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported precision type. Now only supports FP32, INT8, INT32 and " + "INT64.")); + break; + } + return res; +} + +int64_t GetLiteTensorNumel(const paddle::lite_api::Tensor& tensor) { + auto shape = tensor.shape(); + int64_t numel = std::accumulate(shape.begin(), shape.end(), 1, + std::multiplies()); + return numel; +} + +void InitDstTensor(paddle::lite_api::Tensor* dst, + const framework::LoDTensor& src) { // Currently, Lite needs to explicitly specify the target type of // the input tensor. 
constexpr int empty_size = 0; - dst->mutable_data(GetLiteTargetType(src.place()), empty_size); - dst->set_precision(GetLitePrecisionType(src.type())); - SetLoD(dst->mutable_lod(), src.lod()); + dst->Resize({empty_size}); + GetLiteTensorDataPtr(dst, GetLitePrecisionType(src.type()), + GetLiteTargetType(src.place())); + dst->SetPrecision(GetLitePrecisionType(src.type())); + paddle::lite::LoD lite_lod; + SetLoD(&lite_lod, src.lod()); + dst->SetLoD(lite_lod); } -void InitDstTensor(framework::LoDTensor* dst, const paddle::lite::Tensor& src) { +void InitDstTensor(framework::LoDTensor* dst, + const paddle::lite_api::Tensor& src) { constexpr framework::proto::VarType::Type dtype = framework::proto::VarType_Type_FP32; dst->mutable_data(inference::lite::utils::GetNativePlace(src.target()), @@ -162,7 +202,8 @@ void InitDstTensor(framework::LoDTensor* dst, const paddle::lite::Tensor& src) { } template <> -void TensorCopyAsync(paddle::lite::Tensor* dst, const framework::LoDTensor& src, +void TensorCopyAsync(paddle::lite_api::Tensor* dst, + const framework::LoDTensor& src, const platform::DeviceContext& ctx) { InitDstTensor(dst, src); const platform::Place& src_place = src.place(); @@ -171,52 +212,56 @@ void TensorCopyAsync(paddle::lite::Tensor* dst, const framework::LoDTensor& src, static_cast(src.numel()) * framework::SizeOfType(src.type()); dst->Resize(framework::vectorize(src.dims())); const void* src_data = src.data(); - void* dst_data = dst->mutable_data(bytes); + void* dst_data{nullptr}; + dst_data = GetLiteTensorDataPtr(dst, GetLitePrecisionType(src.type()), + GetLiteTargetType(src.place())); VLOG(3) << "[CopyAsync fluid -> lite] Bytes = " << bytes << ", src = " << &src << ", dst = " << dst << ", src_type = " << src.type(); MemoryCopyAsync(dst_place, dst_data, src_place, src_data, bytes, ctx); - VLOG(3) << "[Lite memory size] Bytes = " << dst->memory_size(); + VLOG(3) << "[Lite memory size] Bytes = " << bytes; } template <> -void TensorCopyAsync(framework::LoDTensor* 
dst, const paddle::lite::Tensor& src, +void TensorCopyAsync(framework::LoDTensor* dst, + const paddle::lite_api::Tensor& src, const platform::DeviceContext& ctx) { - dst->Resize(paddle::framework::make_ddim(src.dims().Vectorize())); + dst->Resize(paddle::framework::make_ddim(src.shape())); InitDstTensor(dst, src); const platform::Place& src_place = GetNativePlace(src.target()); const platform::Place& dst_place = dst->place(); - const size_t bytes = - static_cast(src.numel()) * framework::SizeOfType(dst->type()); - const void* src_data = src.raw_data(); + int64_t src_numel = GetLiteTensorNumel(src); + const size_t bytes = src_numel * framework::SizeOfType(dst->type()); + const void* src_data = src.data(); // When Lite is ready, the source type needs to be modified here. void* dst_data = dst->mutable_data(dst_place, dst->type()); VLOG(3) << "[CopyAsync lite -> fluid] Bytes = " << bytes << ", src = " << &src << ", dst = " << dst << ", src_type = " << dst->type(); MemoryCopyAsync(dst_place, dst_data, src_place, src_data, bytes, ctx); - VLOG(3) << "[Lite memory size] Bytes = " << src.memory_size(); + VLOG(3) << "[Lite memory size] Bytes = " << bytes; } template <> -void TensorDataShare(paddle::lite::Tensor* dst, framework::LoDTensor* src) { - const size_t bytes = - static_cast(src->numel()) * framework::SizeOfType(src->type()); - auto buf = std::make_shared(paddle::lite::Buffer( - src->data(), GetLiteTargetType(src->place()), src->memory_size())); +void TensorDataShare(paddle::lite_api::Tensor* dst, framework::LoDTensor* src) { dst->Resize(framework::vectorize(src->dims())); - dst->set_precision(GetLitePrecisionType(src->type())); - SetLoD(dst->mutable_lod(), src->lod()); - dst->ResetBuffer(buf, bytes); + dst->ShareExternalMemory(src->data(), src->memory_size(), + GetLiteTargetType(src->place())); + dst->SetPrecision(GetLitePrecisionType(src->type())); + paddle::lite::LoD lite_lod; + SetLoD(&lite_lod, src->lod()); + dst->SetLoD(lite_lod); } template <> -void 
TensorDataShare(framework::LoDTensor* dst, paddle::lite::Tensor* src) { +void TensorDataShare(framework::LoDTensor* dst, paddle::lite_api::Tensor* src) { constexpr framework::proto::VarType::Type dtype = framework::proto::VarType_Type_FP32; - void* src_raw_data = src->raw_data(); + void* src_raw_data = + GetLiteTensorDataPtr(src, GetLitePrecisionType(dtype), src->target()); + size_t memory_size = GetLiteTensorNumel(*src) * sizeof(float); std::shared_ptr holder( - new memory::allocation::Allocation(src_raw_data, src->memory_size(), + new memory::allocation::Allocation(src_raw_data, memory_size, GetNativePlace(src->target()))); - dst->Resize(paddle::framework::make_ddim(src->dims().Vectorize())); + dst->Resize(paddle::framework::make_ddim(src->shape())); SetLoD(dst->mutable_lod(), src->lod()); dst->ResetHolderWithType(holder, dtype); } diff --git a/paddle/fluid/inference/lite/test_engine.cc b/paddle/fluid/inference/lite/test_engine.cc index 325c7ab2539f28f5145ee88a1bbf374f333348e1..e505af19d5389c074c5777d0235dfa055d1395a7 100644 --- a/paddle/fluid/inference/lite/test_engine.cc +++ b/paddle/fluid/inference/lite/test_engine.cc @@ -14,15 +14,16 @@ #include -#include "paddle/fluid/inference/lite/engine.h" #include "paddle/fluid/inference/utils/singleton.h" -#include "paddle/fluid/operators/lite/ut_helper.h" #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/lite/engine.h" +#include "paddle/fluid/operators/lite/ut_helper.h" + namespace paddle { namespace inference { namespace lite { @@ -101,10 +102,10 @@ TEST(EngineManager, engine) { config.model_from_memory = true; config.valid_places = { #ifdef PADDLE_WITH_CUDA - paddle::lite::Place({TARGET(kCUDA), PRECISION(kFloat)}), + paddle::lite_api::Place({TARGET(kCUDA), PRECISION(kFloat)}), #endif - paddle::lite::Place({TARGET(kX86), PRECISION(kFloat)}), - 
paddle::lite::Place({TARGET(kHost), PRECISION(kAny)}), + paddle::lite_api::Place({TARGET(kX86), PRECISION(kFloat)}), + paddle::lite_api::Place({TARGET(kHost), PRECISION(kAny)}), }; LOG(INFO) << "Create EngineManager"; @@ -117,7 +118,7 @@ TEST(EngineManager, engine) { ASSERT_EQ(inference::Singleton::Global().Has( unique_key), true); - paddle::lite::Predictor* engine_0 = + paddle::lite_api::PaddlePredictor* engine_0 = inference::Singleton::Global().Get( unique_key); CHECK_NOTNULL(engine_0); diff --git a/paddle/fluid/inference/lite/test_tensor_utils.cc b/paddle/fluid/inference/lite/test_tensor_utils.cc index eef7bfb68fe06537d09f3f3e7e5c35283d4739ef..a792fb77d6ad483601402506685e2f91066571da 100644 --- a/paddle/fluid/inference/lite/test_tensor_utils.cc +++ b/paddle/fluid/inference/lite/test_tensor_utils.cc @@ -73,6 +73,33 @@ TEST(LiteEngineOp, GetNativeLayoutType) { EXPECT_ANY_THROW(GetNativeLayoutType(DataLayoutType::kNHWC)); } +template +void test_lite_tensor_data_ptr(PrecisionType precision_type) { + void* GetLiteTensorDataPtr(paddle::lite_api::Tensor * src, + PrecisionType precision_type, + TargetType target_type); + const int count = 4; + paddle::lite::Tensor lite_tensor; + lite_tensor.Resize({count}); + auto* lite_tensor_data = lite_tensor.mutable_data(); + for (size_t i = 0; i < count; ++i) { + lite_tensor_data[i] = i; + } + paddle::lite_api::Tensor lite_api_tensor(&lite_tensor); + T* data = static_cast(GetLiteTensorDataPtr( + &lite_api_tensor, precision_type, TargetType::kHost)); + for (size_t i = 0; i < count; ++i) { + CHECK_EQ(data[i], static_cast(i)) << "the i-th num is not correct."; + } +} + +TEST(LiteEngineOp, GetLiteTensorDataPtr) { + test_lite_tensor_data_ptr(PrecisionType::kInt64); + test_lite_tensor_data_ptr(PrecisionType::kInt32); + test_lite_tensor_data_ptr(PrecisionType::kInt8); + EXPECT_ANY_THROW(test_lite_tensor_data_ptr(PrecisionType::kUnk)); +} + void test_tensor_copy(const platform::DeviceContext& ctx) { // Create LoDTensor. 
std::vector vector({1, 2, 3, 4}); @@ -83,10 +110,11 @@ void test_tensor_copy(const platform::DeviceContext& ctx) { lod_tensor.set_lod(lod); // Create lite::Tensor and copy. paddle::lite::Tensor lite_tensor; - TensorCopyAsync(&lite_tensor, lod_tensor, ctx); + paddle::lite_api::Tensor lite_api_tensor(&lite_tensor); + TensorCopyAsync(&lite_api_tensor, lod_tensor, ctx); // Copy to LoDTensor. framework::LoDTensor lod_tensor_n; - TensorCopyAsync(&lod_tensor_n, lite_tensor, ctx); + TensorCopyAsync(&lod_tensor_n, lite_api_tensor, ctx); #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(ctx.GetPlace())) { platform::GpuStreamSync( @@ -108,10 +136,11 @@ void test_tensor_share(const platform::DeviceContext& ctx) { lod_tensor.set_lod(lod); // Create lite::Tensor and share. paddle::lite::Tensor lite_tensor; - TensorDataShare(&lite_tensor, &lod_tensor); + paddle::lite_api::Tensor lite_api_tensor(&lite_tensor); + TensorDataShare(&lite_api_tensor, &lod_tensor); // Copy to LoDTensor. framework::LoDTensor lod_tensor_n; - TensorCopyAsync(&lod_tensor_n, lite_tensor, ctx); + TensorCopyAsync(&lod_tensor_n, lite_api_tensor, ctx); std::vector result; TensorToVector(lod_tensor_n, ctx, &result); ASSERT_EQ(result, vector); diff --git a/paddle/fluid/inference/tensorrt/convert/concat_op.cc b/paddle/fluid/inference/tensorrt/convert/concat_op.cc index 28afb87a891fb301b1b5108c9762bf6c88cefb96..5d63aa2ace86cb89917126f3a6fef9d0e9839e8c 100644 --- a/paddle/fluid/inference/tensorrt/convert/concat_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/concat_op.cc @@ -34,8 +34,11 @@ class ConcatOpConverter : public OpConverter { itensors.push_back(engine_->GetITensor(input_name)); } int axis = BOOST_GET_CONST(int, op_desc.GetAttr("axis")); - PADDLE_ENFORCE(axis > 0, - "The axis attr of Concat op should be large than 0 for trt"); + PADDLE_ENFORCE_GT(axis, 0, platform::errors::InvalidArgument( + "The axis attr of Concat" + " op should be larger than 0 for trt. 
" + "But received %d.", + axis)); auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Concatenation, itensors.data(), itensors.size()); diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc index 10c212c0b4fa394e3c745bf524ef9d081c4bc3c1..aa03bc44bd629513d96cda541c0b7162629bfdc8 100644 --- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc @@ -100,7 +100,9 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op, TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr, 0}; auto* layer = fadd_layer(const_cast(X), n_output, n_input, nv_ksize, weight, bias); - PADDLE_ENFORCE(layer != nullptr); + PADDLE_ENFORCE_NOT_NULL(layer, + platform::errors::Fatal("TensorRT create conv2d" + " layer error.")); layer->setStride(nv_strides); layer->setPadding(nv_paddings); layer->setNbGroups(groups); diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc index c4f0855dbb1ca87b40c396692a812a3cbe06a7b8..dfadb28a6520f983986263b38be69fa48335d485 100644 --- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -43,13 +43,30 @@ class ElementwiseWeightOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); VLOG(3) << "Convert a fluid elementwise op to TensorRT IScaleLayer"; - PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); - PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); // Y is a weight - PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); + PADDLE_ENFORCE_EQ( + op_desc.Input("X").size(), 1, + platform::errors::InvalidArgument( + "The input op's Input(\"X\").size() " + "should equal to 1, but received Input(\"X\").size() = %u.", + op_desc.Input("X").size())); + PADDLE_ENFORCE_EQ( + op_desc.Input("Y").size(), 1, + platform::errors::InvalidArgument( + "The input op's 
Input(\"Y\").size() " + "should equal to 1, but received Input(\"Y\").size() = %u.", + op_desc.Input("Y").size())); // Y is a weight + PADDLE_ENFORCE_EQ( + op_desc.Output("Out").size(), 1, + platform::errors::InvalidArgument( + "The input op's Output(\"Out\").size() " + "should equal to 1, but reveceid Output(\"Out\").size() = %u.", + op_desc.Output("Out").size())); auto* X = engine_->GetITensor(op_desc.Input("X").front()); auto* Y_v = scope.FindVar(op_desc.Input("Y").front()); - PADDLE_ENFORCE_NOT_NULL(Y_v); + PADDLE_ENFORCE_NOT_NULL( + Y_v, platform::errors::NotFound("Variable %s not found in scope.", + op_desc.Input("Y").front().c_str())); auto* Y_t = Y_v->GetMutable(); float* weight_data = nullptr; weight_data = @@ -176,9 +193,24 @@ class ElementwiseTensorOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); nvinfer1::ILayer* layer = nullptr; - PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); - PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); // Y is a weight - PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); + PADDLE_ENFORCE_EQ( + op_desc.Input("X").size(), 1, + platform::errors::InvalidArgument( + "The input op's Input(\"X\").size() " + "should equal to 1, but received Input(\"X\").size() = %u.", + op_desc.Input("X").size())); + PADDLE_ENFORCE_EQ( + op_desc.Input("Y").size(), 1, + platform::errors::InvalidArgument( + "The input op's Input(\"Y\").size() " + "should equal to 1, but received Input(\"Y\").size() = %u.", + op_desc.Input("Y").size())); // Y is a weight + PADDLE_ENFORCE_EQ( + op_desc.Output("Out").size(), 1, + platform::errors::InvalidArgument( + "The input op's Output(\"Out\").size() " + "should equal to 1, but received Output(\"Out\").size() = %u.", + op_desc.Output("Out").size())); auto* X = engine_->GetITensor(op_desc.Input("X").front()); auto* Y = engine_->GetITensor(op_desc.Input("Y").front()); diff --git a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc 
b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc index 4ab5d5d5fccab58600a42f40366a2be1fe6ff56d..17b0edc065a84e1326c99344b7fd57703472c574 100644 --- a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc @@ -136,7 +136,7 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { plugin::DynamicPluginTensorRT* plugin = nullptr; plugin = new plugin::EmbLaynormPluginDynamicV2( input_embs, bias, scale, emb_sizes, bias_size, scale_size, hidden, - eps); + eps, use_fp16); layer = engine_->AddPluginV2(input_ids.data(), input_num, plugin); #endif } else { diff --git a/paddle/fluid/inference/tensorrt/convert/io_converter.cc b/paddle/fluid/inference/tensorrt/convert/io_converter.cc index 854f434d93e81237dc85c5df62debcf3b3824b78..d9cf9e2e860018df594ac4d84a4d9fa9b9ba669f 100644 --- a/paddle/fluid/inference/tensorrt/convert/io_converter.cc +++ b/paddle/fluid/inference/tensorrt/convert/io_converter.cc @@ -29,38 +29,67 @@ class DefaultIOConverter : public EngineIOConverter { // NOTE out is GPU memory. virtual void operator()(const LoDTensor& in, void* out, size_t max_size) override { - PADDLE_ENFORCE(out != nullptr); - PADDLE_ENFORCE(stream_ != nullptr); + PADDLE_ENFORCE_NOT_NULL(out, + platform::errors::InvalidArgument( + "The input param 'out' must not be nullptr.")); + PADDLE_ENFORCE_NOT_NULL(stream_, + platform::errors::PreconditionNotMet( + "You should set up stream_ by SetStream() " + "before you call the operator().")); const auto& place = in.place(); size_t size = in.memory_size(); - PADDLE_ENFORCE_LE(size, max_size); + PADDLE_ENFORCE_LE( + size, max_size, + platform::errors::InvalidArgument( + "The input Tensor in's memory_size shoule be less than or equal to " + "the input max_size. 
But in's memory_size = %u, max_size = %u.", + size, max_size)); if (is_cpu_place(place)) { - PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(out, in.data(), size, - cudaMemcpyHostToDevice, *stream_)); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync( + out, in.data(), size, cudaMemcpyHostToDevice, *stream_)); } else if (is_gpu_place(place)) { - PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(out, in.data(), size, - cudaMemcpyDeviceToDevice, *stream_)); + PADDLE_ENFORCE_EQ( + 0, cudaMemcpyAsync(out, in.data(), size, + cudaMemcpyDeviceToDevice, *stream_), + platform::errors::External( + "cudaMemcpyAsync(cudaMemcpyDeviceToDevice) error.")); } else { - PADDLE_THROW("Unknown device for converter"); + PADDLE_THROW(platform::errors::NotFound("Unknown device for converter")); } cudaStreamSynchronize(*stream_); } // NOTE in is GPU memory. virtual void operator()(const void* in, LoDTensor* out, size_t max_size) override { - PADDLE_ENFORCE(in != nullptr); - PADDLE_ENFORCE(stream_ != nullptr); + PADDLE_ENFORCE_NOT_NULL(in, + platform::errors::InvalidArgument( + "The input param 'in' must not be nullptr.")); + PADDLE_ENFORCE_NOT_NULL(stream_, + platform::errors::PreconditionNotMet( + "You should set up stream_ by SetStream() " + "before you call the operator().")); const auto& place = out->place(); size_t size = out->memory_size(); - PADDLE_ENFORCE_LE(size, max_size); + PADDLE_ENFORCE_LE( + size, max_size, + platform::errors::InvalidArgument( + "The input Tensor out's memory_size shoule be less than or equal " + "to the input max_size. 
" + "But out's memory_size = %u, max_size = %u.", + size, max_size)); if (is_cpu_place(place)) { PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(out->data(), in, size, - cudaMemcpyDeviceToHost, *stream_)); + cudaMemcpyDeviceToHost, *stream_), + platform::errors::External( + "cudaMemcpyAsync(cudaMemcpyDeviceToHost) error.")); } else if (is_gpu_place(place)) { - PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(out->data(), in, size, - cudaMemcpyDeviceToDevice, *stream_)); + PADDLE_ENFORCE_EQ( + 0, cudaMemcpyAsync(out->data(), in, size, + cudaMemcpyDeviceToDevice, *stream_), + platform::errors::External( + "cudaMemcpyAsync(cudaMemcpyDeviceToDevice) error.")); } else { - PADDLE_THROW("Unknown device for converter"); + PADDLE_THROW(platform::errors::NotFound("Unknown device for converter")); } cudaStreamSynchronize(*stream_); } diff --git a/paddle/fluid/inference/tensorrt/convert/io_converter.h b/paddle/fluid/inference/tensorrt/convert/io_converter.h index 5daa242f6ab802a50fa6105f0102b817b700f461..58c178028b8b275b57f5c298534bd1d31aede234 100644 --- a/paddle/fluid/inference/tensorrt/convert/io_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/io_converter.h @@ -44,10 +44,14 @@ class EngineIOConverter { static void ConvertInput(const std::string& op_type, const LoDTensor& in, void* out, size_t max_size, cudaStream_t* stream) { - PADDLE_ENFORCE(stream != nullptr); + PADDLE_ENFORCE_NOT_NULL(stream, + platform::errors::InvalidArgument( + "The input stream must not be nullptr.")); auto* converter = Registry::Global().Lookup( op_type, "default" /* default_type */); - PADDLE_ENFORCE_NOT_NULL(converter); + PADDLE_ENFORCE_NOT_NULL( + converter, platform::errors::Unimplemented( + "The %s in is not supported yet.", op_type.c_str())); converter->SetStream(stream); (*converter)(in, out, max_size); } @@ -55,10 +59,14 @@ class EngineIOConverter { static void ConvertOutput(const std::string& op_type, const void* in, LoDTensor* out, size_t max_size, cudaStream_t* stream) { - PADDLE_ENFORCE(stream != 
nullptr); + PADDLE_ENFORCE_NOT_NULL(stream, + platform::errors::InvalidArgument( + "The input stream must not be nullptr.")); auto* converter = Registry::Global().Lookup( op_type, "default" /* default_type */); - PADDLE_ENFORCE_NOT_NULL(converter); + PADDLE_ENFORCE_NOT_NULL( + converter, platform::errors::Unimplemented( + "The %s in not supported yet.", op_type.c_str())); converter->SetStream(stream); (*converter)(in, out, max_size); } diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index f4b0f5f23d8fda064c29534b56868beae79f65c0..ac0a04b9a116d907fd69c0ca58d3ae7e82921dab 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -53,7 +53,12 @@ class OpConverter { OpConverter* it{nullptr}; if (op_desc.Type() == "mul") { - PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL); + PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL, + platform::errors::InvalidArgument( + "The input op mul's Input(\"Y\")." + "size() should equal to 1, but reveceid " + "Input(\"Y\").size() = %u.", + op_desc.Input("Y").size())); std::string Y = op_desc.Input("Y")[0]; if (parameters.count(Y)) { it = Registry::Global().Lookup("fc"); @@ -66,38 +71,51 @@ class OpConverter { // static std::unordered_set add_weight_op_set {"add", "mul", // "sub", "div"}; static std::unordered_set add_weight_op_set{"add", "mul"}; - PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL); + PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL, + platform::errors::InvalidArgument( + "The input op's Input(\"Y\")." 
+ "size() should equal to 1, but reveceid " + "Input(\"Y\").size() = %u.", + op_desc.Input("Y").size())); int op_type_len = op_desc.Type().size(); std::string op_type = op_desc.Type().substr(op_type_len - 3, op_type_len); std::string Y = op_desc.Input("Y")[0]; if (parameters.count(Y)) { - PADDLE_ENFORCE(add_weight_op_set.count(op_type) > 0, - "Unsupported elementwise type" + op_type); + PADDLE_ENFORCE_GT( + add_weight_op_set.count(op_type), 0, + platform::errors::Unimplemented("Unsupported elementwise type %s", + op_type.c_str())); it = Registry::Global().Lookup("elementwise_" + op_type + "_weight"); - PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", - op_desc.Type()); + PADDLE_ENFORCE_NOT_NULL( + it, platform::errors::Unimplemented( + "no OpConverter for optype [%s]", op_desc.Type())); } else { - PADDLE_ENFORCE(add_tensor_op_set.count(op_type) > 0, - "Unsupported elementwise type" + op_type); + PADDLE_ENFORCE_GT( + add_tensor_op_set.count(op_type), 0, + platform::errors::Unimplemented("Unsupported elementwise type %s", + op_type.c_str())); it = Registry::Global().Lookup("elementwise_" + op_type + "_tensor"); } - PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", - op_desc.Type()); + PADDLE_ENFORCE_NOT_NULL( + it, platform::errors::Unimplemented("no OpConverter for optype [%s]", + op_desc.Type())); } if (op_desc.Type() == "depthwise_conv2d") { it = Registry::Global().Lookup("conv2d"); - PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", - op_desc.Type()); + PADDLE_ENFORCE_NOT_NULL( + it, platform::errors::Unimplemented("no OpConverter for optype [%s]", + op_desc.Type())); } if (!it) { it = Registry::Global().Lookup(op_desc.Type()); } - PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", - op_desc.Type()); + PADDLE_ENFORCE_NOT_NULL( + it, platform::errors::Unimplemented("no OpConverter for optype [%s]", + op_desc.Type())); it->SetEngine(engine); (*it)(op, scope, test_mode); @@ -149,9 +167,13 @@ class OpConverter { for 
(auto& input : inputs) { if (parameters.count(input)) continue; auto* var = block_desc->FindVar(input); - PADDLE_ENFORCE(var, "no variable called %s", input); - PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR, - "TensorRT engine only takes LoDTensor as input"); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::NotFound("no variable called %s in block.", + input.c_str())); + PADDLE_ENFORCE_EQ( + var->GetType(), FluidDT::VarType_Type_LOD_TENSOR, + platform::errors::InvalidArgument("TensorRT engine only takes " + "LoDTensor as input")); auto var_shape = var->GetShape(); if (engine->with_dynamic_shape()) { #if IS_TRT_VERSION_GE(6000) diff --git a/paddle/fluid/inference/tensorrt/convert/pad_op.cc b/paddle/fluid/inference/tensorrt/convert/pad_op.cc index a1b0f3b4310a020d4bbf8d7c04c9447d3e0e72f7..dd594404d3316ada6e20624c074368f241ca5cdd 100644 --- a/paddle/fluid/inference/tensorrt/convert/pad_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pad_op.cc @@ -39,9 +39,22 @@ class PadOpConverter : public OpConverter { nvinfer1::Dims input_shape = input->getDimensions(); int nbDims = input_shape.nbDims; int pad_size = static_cast(paddings.size()); - PADDLE_ENFORCE_GE(nbDims, 2); - PADDLE_ENFORCE_EQ((nbDims + 1) * 2, pad_size); - PADDLE_ENFORCE(pad_value == 0.0, "The pad layer of TRT only support zero."); + PADDLE_ENFORCE_GE( + nbDims, 2, + platform::errors::InvalidArgument( + "Input X[0]'s dimension should greater than or equal to 2. " + "But received %d.", + nbDims)); + PADDLE_ENFORCE_EQ( + (nbDims + 1) * 2, pad_size, + platform::errors::InvalidArgument("Input X[0]'s dimension(nbDims for " + "short) should meet the condition:" + "(nbDims + 1) * 2 == pad_size. 
But " + "received nbDims:%d, pad_size:%d.", + nbDims, pad_size)); + PADDLE_ENFORCE_EQ(pad_value, 0.0, + platform::errors::InvalidArgument( + "The pad layer of TRT only support zero.")); nvinfer1::DimsHW pre_pad(paddings[pad_size - 4], paddings[pad_size - 2]); nvinfer1::DimsHW post_pad(paddings[pad_size - 3], paddings[pad_size - 1]); @@ -50,7 +63,9 @@ class PadOpConverter : public OpConverter { *const_cast(input), pre_pad, post_pad); - PADDLE_ENFORCE(layer != nullptr); + PADDLE_ENFORCE_NOT_NULL(layer, + platform::errors::External( + "add padding layer to tensorrt engine error")); auto output_name = op_desc.Output("Out")[0]; RreplenishLayerAndOutput(layer, "pad", {output_name}, test_mode); } diff --git a/paddle/fluid/inference/tensorrt/convert/swish_op.cc b/paddle/fluid/inference/tensorrt/convert/swish_op.cc index 4b3e1c9e70a4a94808c94c81fcc773482f0574e4..e220d80f0d79da5eab98aa7a18a5093f9f4a55c4 100644 --- a/paddle/fluid/inference/tensorrt/convert/swish_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/swish_op.cc @@ -28,11 +28,20 @@ class SwishOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); // Declare inputs int input_num = op_desc.Input("X").size(); - PADDLE_ENFORCE(input_num == 1); + PADDLE_ENFORCE_EQ(input_num, 1, + platform::errors::InvalidArgument( + "The input X's size must equal to 1 in TRT swish op." + " But received X's size %d.", + input_num)); auto* input = engine_->GetITensor(op_desc.Input("X")[0]); // Get output size_t output_num = op_desc.Output("Out").size(); - PADDLE_ENFORCE(output_num == 1); + PADDLE_ENFORCE_EQ( + output_num, 1UL, + platform::errors::InvalidArgument( + "The ouput Out's size must equal to 1 in TRT swish op. 
" + "But received Out's size %u.", + output_num)); // Get attrs float beta = BOOST_GET_CONST(float, op_desc.GetAttr("beta")); diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h index 3c48c8192f6b06e5a0ba005738383b46bc550ecb..cfb25eb2ba82763950babda5385649d31d2e9185 100644 --- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h +++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h @@ -49,7 +49,10 @@ void RandomizeTensor(framework::LoDTensor* tensor, const platform::Place& place, const platform::DeviceContext& ctx) { auto dims = tensor->dims(); size_t num_elements = analysis::AccuDims(dims, dims.size()); - PADDLE_ENFORCE_GT(num_elements, 0); + PADDLE_ENFORCE_GT( + num_elements, 0UL, + platform::errors::PermissionDenied("RandomizeTensor only can be used for " + "tensor which dims is not zero.")); platform::CPUPlace cpu_place; framework::LoDTensor temp_tensor; @@ -79,7 +82,8 @@ class TRTConvertValidation { scope_(scope), if_add_batch_(if_add_batch), max_batch_size_(max_batch_size) { - PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0); + PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0, + platform::errors::External("cudaStreamCreate error.")); engine_.reset(new TensorRTEngine(max_batch_size, workspace_size)); engine_->InitNetwork(); } @@ -154,7 +158,12 @@ class TRTConvertValidation { void Execute(int batch_size, std::unordered_set neglected_output = {}) { // Execute Fluid Op - PADDLE_ENFORCE_LE(batch_size, max_batch_size_); + PADDLE_ENFORCE_LE(batch_size, max_batch_size_, + platform::errors::InvalidArgument( + "Runtime batch_size should be less than or equal to " + "max_batch_size_. 
" + "But received batch_size:%d, max_batch_size_:%d", + batch_size, max_batch_size_)); platform::CUDADeviceContext ctx(place_); op_->Run(scope_, place_); cudaStreamSynchronize(stream_); diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index 5eb4880a9c584bb98ab2fc885295b5322bcbc555..6154c10d54587356ceb08982fa83b4cfb03b8f20 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -63,11 +63,13 @@ void TensorRTEngine::Execute(int batch_size, std::vector *buffers, void TensorRTEngine::FreezeNetwork() { freshDeviceId(); VLOG(3) << "TRT to freeze network"; - PADDLE_ENFORCE(infer_builder_ != nullptr, - "Call InitNetwork first to initialize network."); - PADDLE_ENFORCE_EQ(network() != nullptr, true, - platform::errors::InvalidArgument( - "Call InitNetwork first to initialize network.")); + PADDLE_ENFORCE_NOT_NULL(infer_builder_, + platform::errors::InvalidArgument( + "Inference builder of TRT is null. Please make " + "sure you call InitNetwork first.")); + PADDLE_ENFORCE_NOT_NULL(network(), + platform::errors::InvalidArgument( + "Call InitNetwork first to initialize network.")); // build engine. infer_builder_->setMaxBatchSize(max_batch_); infer_builder_->setMaxWorkspaceSize(max_workspace_); @@ -211,7 +213,10 @@ void TensorRTEngine::FreezeNetwork() { } else { infer_engine_.reset(infer_builder_->buildCudaEngine(*network())); } - PADDLE_ENFORCE(infer_engine_ != nullptr, "build cuda engine failed!"); + PADDLE_ENFORCE_NOT_NULL( + infer_engine_, platform::errors::Fatal( + "Build TensorRT cuda engine failed! 
Please recheck " + "you configurations related to paddle-TensorRT.")); } nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name, @@ -221,8 +226,16 @@ nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name, platform::errors::InvalidArgument( "The TRT network should be initialized first.")); auto *input = network()->addInput(name.c_str(), dtype, dims); - PADDLE_ENFORCE(input, "infer network add input %s failed", name); - PADDLE_ENFORCE(input->isNetworkInput()); + PADDLE_ENFORCE_NOT_NULL( + input, platform::errors::InvalidArgument("Adding input %s failed in " + "TensorRT inference network. " + "Please recheck your input.", + name)); + PADDLE_ENFORCE_EQ(input->isNetworkInput(), true, + platform::errors::InvalidArgument( + "Input %s is not the input of TRT inference network. " + "Please recheck your input.", + name)); TensorRTEngine::SetITensor(name, input); return input; } @@ -231,31 +244,53 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer *layer, int offset, const std::string &name) { auto *output = layer->getOutput(offset); SetITensor(name, output); - PADDLE_ENFORCE(output != nullptr); + PADDLE_ENFORCE_NOT_NULL( + output, platform::errors::InvalidArgument( + "The output %s of TRT engine should not be null.", name)); output->setName(name.c_str()); - PADDLE_ENFORCE(!output->isNetworkInput()); + PADDLE_ENFORCE_EQ(output->isNetworkInput(), false, + platform::errors::InvalidArgument( + "The output %s of TRT engine should not be the input " + "of the network at the same time.", + name)); network()->markOutput(*output); - PADDLE_ENFORCE(output->isNetworkOutput()); + PADDLE_ENFORCE_EQ( + output->isNetworkOutput(), true, + platform::errors::InvalidArgument( + "The output %s of TRT engine should be the output of the network.", + name)); } void TensorRTEngine::DeclareOutput(const std::string &name) { auto *output = TensorRTEngine::GetITensor(name); - PADDLE_ENFORCE(output != nullptr); + PADDLE_ENFORCE_NOT_NULL( + output, 
platform::errors::InvalidArgument( + "The output %s of TRT engine should not be null.", name)); output->setName(name.c_str()); - PADDLE_ENFORCE(!output->isNetworkInput()); + PADDLE_ENFORCE_EQ(output->isNetworkInput(), false, + platform::errors::InvalidArgument( + "The output %s of TRT engine should not be the input " + "of the network at the same time.", + name)); network()->markOutput(*output); } void TensorRTEngine::SetITensor(const std::string &name, nvinfer1::ITensor *tensor) { - PADDLE_ENFORCE(tensor != nullptr); - PADDLE_ENFORCE_EQ(0, itensor_map_.count(name), "duplicate ITensor name %s", - name); + PADDLE_ENFORCE_NOT_NULL( + tensor, platform::errors::InvalidArgument( + "Tensor named %s of TRT engine should not be null.", name)); + PADDLE_ENFORCE_EQ( + 0, itensor_map_.count(name), + platform::errors::InvalidArgument( + "Tensor named %s of TRT engine should not be duplicated", name)); itensor_map_[name] = tensor; } nvinfer1::ITensor *TensorRTEngine::GetITensor(const std::string &name) { - PADDLE_ENFORCE(itensor_map_.count(name), "no ITensor %s", name); + PADDLE_ENFORCE_EQ(itensor_map_.count(name), true, + platform::errors::NotFound( + "Tensor named %s is not found in TRT engine", name)); return itensor_map_[name]; } @@ -272,11 +307,11 @@ float *TensorRTEngine::GetWeightCPUData(const std::string &name, std::string splitter = "__"; std::string name_with_suffix = name + splitter + name_suffix; platform::CPUPlace cpu_place; - PADDLE_ENFORCE_EQ( - weight_map.count(name_with_suffix), 0, - "During TRT Op converter: We set weight %s with the same name " - "twice into the weight_map", - name_with_suffix); + PADDLE_ENFORCE_EQ(weight_map.count(name_with_suffix), 0, + platform::errors::AlreadyExists( + "The weight named %s is set into the weight map " + "twice in TRT OP converter.", + name_with_suffix)); weight_map[name_with_suffix].reset(new framework::Tensor()); weight_map[name_with_suffix]->Resize(weight_tensor->dims()); TensorCopySync(*weight_tensor, cpu_place, 
weight_map[name_with_suffix].get()); @@ -298,7 +333,10 @@ nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin( void TensorRTEngine::freshDeviceId() { int count; cudaGetDeviceCount(&count); - PADDLE_ENFORCE_LT(device_id_, count); + PADDLE_ENFORCE_LT(device_id_, count, + platform::errors::OutOfRange( + "Device id %d exceeds the current device count: %d.", + device_id_, count)); cudaSetDevice(device_id_); } diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index 22fed0b69763900e4f9efd8cbd61ef7d53d46ce0..d153e637ce15bee1f8a789f69ea63d28704a4b9d 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -199,8 +199,10 @@ class TensorRTEngine { } nvinfer1::IHostMemory* Serialize() { - PADDLE_ENFORCE(infer_engine_ != nullptr, - "You should build engine first and then serialize"); + PADDLE_ENFORCE_NOT_NULL( + infer_engine_, + platform::errors::InvalidArgument( + "The TensorRT engine must be built first before serialization")); ihost_memory_.reset(infer_engine_->serialize()); return ihost_memory_.get(); } @@ -225,8 +227,14 @@ class TensorRTEngine { engine_serialized_data.c_str(), engine_serialized_data.size(), &inference::Singleton::Global())); } - PADDLE_ENFORCE(infer_engine_ != nullptr, - "build cuda engine failed when deserialize engine info.!"); + PADDLE_ENFORCE_NOT_NULL( + infer_engine_, + platform::errors::Fatal( + "Building TRT cuda engine failed when deserializing engine info. " + "Please check:\n1. Your TRT serialization is generated and loaded " + "on the same GPU architecture;\n2. 
The Paddle Inference version of " + "generating serialization file and doing inference are " + "consistent.")); } void SetRuntimeBatch(size_t batch_size); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 6eec6b8ab7a4eca3c81dc3a74c1c7d61ec6528f1..47dbccd96572a21a9ca4480ac7075443bf99cb9f 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -31,6 +31,7 @@ struct SimpleOpTypeSetTeller : public Teller { teller_set.insert("fused_embedding_eltwise_layernorm"); teller_set.insert("multihead_matmul"); teller_set.insert("skip_layernorm"); + teller_set.insert("slice"); #endif } diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu index 0ec803fe64afadd970777e3b0d0ab5d37fcc4d22..457d9dd87375477926480bce0a84e8f89c409698 100644 --- a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu @@ -56,14 +56,27 @@ __global__ void elementwise_kernel(const size_t total, const T *x_data, nvinfer1::Dims ElementWisePlugin::getOutputDimensions( int index, const nvinfer1::Dims *input_dims, int num_inputs) { - PADDLE_ENFORCE_EQ(index, 0); - PADDLE_ENFORCE_EQ(num_inputs, 2); - PADDLE_ENFORCE_NOT_NULL(input_dims); + PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( + "There is only one output in TRT elementwise " + "op plugin, but got output index: %d.", + index)); + PADDLE_ENFORCE_EQ(num_inputs, 2, platform::errors::InvalidArgument( + "There are 2 inputs in TRT elementwise " + "op plugin, but got input number: %d.", + num_inputs)); + PADDLE_ENFORCE_NOT_NULL( + input_dims, + platform::errors::InvalidArgument( + "The input dims of TRT elementwise op plugin should not be null.")); return input_dims[0]; } int ElementWisePlugin::initialize() { - PADDLE_ENFORCE_GT(dims_y_.nbDims, 0); + 
PADDLE_ENFORCE_GT(dims_y_.nbDims, 0, + platform::errors::InvalidArgument( + "The dimension of input Y of TRT elementwise op plugin " + "should be greater than 0, but got %d.", + dims_y_.nbDims)); axis_ = (axis_ == -1) ? dims_x_.nbDims - dims_y_.nbDims : axis_; int trimed_nb_dims = dims_y_.nbDims; @@ -74,8 +87,18 @@ int ElementWisePlugin::initialize() { } dims_y_.nbDims = trimed_nb_dims; - PADDLE_ENFORCE_GE(dims_x_.nbDims, dims_y_.nbDims + axis_); - PADDLE_ENFORCE_LT(axis_, dims_x_.nbDims); + PADDLE_ENFORCE_GE(dims_x_.nbDims, dims_y_.nbDims + axis_, + platform::errors::InvalidArgument( + "We expect [number of x dims] >= [number of y dims + " + "axis] in TRT elementwise op plugin, but got [number " + "of x dims] = %d, [number of y dims + axis] = %d.", + dims_x_.nbDims, dims_y_.nbDims + axis_)); + PADDLE_ENFORCE_LT( + axis_, dims_x_.nbDims, + platform::errors::InvalidArgument("We expect [axis] < [number of x dims] " + "in TRT elementwise op plugin, but got " + "[axis] = %d, [number of x dims] = %d.", + axis_, dims_x_.nbDims)); prev_size_ = 1; midd_size_ = 1; @@ -86,7 +109,9 @@ int ElementWisePlugin::initialize() { for (int i = 0; i < dims_y_.nbDims; ++i) { PADDLE_ENFORCE_EQ(dims_x_.d[i + axis_], dims_y_.d[i], - "Broadcast dimension mismatch."); + platform::errors::InvalidArgument( + "Broadcast dimension mismatch. 
The dims of input Y " + "should be a subsequence of X.")); midd_size_ *= dims_y_.d[i]; } @@ -221,7 +246,10 @@ int ElementwisePluginDynamic::enqueue( elementwise_kernel<<>>( num, x, y, out, prev_size, midd_size, post_size, details::Mul()); } else { - PADDLE_THROW("Not implemented."); + PADDLE_THROW(platform::errors::Unimplemented( + "Paddle-TRT only support elementwise operation: {add, mul} currently, " + "but got %s.", + type_)); } return cudaGetLastError() != cudaSuccess; diff --git a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu index 5e43be90de3dbbfef3c7d3def7e37904bb644380..873631fea614cc18cdc2b2b2f27d2480aa71d50b 100644 --- a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu @@ -32,13 +32,34 @@ namespace plugin { #if IS_TRT_VERSION_GE(6000) template -int EmbEltwiseLayernormPluginDynamic::initialize() { +EmbEltwiseLayernormPluginDynamicImpl< + T>::~EmbEltwiseLayernormPluginDynamicImpl() { + this->terminate(); +} + +inline half fp32tofp16(float x) { return static_cast(x); } + +template +int EmbEltwiseLayernormPluginDynamicImpl::initialize() { embs_gpu_.resize(embs_.size()); for (int i = 0; i < embs_.size(); i++) { if (embs_[i]) { - cudaMalloc(&embs_gpu_[i], sizeof(float) * emb_sizes_[i]); - cudaMemcpy(embs_gpu_[i], embs_[i], emb_sizes_[i] * sizeof(float), + T *host_ptr; + auto size = emb_sizes_[i]; + + if (std::is_same::value) { + host_ptr = new T[size]; + std::transform(embs_[i], (embs_[i] + size), host_ptr, fp32tofp16); + } else { + host_ptr = reinterpret_cast(embs_[i]); + } + + cudaMalloc(&embs_gpu_[i], sizeof(T) * size); + cudaMemcpy(embs_gpu_[i], host_ptr, size * sizeof(T), cudaMemcpyHostToDevice); + if (std::is_same::value) { + delete[] host_ptr; + } } } @@ -53,11 +74,105 @@ int EmbEltwiseLayernormPluginDynamic::initialize() { cudaMemcpyHostToDevice); } + int 
input_num = embs_.size(); + in_ptr_tensor_.Resize({input_num}); + emb_ptr_tensor_.Resize({input_num}); + + cudaGetDevice(&device_id_); + auto emb_ptr_gpu_d = + emb_ptr_tensor_.mutable_data(platform::CUDAPlace(device_id_)); + cudaMemcpy(emb_ptr_gpu_d, embs_gpu_.data(), sizeof(uintptr_t) * input_num, + cudaMemcpyHostToDevice); + return 0; } template -nvinfer1::DimsExprs EmbEltwiseLayernormPluginDynamic::getOutputDimensions( +void EmbEltwiseLayernormPluginDynamicImpl::terminate() { + for (int i = 0; i < embs_gpu_.size(); ++i) { + if (embs_gpu_[i]) { + cudaFree(embs_gpu_[i]); + embs_gpu_[i] = nullptr; + } + } + + if (bias_gpu_) { + cudaFree(bias_gpu_); + bias_gpu_ = nullptr; + } + + if (scale_gpu_) { + cudaFree(scale_gpu_); + scale_gpu_ = nullptr; + } +} + +template +int EmbEltwiseLayernormPluginDynamicImpl::enqueue( + const nvinfer1::PluginTensorDesc *input_desc, + const nvinfer1::PluginTensorDesc *output_desc, const void *const *inputs, + void *const *outputs, void *workspace, cudaStream_t stream) { + auto id_dims = input_desc[0].dims; + int batch = id_dims.d[0]; + int seq_len = id_dims.d[1]; + int input_num = embs_.size(); + + auto in_ptr_gpu_d = + in_ptr_tensor_.mutable_data(platform::CUDAPlace(device_id_)); + auto emb_ptr_gpu_d = + emb_ptr_tensor_.mutable_data(platform::CUDAPlace(device_id_)); + + auto new_input_ptr = reinterpret_cast(inputs[0]); + + if (old_input_ptr_ != new_input_ptr) { + old_input_ptr_ = new_input_ptr; + + cudaMemcpyAsync(in_ptr_gpu_d, reinterpret_cast(inputs), + sizeof(uintptr_t) * input_num, cudaMemcpyHostToDevice, + stream); + } + + auto out_type = output_desc[0].type; + + if (std::is_same::value) { + PADDLE_ENFORCE_EQ( + out_type == nvinfer1::DataType::kFLOAT, true, + platform::errors::InvalidArgument( + "The EmbEltwiseLayernorm Plugin only support fp32 input.")); + } else if (std::is_same::value) { + PADDLE_ENFORCE_EQ( + out_type == nvinfer1::DataType::kHALF, true, + platform::errors::InvalidArgument( + "The EmbEltwiseLayernorm Plugin only 
support fp16 input.")); + } else { + PADDLE_THROW(platform::errors::Fatal( + "Unsupport data type, the out type of EmbEltwiseLayernorm should be " + "float or half.")); + } + + auto *output_d = reinterpret_cast(outputs[0]); + + operators::math::EmbEltwiseLayerNormFunctor emb_eltwise_layernorm_func; + emb_eltwise_layernorm_func(batch, seq_len, hidden_size_, in_ptr_gpu_d, + scale_gpu_, bias_gpu_, emb_ptr_gpu_d, output_d, + eps_, input_num, stream); + return cudaGetLastError() != cudaSuccess; +} + +template class EmbEltwiseLayernormPluginDynamicImpl; +#ifdef SUPPORTS_CUDA_FP16 +template class EmbEltwiseLayernormPluginDynamicImpl; +#endif // SUPPORTS_CUDA_FP16 + +int EmbEltwiseLayernormPluginDynamic::initialize() { + impl_->initialize(); + + return 0; +} + +void EmbEltwiseLayernormPluginDynamic::terminate() { impl_->terminate(); } + +nvinfer1::DimsExprs EmbEltwiseLayernormPluginDynamic::getOutputDimensions( int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs, nvinfer1::IExprBuilder &expr_builder) { // NOLINT PADDLE_ENFORCE_EQ(output_index, 0, @@ -76,18 +191,7 @@ nvinfer1::DimsExprs EmbEltwiseLayernormPluginDynamic::getOutputDimensions( return ret; } -template -void EmbEltwiseLayernormPluginDynamic::terminate() { - for (auto ptr : embs_gpu_) { - if (ptr) cudaFree(ptr); - } - - if (bias_gpu_) cudaFree(bias_gpu_); - if (scale_gpu_) cudaFree(scale_gpu_); -} - -template -bool EmbEltwiseLayernormPluginDynamic::supportsFormatCombination( +bool EmbEltwiseLayernormPluginDynamic::supportsFormatCombination( int pos, const nvinfer1::PluginTensorDesc *in_out, int nb_inputs, int nb_outputs) { PADDLE_ENFORCE_NOT_NULL( @@ -98,6 +202,11 @@ bool EmbEltwiseLayernormPluginDynamic::supportsFormatCombination( "The EmbEltwiseLayerNorm's output should be one" "but it's (%d) outputs.", nb_outputs)); + PADDLE_ENFORCE_EQ(nb_outputs, 1, + platform::errors::InvalidArgument( + "The EmbEltwiseLayerNorm's output should be one" + "but it's (%d) outputs.", + nb_outputs)); 
PADDLE_ENFORCE_LT( pos, nb_inputs + nb_outputs, platform::errors::InvalidArgument("The pos(%d) should be less than the " @@ -122,7 +231,7 @@ bool EmbEltwiseLayernormPluginDynamic::supportsFormatCombination( } if (pos == all_nums - 1) { - if (sizeof(T) == sizeof(float)) { + if (with_fp16_ == false) { return desc.type == nvinfer1::DataType::kFLOAT; } else { return desc.type == nvinfer1::DataType::kHALF; @@ -131,84 +240,27 @@ bool EmbEltwiseLayernormPluginDynamic::supportsFormatCombination( return false; } -template -nvinfer1::DataType EmbEltwiseLayernormPluginDynamic::getOutputDataType( +nvinfer1::DataType EmbEltwiseLayernormPluginDynamic::getOutputDataType( int index, const nvinfer1::DataType *input_types, int nb_inputs) const { PADDLE_ENFORCE_EQ( index, 0, platform::errors::InvalidArgument( "The EmbEltwiseLayernorm Plugin only has one input, so the " "index value should be 0, but get %d.", index)); - return nvinfer1::DataType::kFLOAT; + if (with_fp16_) + return nvinfer1::DataType::kHALF; + else + return nvinfer1::DataType::kFLOAT; } -template -int EmbEltwiseLayernormPluginDynamic::enqueue( +int EmbEltwiseLayernormPluginDynamic::enqueue( const nvinfer1::PluginTensorDesc *input_desc, const nvinfer1::PluginTensorDesc *output_desc, const void *const *inputs, void *const *outputs, void *workspace, cudaStream_t stream) { - auto id_dims = input_desc[0].dims; - int batch = id_dims.d[0]; - int seq_len = id_dims.d[1]; - int input_num = embs_.size(); - - framework::Tensor in_ptr_tensor, emb_ptr_tensor; - int device_id; - cudaGetDevice(&device_id); - - in_ptr_tensor.Resize({input_num}); - emb_ptr_tensor.Resize({input_num}); - int64_t *in_ptr_gpu_d = - in_ptr_tensor.mutable_data(platform::CUDAPlace(device_id)); - int64_t *emb_ptr_gpu_d = - emb_ptr_tensor.mutable_data(platform::CUDAPlace(device_id)); - - std::vector in_ptr, emb_ptr; - for (int i = 0; i < input_num; i++) { - in_ptr.push_back(reinterpret_cast(inputs[i])); - emb_ptr.push_back(reinterpret_cast(embs_gpu_[i])); - } - 
- cudaMemcpyAsync(in_ptr_gpu_d, in_ptr.data(), sizeof(int64_t) * input_num, - cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(emb_ptr_gpu_d, emb_ptr.data(), sizeof(int64_t) * input_num, - cudaMemcpyHostToDevice, stream); - - auto out_type = output_desc[0].type; - - const unsigned tpb = 256; - const dim3 grid(seq_len, batch, 1); - const dim3 block(tpb, 1, 1); - if (sizeof(T) == sizeof(float)) { - PADDLE_ENFORCE_EQ( - out_type == nvinfer1::DataType::kFLOAT, true, - platform::errors::InvalidArgument( - "The EmbEltwiseLayernorm Plugin only support fp32 input.")); - } else if (sizeof(T) == sizeof(int16_t)) { - PADDLE_ENFORCE_EQ( - out_type == nvinfer1::DataType::kHALF, true, - platform::errors::InvalidArgument( - "The EmbEltwiseLayernorm Plugin only support fp16 input.")); - } else { - PADDLE_THROW(platform::errors::Fatal( - "Unsupport data type, the out type of EmbEltwiseLayernorm should be " - "float or half.")); - } - - T *output_d = static_cast(outputs[0]); - - operators::math::EmbEltwiseLayerNormFunctor emb_eltwise_layernorm_func; - emb_eltwise_layernorm_func(batch, seq_len, hidden_size_, in_ptr_gpu_d, - scale_gpu_, bias_gpu_, emb_ptr_gpu_d, output_d, - eps_, input_num, stream); + impl_->enqueue(input_desc, output_desc, inputs, outputs, workspace, stream); return cudaGetLastError() != cudaSuccess; } -template class EmbEltwiseLayernormPluginDynamic; -#ifdef SUPPORTS_CUDA_FP16 -template class EmbEltwiseLayernormPluginDynamic; -#endif // SUPPORTS_CUDA_FP16 - #endif } // namespace plugin diff --git a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h index 5babd87db0602e973452efa613fcaf9810d29afa..24ca853104e35c26a2f9add57fd2f8bc025646c2 100644 --- a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h @@ -27,14 +27,76 @@ namespace tensorrt { namespace plugin { #if 
IS_TRT_VERSION_GE(6000) + +class EmbEltwiseLayernormPluginDynamicImplBase { + public: + EmbEltwiseLayernormPluginDynamicImplBase() {} + virtual ~EmbEltwiseLayernormPluginDynamicImplBase() {} + + virtual int initialize() = 0; + virtual void terminate() = 0; + virtual int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, void* const* outputs, + void* workspace, cudaStream_t stream) = 0; +}; + template +class EmbEltwiseLayernormPluginDynamicImpl + : public EmbEltwiseLayernormPluginDynamicImplBase { + public: + explicit EmbEltwiseLayernormPluginDynamicImpl(std::vector input_embs, + float* bias, float* scale, + std::vector emb_sizes, + int bias_size, int scale_size, + int hidden_size, float eps) + : embs_(input_embs), + bias_(bias), + scale_(scale), + emb_sizes_(emb_sizes), + bias_size_(bias_size), + scale_size_(scale_size), + hidden_size_(hidden_size), + eps_(eps) {} + + ~EmbEltwiseLayernormPluginDynamicImpl(); + + int initialize(); + void terminate(); + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, void* const* outputs, void* workspace, + cudaStream_t stream); + + private: + std::vector embs_; + float* bias_{nullptr}; + float* scale_{nullptr}; + + // data on devices + float* bias_gpu_{nullptr}; + float* scale_gpu_{nullptr}; + std::vector embs_gpu_; + + std::vector emb_sizes_; + int bias_size_; + int scale_size_; + int hidden_size_; + float eps_; + + framework::Tensor in_ptr_tensor_, emb_ptr_tensor_; + int device_id_{0}; + uintptr_t old_input_ptr_{0}; +}; + class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT { public: explicit EmbEltwiseLayernormPluginDynamic(std::vector input_embs, float* bias, float* scale, std::vector emb_sizes, int bias_size, int scale_size, - int hidden_size, float eps) + int hidden_size, float eps, + bool with_fp16) : embs_(input_embs), bias_(bias), scale_(scale), 
@@ -42,51 +104,81 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT { bias_size_(bias_size), scale_size_(scale_size), hidden_size_(hidden_size), - eps_(eps) {} + eps_(eps), + with_fp16_(with_fp16), + own_host_buff_(false) { + if (with_fp16) { +#ifdef SUPPORTS_CUDA_FP16 + impl_ = new EmbEltwiseLayernormPluginDynamicImpl( + embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_, + hidden_size_, eps_); +#else + PADDLE_THROW(platform::errors::Fatal( + "Unsupported data type, current GPU doesn't support half.")); +#endif // SUPPORTS_CUDA_FP16 + } else { + impl_ = new EmbEltwiseLayernormPluginDynamicImpl( + embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_, + hidden_size_, eps_); + } + } EmbEltwiseLayernormPluginDynamic(void const* serial_data, - size_t serial_length) { + size_t serial_length) + : own_host_buff_(true) { DeserializeValue(&serial_data, &serial_length, &emb_sizes_); - embs_gpu_.resize(emb_sizes_.size()); embs_.resize(emb_sizes_.size()); for (size_t i = 0; i < emb_sizes_.size(); i++) { - cudaMalloc(&embs_gpu_[i], sizeof(float) * emb_sizes_[i]); - cudaMemcpy(embs_gpu_[i], serial_data, emb_sizes_[i] * sizeof(float), - cudaMemcpyHostToDevice); + auto size = emb_sizes_[i]; + auto ptr = new float[size]; + memcpy(ptr, serial_data, sizeof(float) * size); + embs_[i] = ptr; reinterpret_cast(serial_data) += emb_sizes_[i] * sizeof(float); serial_length -= emb_sizes_[i] * sizeof(float); - embs_[i] = nullptr; } DeserializeValue(&serial_data, &serial_length, &bias_size_); DeserializeValue(&serial_data, &serial_length, &scale_size_); - cudaMalloc(&bias_gpu_, sizeof(float) * bias_size_); - cudaMemcpy(bias_gpu_, serial_data, bias_size_ * sizeof(float), - cudaMemcpyHostToDevice); - bias_ = nullptr; + if (bias_size_) { + bias_ = new float[bias_size_]; + memcpy(bias_, serial_data, sizeof(float) * bias_size_); + } reinterpret_cast(serial_data) += bias_size_ * sizeof(float); serial_length -= bias_size_ * sizeof(float); - cudaMalloc(&scale_gpu_, 
sizeof(float) * scale_size_); - cudaMemcpy(scale_gpu_, serial_data, scale_size_ * sizeof(float), - cudaMemcpyHostToDevice); - scale_ = nullptr; + if (scale_size_) { + scale_ = new float[scale_size_]; + memcpy(scale_, serial_data, sizeof(float) * scale_size_); + } reinterpret_cast(serial_data) += scale_size_ * sizeof(float); serial_length -= scale_size_ * sizeof(float); DeserializeValue(&serial_data, &serial_length, &hidden_size_); DeserializeValue(&serial_data, &serial_length, &eps_); + DeserializeValue(&serial_data, &serial_length, &with_fp16_); + + if (with_fp16_) { +#ifdef SUPPORTS_CUDA_FP16 + impl_ = new EmbEltwiseLayernormPluginDynamicImpl( + embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_, + hidden_size_, eps_); +#else + PADDLE_THROW(platform::errors::Fatal( + "Unsupported data type, current GPU doesn't support half.")); +#endif // SUPPORTS_CUDA_FP16 + } else { + impl_ = new EmbEltwiseLayernormPluginDynamicImpl( + embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_, + hidden_size_, eps_); + } } nvinfer1::IPluginV2DynamicExt* clone() const override { auto ptr = new EmbEltwiseLayernormPluginDynamic( embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_, hidden_size_, - eps_); - ptr->embs_gpu_ = embs_gpu_; - ptr->bias_gpu_ = bias_gpu_; - ptr->scale_gpu_ = scale_gpu_; + eps_, with_fp16_); return ptr; } @@ -95,6 +187,7 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT { } int getNbOutputs() const override { return 1; } int initialize() override; + void terminate() override; size_t getSerializationSize() const override { int sum_num = 0; @@ -110,24 +203,32 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT { sum_num += (bias_size_ + scale_size_) * sizeof(float); sum_num += SerializedSize(hidden_size_); sum_num += SerializedSize(eps_); - // sum_num += SerializedSize(with_fp16_); + sum_num += SerializedSize(with_fp16_); return sum_num; } - void terminate() override; void serialize(void* buffer) const 
override { - // SerializeValue(&buffer, with_fp16_); SerializeValue(&buffer, emb_sizes_); for (size_t i = 0; i < emb_sizes_.size(); i++) { - SerializeCudaPointer(&buffer, embs_gpu_[i], emb_sizes_[i]); + auto size = emb_sizes_[i]; + for (int j = 0; j < size; ++j) { + SerializeValue(&buffer, embs_[i][j]); + } } SerializeValue(&buffer, bias_size_); SerializeValue(&buffer, scale_size_); - SerializeCudaPointer(&buffer, bias_gpu_, bias_size_); - SerializeCudaPointer(&buffer, scale_gpu_, scale_size_); + for (int i = 0; i < bias_size_; ++i) { + SerializeValue(&buffer, bias_[i]); + } + + for (int i = 0; i < scale_size_; ++i) { + SerializeValue(&buffer, scale_[i]); + } + SerializeValue(&buffer, hidden_size_); SerializeValue(&buffer, eps_); + SerializeValue(&buffer, with_fp16_); } nvinfer1::DimsExprs getOutputDimensions( @@ -158,23 +259,33 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT { const nvinfer1::DataType* input_types, int nb_inputs) const override; - void destroy() override { delete this; } + void destroy() override { + if (own_host_buff_) { + for (auto ptr : embs_) { + delete[] ptr; + } + delete[] bias_; + delete[] scale_; + } + + delete impl_; + delete this; + } private: std::vector embs_; float* bias_; float* scale_; - // data on devices - float* bias_gpu_; - float* scale_gpu_; - std::vector embs_gpu_; - std::vector emb_sizes_; int bias_size_; int scale_size_; int hidden_size_; float eps_; + + bool with_fp16_; + bool own_host_buff_{false}; + EmbEltwiseLayernormPluginDynamicImplBase* impl_{nullptr}; }; class EmbEltwiseLayernormPluginV2Creator : public nvinfer1::IPluginCreator { @@ -198,8 +309,7 @@ class EmbEltwiseLayernormPluginV2Creator : public nvinfer1::IPluginCreator { nvinfer1::IPluginV2* deserializePlugin(const char* name, const void* serial_data, size_t serial_length) override { - return new EmbEltwiseLayernormPluginDynamic(serial_data, - serial_length); + return new EmbEltwiseLayernormPluginDynamic(serial_data, serial_length); } 
void setPluginNamespace(const char* lib_namespace) override { diff --git a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu index 4fb1d8241084d7af787c32949b63819cddbfcb82..5c56270627a6fcb49eb0713d2282c224719fc38d 100644 --- a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu @@ -26,8 +26,10 @@ namespace inference { namespace tensorrt { namespace plugin { -// Dynamic Plugin below. -#if IS_TRT_VERSION_GE(6000) +SlicePlugin *CreateSlicePluginDeserialize(const void *buffer, size_t length) { + return new SlicePlugin(buffer, length); +} +REGISTER_TRT_PLUGIN("slice_plugin", CreateSlicePluginDeserialize); template __global__ void SliceKernel(int num, int dims, const T *input, @@ -56,11 +58,196 @@ __global__ void SliceKernel(int num, int dims, const T *input, } } +SlicePlugin::SlicePlugin(std::vector starts, std::vector ends, + std::vector axes, bool ban_fp16) + : starts_(starts), ends_(ends), axes_(axes), ban_fp16_(ban_fp16) { + cudaEventCreate(©_event_); + cudaStreamCreate(©_stream_); +} + +SlicePlugin::SlicePlugin(void const *serial_data, size_t serial_length) { + deserializeBase(serial_data, serial_length); + DeserializeValue(&serial_data, &serial_length, &starts_); + DeserializeValue(&serial_data, &serial_length, &ends_); + DeserializeValue(&serial_data, &serial_length, &axes_); + DeserializeValue(&serial_data, &serial_length, &ban_fp16_); + cudaEventCreate(©_event_); + cudaStreamCreate(©_stream_); +} + +SlicePlugin::~SlicePlugin() { + cudaStreamDestroy(copy_stream_); + cudaEventDestroy(copy_event_); + cudaFree(offset_temp_data_); +} + +SlicePlugin *SlicePlugin::clone() const { + return new SlicePlugin(starts_, ends_, axes_, ban_fp16_); +} + +bool SlicePlugin::supportsFormat(nvinfer1::DataType type, + nvinfer1::PluginFormat format) const { +#ifdef SUPPORTS_CUDA_FP16 + return ((type == nvinfer1::DataType::kFLOAT || + type == 
nvinfer1::DataType::kHALF) && + (format == nvinfer1::PluginFormat::kNCHW)); +#else + return ((type == nvinfer1::DataType::kFLOAT) && + (format == nvinfer1::PluginFormat::kNCHW)); +#endif +} + +nvinfer1::Dims SlicePlugin::getOutputDimensions(int index, + const nvinfer1::Dims *inputs, + int nb_input_dims) { + auto in_dims = inputs[0]; + nvinfer1::Dims out_dims = in_dims; + for (size_t i = 0; i < axes_.size(); i++) { + int start = starts_[i]; + int end = ends_[i]; + out_dims.d[axes_[i] - 1] = end - start; + } + return out_dims; +} + +int SlicePlugin::enqueue(int batch_size, const void *const *inputs, + void **outputs, void *workspace, cudaStream_t stream) { + auto input_dims = getInputDims(0); + + // notice input dims is [C, H, W], add input batch dim here + auto out_dims = getOutputDimensions(0, &input_dims, 1); + input_dims.nbDims += 1; + out_dims.nbDims += 1; + for (auto i = input_dims.nbDims; i > 0; --i) { + input_dims.d[i] = input_dims.d[i - 1]; + out_dims.d[i] = out_dims.d[i - 1]; + } + input_dims.d[0] = batch_size; + out_dims.d[0] = batch_size; + + auto num_dims = input_dims.nbDims; + size_t out_num = ProductDim(out_dims); + + std::vector seg_offsets; + std::vector offsets; + std::vector extends; + + offsets.resize(num_dims); + extends.resize(num_dims); + seg_offsets.resize(num_dims); + + seg_offsets[num_dims - 1] = 1; + for (int i = num_dims - 2; i >= 0; i--) { + seg_offsets[i] = input_dims.d[i + 1] * seg_offsets[i + 1]; + } + for (size_t i = 0; i < num_dims; ++i) { + offsets[i] = 0; + extends[i] = out_dims.d[i]; + } + for (size_t i = 0; i < axes_.size(); ++i) { + offsets[axes_[i]] = starts_[i]; + } + + std::vector offset_info; + for (size_t i = 0; i < num_dims; ++i) { + offset_info.push_back(offsets[i]); + offset_info.push_back(extends[i]); + offset_info.push_back(seg_offsets[i]); + } + + if (offset_temp_data_ == nullptr) { + cudaMalloc(&offset_temp_data_, 3 * num_dims * sizeof(int)); + } + + cudaMemcpyAsync(offset_temp_data_, offset_info.data(), + 
sizeof(int) * 3 * num_dims, cudaMemcpyHostToDevice, + copy_stream_); + + cudaEventRecord(copy_event_, copy_stream_); + cudaStreamWaitEvent(stream, copy_event_, 0); + + int threads = 256; + int blocks = (out_num + threads - 1) / threads; + auto input_type = getDataType(); + if (input_type == nvinfer1::DataType::kFLOAT) { + const float *input1 = static_cast(inputs[0]); + float *output = static_cast(outputs[0]); + SliceKernel<<>>( + out_num, num_dims, input1, offset_temp_data_, output); + } else if (input_type == nvinfer1::DataType::kHALF) { +#ifdef SUPPORTS_CUDA_FP16 + const half *input1 = static_cast(inputs[0]); + half *output = static_cast(outputs[0]); + SliceKernel<<>>( + out_num, num_dims, input1, offset_temp_data_, output); +#else + PADDLE_THROW(platform::errors::Fatal( + "The cuda archs you specific should greater than 600.")); +#endif + } else { + PADDLE_THROW(platform::errors::Fatal( + "The Slice TRT Plugin's input type should be float or half.")); + } + return cudaGetLastError() != cudaSuccess; +} + +size_t SlicePlugin::getSerializationSize() { + return getBaseSerializationSize() + SerializedSize(getPluginType()) + + SerializedSize(starts_) + SerializedSize(ends_) + + SerializedSize(axes_) + SerializedSize(ban_fp16_); +} + +void SlicePlugin::serialize(void *buffer) { + SerializeValue(&buffer, getPluginType()); + serializeBase(buffer); + SerializeValue(&buffer, starts_); + SerializeValue(&buffer, ends_); + SerializeValue(&buffer, axes_); + SerializeValue(&buffer, ban_fp16_); +} + +// Dynamic Plugin below. 
+#if IS_TRT_VERSION_GE(6000) +SlicePluginDynamic::SlicePluginDynamic(std::vector starts, + std::vector ends, + std::vector axes, bool ban_fp16) + : starts_(starts), ends_(ends), axes_(axes), ban_fp16_(ban_fp16) { + cudaEventCreate(©_event_); + cudaStreamCreate(©_stream_); +} + +SlicePluginDynamic::SlicePluginDynamic(void const *serialData, + size_t serialLength) { + DeserializeValue(&serialData, &serialLength, &starts_); + DeserializeValue(&serialData, &serialLength, &ends_); + DeserializeValue(&serialData, &serialLength, &axes_); + DeserializeValue(&serialData, &serialLength, &ban_fp16_); + cudaEventCreate(©_event_); + cudaStreamCreate(©_stream_); +} + +void SlicePluginDynamic::destroy() { + cudaStreamDestroy(copy_stream_); + cudaEventDestroy(copy_event_); + cudaFree(offset_temp_data_); + delete this; +} + int SlicePluginDynamic::initialize() { return 0; } -size_t SlicePluginDynamic::getSerializationSize() const { return 0; } +size_t SlicePluginDynamic::getSerializationSize() const { + size_t size = SerializedSize(starts_) + SerializedSize(ends_) + + SerializedSize(axes_) + SerializedSize(ban_fp16_); -void SlicePluginDynamic::serialize(void *buffer) const {} + return size; +} + +void SlicePluginDynamic::serialize(void *buffer) const { + SerializeValue(&buffer, starts_); + SerializeValue(&buffer, ends_); + SerializeValue(&buffer, axes_); + SerializeValue(&buffer, ban_fp16_); +} nvinfer1::DimsExprs SlicePluginDynamic::getOutputDimensions( int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs, @@ -136,9 +323,9 @@ int SlicePluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc, std::vector offsets; std::vector extends; - offsets.reserve(num_dims); - extends.reserve(num_dims); - seg_offsets.reserve(num_dims); + offsets.resize(num_dims); + extends.resize(num_dims); + seg_offsets.resize(num_dims); seg_offsets[num_dims - 1] = 1; for (int i = num_dims - 2; i >= 0; i--) { @@ -160,16 +347,16 @@ int SlicePluginDynamic::enqueue(const 
nvinfer1::PluginTensorDesc *input_desc, offset_info.push_back(seg_offsets[i]); } - framework::Tensor offset_temp_tensor; + if (offset_temp_data_ == nullptr) { + cudaMalloc(&offset_temp_data_, 3 * num_dims * sizeof(int)); + } - int device_id; - cudaGetDevice(&device_id); - offset_temp_tensor.Resize({3 * num_dims}); - auto *offset_temp_data = - offset_temp_tensor.mutable_data(platform::CUDAPlace(device_id)); + cudaMemcpyAsync(offset_temp_data_, offset_info.data(), + sizeof(int) * 3 * num_dims, cudaMemcpyHostToDevice, + copy_stream_); - cudaMemcpyAsync(offset_temp_data, offset_info.data(), - sizeof(int) * 3 * num_dims, cudaMemcpyHostToDevice, stream); + cudaEventRecord(copy_event_, copy_stream_); + cudaStreamWaitEvent(stream, copy_event_, 0); int threads = 256; int blocks = (out_num + threads - 1) / threads; @@ -178,13 +365,13 @@ int SlicePluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc, const float *input1 = static_cast(inputs[0]); float *output = static_cast(outputs[0]); SliceKernel<<>>( - out_num, num_dims, input1, offset_temp_data, output); + out_num, num_dims, input1, offset_temp_data_, output); } else if (input_type == nvinfer1::DataType::kHALF) { #ifdef SUPPORTS_CUDA_FP16 const half *input1 = static_cast(inputs[0]); half *output = static_cast(outputs[0]); SliceKernel<<>>( - out_num, num_dims, input1, offset_temp_data, output); + out_num, num_dims, input1, offset_temp_data_, output); #else PADDLE_THROW(platform::errors::Fatal( "The cuda archs you specific should greater than 600.")); diff --git a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h index 13d86df131f6fff58dc896d802c8f3ad959b30bc..e36a270f05d9fee497fa1a033ed16faf08c08225 100644 --- a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h @@ -26,17 +26,56 @@ namespace inference { namespace tensorrt { namespace plugin { +class SlicePlugin : public 
PluginTensorRT { + public: + explicit SlicePlugin(std::vector starts, std::vector ends, + std::vector axes, bool ban_fp16); + + // It was used for tensorrt deserialization. + // It should not be called by users. + SlicePlugin(void const* serial_data, size_t serial_length); + ~SlicePlugin(); + SlicePlugin* clone() const override; + + const char* getPluginType() const override { return "slice_plugin"; } + int getNbOutputs() const override { return 1; } + int initialize() override { return 0; } + bool supportsFormat(nvinfer1::DataType type, + nvinfer1::PluginFormat format) const override; + nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, + int nb_input_dims) override; + int enqueue(int batch_size, const void* const* inputs, void** outputs, + void* workspace, cudaStream_t stream) override; + + protected: + size_t getSerializationSize() override; + + // TRT will call this func to serialize the configuration of TRT + // It should not be called by users. + void serialize(void* buffer) override; + + private: + std::vector starts_; + std::vector ends_; + std::vector axes_; + bool ban_fp16_{false}; + int* offset_temp_data_{nullptr}; + cudaEvent_t copy_event_; + cudaStream_t copy_stream_; +}; + #if IS_TRT_VERSION_GE(6000) class SlicePluginDynamic : public DynamicPluginTensorRT { public: explicit SlicePluginDynamic(std::vector starts, std::vector ends, - std::vector axes, bool ban_fp16) - : starts_(starts), ends_(ends), axes_(axes), ban_fp16_(ban_fp16) {} - SlicePluginDynamic(void const* serialData, size_t serialLength) {} + std::vector axes, bool ban_fp16); + nvinfer1::IPluginV2DynamicExt* clone() const override { return new SlicePluginDynamic(starts_, ends_, axes_, ban_fp16_); } + SlicePluginDynamic(void const* serialData, size_t serialLength); + const char* getPluginType() const override { return "slice_plugin"; } int getNbOutputs() const override { return 1; } int initialize() override; @@ -72,15 +111,54 @@ class SlicePluginDynamic : public 
DynamicPluginTensorRT { const nvinfer1::DataType* inputTypes, int nbInputs) const override; - void destroy() override { delete this; } + void destroy() override; private: std::vector starts_; std::vector ends_; std::vector axes_; - bool ban_fp16_{false}; + int* offset_temp_data_{nullptr}; + cudaEvent_t copy_event_; + cudaStream_t copy_stream_; }; + +class SlicePluginV2Creator : public nvinfer1::IPluginCreator { + public: + SlicePluginV2Creator() {} + const char* getPluginName() const override { return "slice_plugin"; } + + const char* getPluginVersion() const override { return "1"; } + + const nvinfer1::PluginFieldCollection* getFieldNames() override { + return &field_collection_; + } + + nvinfer1::IPluginV2* createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) override { + return nullptr; + } + + nvinfer1::IPluginV2* deserializePlugin(const char* name, + const void* serialData, + size_t serialLength) override { + auto plugin = new SlicePluginDynamic(serialData, serialLength); + return plugin; + } + + void setPluginNamespace(const char* libNamespace) override { + namespace_ = libNamespace; + } + + const char* getPluginNamespace() const override { return namespace_.c_str(); } + + private: + std::string namespace_; + nvinfer1::PluginFieldCollection field_collection_; +}; + +REGISTER_TRT_PLUGIN_V2(SlicePluginV2Creator); + #endif } // namespace plugin diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc index 3c20b6d1e725273dbfdc20c01fb01deea4e8d88e..0bf8a1691e2192b278fcd209162135027ed24e71 100644 --- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc @@ -25,8 +25,10 @@ PluginTensorRT* PluginFactoryTensorRT::createPlugin(const char* layer_name, const char* plugin_type; DeserializeValue(&serial_data, &serial_length, &plugin_type); - PADDLE_ENFORCE(Has(plugin_type), - "trt plugin type %s 
does not exists, check it.", plugin_type); + PADDLE_ENFORCE_EQ( + Has(plugin_type), true, + platform::errors::NotFound("TensorRT plugin type `%s` does not exists.", + plugin_type)); auto plugin = plugin_registry_[plugin_type](serial_data, serial_length); owned_plugins_.emplace_back(plugin); diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h index 18037179c7b98952b6088361954e869ecedfb2c7..16751c764bd03af9bbb7cbd77dd9287c17150dd5 100644 --- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h @@ -103,7 +103,11 @@ struct Serializer, DeserializeValue(buffer, buffer_size, &size); value->resize(size); size_t nbyte = value->size() * sizeof(T); - PADDLE_ENFORCE_GE(*buffer_size, nbyte); + PADDLE_ENFORCE_GE(*buffer_size, nbyte, + platform::errors::InvalidArgument( + "Insufficient data in buffer, expect contains %d " + "byte, but actually only contains %d byte.", + *buffer_size, nbyte)); std::memcpy(value->data(), *buffer, nbyte); reinterpret_cast(*buffer) += nbyte; *buffer_size -= nbyte; diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc index a03dd45db0f80487cb4c2e6b68f94944e8558ae4..72962c733ecf6a7bc6871fd3a5c65d6156b084d4 100644 --- a/paddle/fluid/inference/tensorrt/test_engine.cc +++ b/paddle/fluid/inference/tensorrt/test_engine.cc @@ -74,7 +74,9 @@ TEST_F(TensorRTEngineTest, add_layer) { nvinfer1::DimsCHW{1, 1, 1}); auto *fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, size, weight.get(), bias.get()); - PADDLE_ENFORCE(fc_layer != nullptr); + PADDLE_ENFORCE_NOT_NULL(fc_layer, + platform::errors::InvalidArgument( + "TRT fully connected layer building failed.")); engine_->DeclareOutput(fc_layer, 0, "y"); LOG(INFO) << "freeze network"; @@ -116,7 +118,9 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) { nvinfer1::DimsCHW{1, 2, 1}); auto *fc_layer = 
TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, 2, weight.get(), bias.get()); - PADDLE_ENFORCE(fc_layer != nullptr); + PADDLE_ENFORCE_NOT_NULL(fc_layer, + platform::errors::InvalidArgument( + "TRT fully connected layer building failed.")); engine_->DeclareOutput(fc_layer, 0, "y"); engine_->FreezeNetwork(); @@ -160,7 +164,9 @@ TEST_F(TensorRTEngineTest, test_conv2d) { auto *conv_layer = TRT_ENGINE_ADD_LAYER(engine_, Convolution, *x, 1, nvinfer1::DimsHW{3, 3}, weight.get(), bias.get()); - PADDLE_ENFORCE(conv_layer != nullptr); + PADDLE_ENFORCE_NOT_NULL(conv_layer, + platform::errors::InvalidArgument( + "TRT convolution layer building failed.")); conv_layer->setStride(nvinfer1::DimsHW{1, 1}); conv_layer->setPadding(nvinfer1::DimsHW{1, 1}); @@ -199,7 +205,9 @@ TEST_F(TensorRTEngineTest, test_pool2d) { auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *x, pool_t, nvinfer1::DimsHW{2, 2}); - PADDLE_ENFORCE(pool_layer != nullptr); + PADDLE_ENFORCE_NOT_NULL( + pool_layer, + platform::errors::InvalidArgument("TRT pooling layer building failed.")); pool_layer->setStride(nvinfer1::DimsHW{1, 1}); pool_layer->setPadding(nvinfer1::DimsHW{0, 0}); diff --git a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc index 34b7072b2ee688c2ac01229ff5d3a234af3680b5..743f7740e5faaa1991172ef2a8d1cd38ad47fab5 100644 --- a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc +++ b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc @@ -83,9 +83,8 @@ bool TRTInt8Calibrator::setBatch( engine_name_, it.first)); } const auto& d = dataptr->second; - PADDLE_ENFORCE( - cudaMemcpy(d.first, it.second, d.second, cudaMemcpyDeviceToDevice), - "Fail to cudaMemcpy %s for %s", engine_name_, it.first); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaMemcpy(d.first, it.second, d.second, cudaMemcpyDeviceToDevice)); } data_is_set_ = true; diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt 
index 07af5c152b1cd42d1034ed9c5a1d8d8bc3782827..146d5932577fb7f4e2e33f6d98c51756ffd02073 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -9,7 +9,8 @@ if(WITH_GPU AND TENSORRT_FOUND) endif() function(download_data install_dir data_file) - if (NOT EXISTS ${install_dir}/${data_file}) + string(REGEX MATCH "[^/\\]+$" file_name ${data_file}) + if (NOT EXISTS ${install_dir}/${file_name}) inference_download_and_uncompress(${install_dir} ${INFERENCE_URL} ${data_file}) endif() endfunction() @@ -125,14 +126,16 @@ endfunction() if(NOT APPLE AND WITH_MKLML) # RNN1 set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1") - download_model_and_data(${RNN1_INSTALL_DIR} "rnn1%2Fmodel.tar.gz" "rnn1%2Fdata.txt.tar.gz") + download_model_and_data(${RNN1_INSTALL_DIR} "rnn1/model.tar.gz" "rnn1/data.txt.tar.gz") inference_analysis_api_test(test_analyzer_rnn1 ${RNN1_INSTALL_DIR} analyzer_rnn1_tester.cc) # seq_pool1 set(SEQ_POOL1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_pool") download_model_and_data(${SEQ_POOL1_INSTALL_DIR} "seq_pool1_model_.tar.gz" "seq_pool1_data.txt.tar.gz") inference_analysis_api_test(test_analyzer_seq_pool1 ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_tester.cc) - set_tests_properties(test_analyzer_seq_pool1 PROPERTIES TIMEOUT 150) + if(NOT WIN32) + set_tests_properties(test_analyzer_seq_pool1 PROPERTIES TIMEOUT 150) + endif() else() # TODO: fix this test on MACOS and OPENBLAS, the reason is that # fusion_seqexpand_concat_fc_op is not supported on MACOS and OPENBLAS @@ -192,8 +195,9 @@ download_result(${ERNIE_INSTALL_DIR} "Ernie_large_result.txt.tar.gz") inference_analysis_test(test_analyzer_ernie_large SRCS analyzer_ernie_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${ERNIE_INSTALL_DIR}/model --infer_data=${ERNIE_INSTALL_DIR}/data.txt --refer_result=${ERNIE_INSTALL_DIR}/result.txt --ernie_large=true) - -set_tests_properties(test_analyzer_ernie_large PROPERTIES TIMEOUT 150 
LABELS "RUN_TYPE=NIGHTLY") +if(NOT WIN32 AND NOT APPLE) + set_tests_properties(test_analyzer_ernie_large PROPERTIES TIMEOUT 150 LABELS "RUN_TYPE=NIGHTLY") +endif() # text_classification set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classification") @@ -207,7 +211,7 @@ inference_analysis_api_test(test_analyzer_seq_conv1 ${SEQ_CONV1_INSTALL_DIR} ana # transformer, the dataset only works on batch_size=8 now set(TRANSFORMER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/transformer") -download_model_and_data(${TRANSFORMER_INSTALL_DIR} "temp%2Ftransformer_model.tar.gz" "temp%2Ftransformer_data.txt.tar.gz") +download_model_and_data(${TRANSFORMER_INSTALL_DIR} "temp/transformer_model.tar.gz" "temp/transformer_data.txt.tar.gz") inference_analysis_test(test_analyzer_transformer SRCS analyzer_transformer_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${TRANSFORMER_INSTALL_DIR}/model --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt --batch_size=8 @@ -215,8 +219,8 @@ inference_analysis_test(test_analyzer_transformer SRCS analyzer_transformer_test # ocr set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr") -if (NOT EXISTS ${OCR_INSTALL_DIR}) - inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.bj.bcebos.com/" "inference-vis-demos%2Focr.tar.gz") +if (NOT EXISTS ${OCR_INSTALL_DIR}/ocr.tar.gz) + inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.bj.bcebos.com/" "inference-vis-demos/ocr.tar.gz") endif() inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc) @@ -231,8 +235,8 @@ set_property(TEST test_analyzer_detect PROPERTY ENVIRONMENT GLOG_vmodule=analysi # mobilenet with transpose op set(MOBILENET_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet") -if (NOT EXISTS ${MOBILENET_INSTALL_DIR}) - inference_download_and_uncompress(${MOBILENET_INSTALL_DIR} "http://paddlemodels.bj.bcebos.com/" "inference-vis-demos%2Fmobilenet.tar.gz") +if (NOT EXISTS 
${MOBILENET_INSTALL_DIR}/mobilenet.tar.gz) + inference_download_and_uncompress(${MOBILENET_INSTALL_DIR} "http://paddlemodels.bj.bcebos.com/" "inference-vis-demos/mobilenet.tar.gz") endif() inference_analysis_api_test(test_analyzer_mobilenet_transpose ${MOBILENET_INSTALL_DIR} analyzer_vis_tester.cc) @@ -339,9 +343,9 @@ if(WITH_MKLDNN) ### Lexcial analysis GRU model set(GRU_PATH "${INFERENCE_DEMO_INSTALL_DIR}/gru") download_GRU_data("${GRU_PATH}" "GRU_eval_data.tar.gz") - download_GRU_data("${GRU_PATH}" "GRU_eval_model.tar.gz") + download_GRU_data("${GRU_PATH}" "GRU_eval_model_v2.tar.gz") set(GRU_DATA_PATH "${GRU_PATH}/GRU_eval_data.bin") - set(GRU_MODEL_PATH "${GRU_PATH}/GRU_eval_model") + set(GRU_MODEL_PATH "${GRU_PATH}/GRU_eval_model_v2") set(LEXICAL_TEST_APP "test_analyzer_lexical_analysis") set(LEXICAL_TEST_APP_SRC "analyzer_lexical_analysis_gru_tester.cc") @@ -360,9 +364,12 @@ if(WITH_MKLDNN) inference_analysis_api_test_build(${QUANT_IMG_CLASS_TEST_APP} ${QUANT_IMG_CLASS_TEST_APP_SRC}) # MobileNetV1 FP32 vs. 
Quant INT8 - # The FP32 model should already be downloaded for slim Quant unit tests + # The FP32 model should already be downloaded for slim Quant unit tests on Linux set(QUANT2_MobileNetV1_MODEL_DIR "${QUANT_DATA_DIR}/MobileNetV1_quant2") set(QUANT2_INT8_MobileNetV1_MODEL_DIR "${QUANT_DATA_DIR}/MobileNetV1_quant2_int8") + if(NOT LINUX) + download_quant_data(${QUANT2_MobileNetV1_MODEL_DIR} "MobileNet_qat_perf.tar.gz") + endif(NOT LINUX) download_quant_data(${QUANT2_INT8_MobileNetV1_MODEL_DIR} "MobileNet_qat_perf_int8.tar.gz") inference_analysis_api_quant_test_run(test_analyzer_quant_performance_benchmark ${QUANT_IMG_CLASS_TEST_APP} ${QUANT2_MobileNetV1_MODEL_DIR}/MobileNet_qat_perf/float ${QUANT2_INT8_MobileNetV1_MODEL_DIR}/MobileNet_qat_perf_int8 ${IMAGENET_DATA_PATH}) @@ -395,15 +402,15 @@ inference_analysis_api_test(test_analyzer_bert ${BERT_INSTALL_DIR} analyzer_bert if(WITH_GPU AND TENSORRT_FOUND) set(TRT_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/trt_models") - if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR}) + if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models.tar.gz) inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_inference_test_models.tar.gz") endif() set(TEST_SPLIT_CONVERTER_MODEL "${TRT_MODEL_INSTALL_DIR}/trt_split_op_converter_test") - if (NOT EXISTS ${TEST_SPLIT_CONVERTER_MODEL}) + if (NOT EXISTS ${TEST_SPLIT_CONVERTER_MODEL}/split_converter.tgz) inference_download_and_uncompress(${TEST_SPLIT_CONVERTER_MODEL} ${INFERENCE_URL}/tensorrt_test "split_converter.tgz") endif() set(TEST_INSTANCE_NORM_MODEL "${TRT_MODEL_INSTALL_DIR}/trt_instance_norm_test") - if (NOT EXISTS ${TEST_INSTANCE_NORM_MODEL}) + if (NOT EXISTS ${TEST_INSTANCE_NORM_MODEL}/instance_norm.tgz) inference_download_and_uncompress(${TEST_INSTANCE_NORM_MODEL} ${INFERENCE_URL}/tensorrt_test "instance_norm.tgz") endif() inference_analysis_test(trt_mobilenet_test SRCS trt_mobilenet_test.cc @@ -432,7 +439,7 @@ if(WITH_GPU AND 
TENSORRT_FOUND) ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) set(TRT_MODEL_QUANT_RESNET_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_quant_model") - if (NOT EXISTS ${TRT_MODEL_QUANT_RESNET_DIR}) + if (NOT EXISTS ${TRT_MODEL_QUANT_RESNET_DIR}/small_quant_model.tgz) inference_download_and_uncompress(${INFERENCE_DEMO_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "small_quant_model.tgz") endif() inference_analysis_test(trt_quant_int8_test SRCS trt_quant_int8_test.cc @@ -440,7 +447,7 @@ if(WITH_GPU AND TENSORRT_FOUND) ARGS --infer_model=${TRT_MODEL_QUANT_RESNET_DIR}) set(TRT_MODEL_QUANT_YOLOV3_DIR "${INFERENCE_DEMO_INSTALL_DIR}/yolov3_r50_quant_aware") - if (NOT EXISTS ${TRT_MODEL_QUANT_YOLOV3_DIR}) + if (NOT EXISTS ${TRT_MODEL_QUANT_YOLOV3_DIR}/yolov3_r50_quant_aware.tgz) inference_download_and_uncompress(${INFERENCE_DEMO_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "yolov3_r50_quant_aware.tgz") endif() inference_analysis_test(trt_quant_int8_yolov3_r50_test SRCS trt_quant_int8_yolov3_r50_test.cc @@ -448,12 +455,12 @@ if(WITH_GPU AND TENSORRT_FOUND) ARGS --infer_model=${TRT_MODEL_QUANT_YOLOV3_DIR}) set(TEST_TRT_DYNAMIC_MODEL2 "${TRT_MODEL_INSTALL_DIR}/complex_model_dynamic") - if (NOT EXISTS ${TEST_TRT_DYNAMIC_MODEL2}) + if (NOT EXISTS ${TEST_TRT_DYNAMIC_MODEL2}/complex_model_dynamic2.tar.gz) inference_download_and_uncompress(${TEST_TRT_DYNAMIC_MODEL2} ${INFERENCE_URL}/tensorrt_test "complex_model_dynamic2.tar.gz") endif() set(TEST_TRT_DYNAMIC_MODEL "${TRT_MODEL_INSTALL_DIR}/conv_bn_swish_split_gelu") - if (NOT EXISTS ${TEST_TRT_DYNAMIC_MODEL}) + if (NOT EXISTS ${TEST_TRT_DYNAMIC_MODEL}/conv_bn_swish_split_gelu.tar.gz) inference_download_and_uncompress(${TEST_TRT_DYNAMIC_MODEL} ${INFERENCE_URL}/tensorrt_test "conv_bn_swish_split_gelu.tar.gz") endif() inference_analysis_test(trt_dynamic_shape_test SRCS trt_dynamic_shape_test.cc @@ -461,7 +468,7 @@ if(WITH_GPU AND TENSORRT_FOUND) ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}) set(TEST_TRT_ERNIE_MODEL 
"${TRT_MODEL_INSTALL_DIR}/ernie_test") - if (NOT EXISTS ${TEST_TRT_ERNIE_MODEL}) + if (NOT EXISTS ${TEST_TRT_ERNIE_MODEL}/ernie_model_4.tar.gz) inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4.tar.gz") endif() @@ -470,7 +477,7 @@ if(WITH_GPU AND TENSORRT_FOUND) ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4) set(TEST_TRT_ERNIE_UNSER_MODEL "${TRT_MODEL_INSTALL_DIR}/ernie_test/ernie_model_4_unserialized/") - if (NOT EXISTS ${TEST_TRT_ERNIE_UNSER_MODEL}) + if (NOT EXISTS ${TEST_TRT_ERNIE_UNSER_MODEL}/ernie_model_4_unserialized.tgz) inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_unserialized.tgz") endif() diff --git a/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc b/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc index f956c34f23ac7cc6ca06b9fcf411d0f2e9b29c54..2570325c24abcbb4bd459944480d3279f24fab1f 100644 --- a/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc @@ -245,8 +245,14 @@ TEST(Analyzer_bert, transfer_scope_cache) { // Since paddle::framework::global_transfer_scope_cache() and // paddle::framework::global_transfer_data_cache() are thread_local, // their pointer should be different among different thread id. 
- PADDLE_ENFORCE(global_transfer_scope_cache.size(), threads_num); - PADDLE_ENFORCE(global_transfer_data_cache.size(), threads_num); + PADDLE_ENFORCE_EQ( + global_transfer_scope_cache.size(), threads_num, + paddle::platform::errors::Fatal( + "The size of scope cache is not equal to thread number.")); + PADDLE_ENFORCE_EQ( + global_transfer_data_cache.size(), threads_num, + paddle::platform::errors::Fatal( + "The size of data cache is not equal to thread number.")); } } // namespace inference diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_pd_tensor_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_pd_tensor_tester.cc index 0bc67aff7af1be9f34ffa2bb71c25d2964a62521..a9c24c4503f9f1b803c0d9fcde21199ef4089c41 100644 --- a/paddle/fluid/inference/tests/api/analyzer_capi_pd_tensor_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_capi_pd_tensor_tester.cc @@ -69,11 +69,13 @@ void PD_run() { PD_DeletePaddleTensor(input); int size; const int* out_shape = PD_GetPaddleTensorShape(out_data, &size); - CHECK(size == 2) << "The Output shape's size is NOT match."; + PADDLE_ENFORCE_EQ(size, 2, paddle::platform::errors::InvalidArgument( + "The Output shape's size is NOT match.")); std::vector ref_outshape_size({9, 6}); for (int i = 0; i < 2; ++i) { - CHECK(out_shape[i] == ref_outshape_size[i]) - << "The Output's shape is NOT match."; + PADDLE_ENFORCE_EQ(out_shape[i], ref_outshape_size[i], + paddle::platform::errors::InvalidArgument( + "The Output shape's size is NOT match.")); } PD_DeletePaddleBuf(buf); } diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_tester.cc index 93fcb43447d01dcafa10d8c85234d243d5095d4e..fd20581123c10f8c569e7765c7a0bf17ddd1d0b9 100644 --- a/paddle/fluid/inference/tests/api/analyzer_capi_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_capi_tester.cc @@ -36,15 +36,15 @@ void zero_copy_run() { PD_SwitchIrDebug(config, true); PD_SetModel(config, 
prog_file.c_str(), params_file.c_str()); bool use_feed_fetch = PD_UseFeedFetchOpsEnabled(config); - CHECK(!use_feed_fetch) << "NO"; + EXPECT_FALSE(use_feed_fetch); bool specify_input_names = PD_SpecifyInputName(config); - CHECK(specify_input_names) << "NO"; + EXPECT_TRUE(specify_input_names); const int batch_size = 1; const int channels = 3; const int height = 318; const int width = 318; - float input[batch_size * channels * height * width] = {0}; + float *input = new float[batch_size * channels * height * width](); int shape[4] = {batch_size, channels, height, width}; int shape_size = 4; @@ -65,6 +65,7 @@ void zero_copy_run() { PD_PredictorZeroCopyRun(config, inputs, in_size, &outputs, &out_size); + delete[] input; delete[] inputs; delete[] outputs; } @@ -84,10 +85,13 @@ TEST(PD_AnalysisConfig, profile_mkldnn) { PD_SwitchIrDebug(config, true); PD_EnableMKLDNN(config); bool mkldnn_enable = PD_MkldnnEnabled(config); - CHECK(mkldnn_enable) << "NO"; + EXPECT_TRUE(mkldnn_enable); PD_EnableMkldnnQuantizer(config); bool quantizer_enable = PD_MkldnnQuantizerEnabled(config); - CHECK(quantizer_enable) << "NO"; + EXPECT_TRUE(quantizer_enable); + PD_EnableMkldnnBfloat16(config); + bool bfloat16_enable = PD_MkldnnBfloat16Enabled(config); + EXPECT_TRUE(bfloat16_enable); PD_SetMkldnnCacheCapacity(config, 0); PD_SetModel(config, prog_file.c_str(), params_file.c_str()); PD_DeleteAnalysisConfig(config); diff --git a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc index 00a475b6047e8215264c664dd3c775b9687eb0ff..d61c28c30d203acf4dd48e1461a881d61f8ec263 100644 --- a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc @@ -126,7 +126,9 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, std::string turn_mask_pre = "turn_mask_"; auto one_batch = data->NextBatch(); - PADDLE_ENFORCE(!one_batch.response.empty()); + PADDLE_ENFORCE( + 
!one_batch.response.empty(), + paddle::platform::errors::Fatal("The response of one batch is empty.")); int size = one_batch.response[0].size(); CHECK_EQ(size, kMaxTurnLen); // turn tensor assignment @@ -214,11 +216,17 @@ void profile(bool use_mkldnn = false) { input_slots_all, &outputs, FLAGS_num_threads); if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { - PADDLE_ENFORCE_GT(outputs.size(), 0); + PADDLE_ENFORCE_GT(outputs.size(), 0, + paddle::platform::errors::Fatal( + "The size of outputs should be greater than 0.")); auto output = outputs.back(); - PADDLE_ENFORCE_GT(output.size(), 0); + PADDLE_ENFORCE_GT(output.size(), 0, + paddle::platform::errors::Fatal( + "The size of output should be greater than 0.")); size_t size = GetSize(output[0]); - PADDLE_ENFORCE_GT(size, 0); + PADDLE_ENFORCE_GT(size, 0, + paddle::platform::errors::Fatal( + "The size of output should be greater than 0.")); float *result = static_cast(output[0].data.data()); for (size_t i = 0; i < size; i++) { EXPECT_NEAR(result[i], result_data[i], 1e-3); diff --git a/paddle/fluid/inference/tests/api/analyzer_image_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_image_classification_tester.cc index 1faffacebcfdb173b96815a6ad223f06ea69c07f..c6a898dc2f315a67e3693abd73f481b08cac414a 100644 --- a/paddle/fluid/inference/tests/api/analyzer_image_classification_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_image_classification_tester.cc @@ -112,7 +112,11 @@ TEST(Analyzer_resnet50, compare_determine) { TEST(Analyzer_resnet50, save_optim_model) { AnalysisConfig cfg; std::string optimModelPath = FLAGS_infer_model + "/saved_optim_model"; +#ifdef _WIN32 + _mkdir(optimModelPath.c_str()); +#else mkdir(optimModelPath.c_str(), 0777); +#endif SetConfig(&cfg); SaveOptimModel(&cfg, optimModelPath); } diff --git a/paddle/fluid/inference/tests/api/analyzer_int8_object_detection_tester.cc b/paddle/fluid/inference/tests/api/analyzer_int8_object_detection_tester.cc index 
7f06a3b9023ba3e907c9731d576f014a3e451113..91a3233b9851f1def7717d04c4c9df5275a805ee 100644 --- a/paddle/fluid/inference/tests/api/analyzer_int8_object_detection_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_int8_object_detection_tester.cc @@ -146,8 +146,9 @@ std::shared_ptr> GetWarmupData( auto iterations = test_data.size(); PADDLE_ENFORCE_LE( static_cast(num_images), iterations * test_data_batch_size, - "The requested quantization warmup data size " + - std::to_string(num_images) + " is bigger than all test data size."); + paddle::platform::errors::Fatal( + "The requested quantization warmup data size " + + std::to_string(num_images) + " is bigger than all test data size.")); PaddleTensor images; images.name = "image"; @@ -237,8 +238,9 @@ std::shared_ptr> GetWarmupData( } PADDLE_ENFORCE_EQ( static_cast(num_objects), static_cast(objects_accum), - "The requested num of objects " + std::to_string(num_objects) + - " is the same as objects_accum."); + paddle::platform::errors::Fatal("The requested num of objects " + + std::to_string(num_objects) + + " is the same as objects_accum.")); auto warmup_data = std::make_shared>(4); (*warmup_data)[0] = std::move(images); diff --git a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc index 142905dcd8d9964d93d0c5f7444823eef2b84900..bd3a1d737afb1ba230015fbd602c493f33952ffb 100644 --- a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc @@ -98,7 +98,9 @@ void GetOneBatch(std::vector *input_slots, DataRecord *data, input_tensor.name = "word"; input_tensor.dtype = PaddleDType::INT64; TensorAssignData(&input_tensor, {one_batch.data}, one_batch.lod); - PADDLE_ENFORCE_EQ(batch_size, static_cast(one_batch.lod.size() - 1)); + PADDLE_ENFORCE_EQ( + batch_size, static_cast(one_batch.lod.size() - 1), + paddle::platform::errors::Fatal("The lod size of one batch is invaild.")); 
input_slots->assign({input_tensor}); } @@ -137,12 +139,17 @@ TEST(Analyzer_LAC, profile) { 24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25, 25, 25, 25, 25, 44, 24, 25, 25, 25, 36, 42, 43, 44, 14, 15, 44, 14, 15, 44, 14, 15, 44, 38, 39, 14, 15, 44, 22, 23, 23, 23, 23, 23, 23, 23}; - PADDLE_ENFORCE_GT(outputs.size(), 0); + PADDLE_ENFORCE_GT(outputs.size(), 0, + paddle::platform::errors::Fatal( + "The size of output should be greater than 0.")); auto output = outputs.back(); - PADDLE_ENFORCE_EQ(output.size(), 1UL); + PADDLE_ENFORCE_EQ(output.size(), 1UL, + paddle::platform::errors::Fatal( + "The size of output should be equal to 1.")); size_t size = GetSize(output[0]); size_t batch1_size = sizeof(lac_ref_data) / sizeof(int64_t); - PADDLE_ENFORCE_GE(size, batch1_size); + PADDLE_ENFORCE_GE(size, batch1_size, paddle::platform::errors::Fatal( + "The size of batch is invaild.")); int64_t *pdata = static_cast(output[0].data.data()); for (size_t i = 0; i < batch1_size; ++i) { EXPECT_EQ(pdata[i], lac_ref_data[i]); diff --git a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc index 36e07d5f55600dc7aa96227289f707fb19f92d56..50a68361d536f5aab3ed2a6bafe60f2438a9c129 100644 --- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc @@ -117,13 +117,19 @@ void profile(bool memory_load = false) { // the first inference result const int chinese_ner_result_data[] = {30, 45, 41, 48, 17, 26, 48, 39, 38, 16, 25}; - PADDLE_ENFORCE_GT(outputs.size(), 0); + PADDLE_ENFORCE_GT(outputs.size(), 0, + paddle::platform::errors::Fatal( + "The size of output should be greater than 0.")); auto output = outputs.back(); - PADDLE_ENFORCE_EQ(output.size(), 1UL); + PADDLE_ENFORCE_EQ(output.size(), 1UL, + paddle::platform::errors::Fatal( + "The size of output should be equal to 1.")); size_t size = GetSize(output[0]); - PADDLE_ENFORCE_GT(size, 0); + PADDLE_ENFORCE_GT(size, 0, + 
paddle::platform::errors::Fatal( + "The size of output should be greater than 0.")); int64_t *result = static_cast(output[0].data.data()); - for (size_t i = 0; i < std::min(11UL, size); i++) { + for (size_t i = 0; i < std::min(11, size); i++) { EXPECT_EQ(result[i], chinese_ner_result_data[i]); } } diff --git a/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc b/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc index 06a8e01b10c6eb70fe2cbac19725d96281863c29..bb1f0e8cd6334bab83973fb7d314f7017edd9e90 100644 --- a/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc @@ -136,11 +136,17 @@ TEST(Analyzer_Pyramid_DNN, profile) { input_slots_all, &outputs, FLAGS_num_threads); if (FLAGS_num_threads == 1 && !FLAGS_test_all_data && !FLAGS_zero_copy) { - PADDLE_ENFORCE_GT(outputs.size(), 0); + PADDLE_ENFORCE_GT(outputs.size(), 0, + paddle::platform::errors::Fatal( + "The size of output should be greater than 0.")); auto output = outputs.back(); - PADDLE_ENFORCE_EQ(output.size(), 1UL); + PADDLE_ENFORCE_EQ(output.size(), 1UL, + paddle::platform::errors::Fatal( + "The size of output should be equal to 1.")); size_t size = GetSize(output[0]); - PADDLE_ENFORCE_GT(size, 0); + PADDLE_ENFORCE_GT(size, 0, + paddle::platform::errors::Fatal( + "The size of output should be greater than 0.")); float *result = static_cast(output[0].data.data()); // output is probability, which is in (0, 1). 
for (size_t i = 0; i < size; i++) { diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc index 9ccbf58cbd2bbaab9b1a132c27e50356e1a5df37..34a0a5f398d7fee0f8e44f0ad59ff9711263b575 100644 --- a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc @@ -135,11 +135,17 @@ TEST(Analyzer_rnn2, profile) { if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { // the first inference result - PADDLE_ENFORCE_GT(outputs.size(), 0); + PADDLE_ENFORCE_GT(outputs.size(), 0, + paddle::platform::errors::Fatal( + "The size of output should be greater than 0.")); auto output = outputs.back(); - PADDLE_ENFORCE_GT(output.size(), 0); + PADDLE_ENFORCE_GT(output.size(), 0, + paddle::platform::errors::Fatal( + "The size of output should be greater than 0.")); size_t size = GetSize(output[0]); - PADDLE_ENFORCE_GT(size, 0); + PADDLE_ENFORCE_GT(size, 0, + paddle::platform::errors::Fatal( + "The size of output should be greater than 0.")); float *result = static_cast(output[0].data.data()); for (size_t i = 0; i < size; i++) { EXPECT_NEAR(result[i], result_data[i], 1e-3); diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc index e3f8b835f78371170aaf107e1b2d1ca41b300e56..978aaf1c6a32d5b4ec8f2d06b8873af892705da5 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc @@ -47,7 +47,8 @@ struct DataRecord { num_lines++; std::vector data; split(line, '\t', &data); - PADDLE_ENFORCE(data.size() >= 4); + PADDLE_ENFORCE_GT(data.size(), 4, paddle::platform::errors::Fatal( + "The size of data is invaild.")); // load title1 data std::vector title1_data; split_to_int64(data[0], ' ', &title1_data); @@ -120,11 +121,17 @@ TEST(Analyzer_seq_conv1, profile) { if (FLAGS_num_threads == 1 && 
!FLAGS_test_all_data) { // the first inference result - PADDLE_ENFORCE_GT(outputs.size(), 0); + PADDLE_ENFORCE_GT(outputs.size(), 0, + paddle::platform::errors::Fatal( + "The size of output should be greater than 0.")); auto output = outputs.back(); - PADDLE_ENFORCE_EQ(output.size(), 1UL); + PADDLE_ENFORCE_EQ(output.size(), 1UL, + paddle::platform::errors::Fatal( + "The size of output should be equal to 0.")); size_t size = GetSize(output[0]); - PADDLE_ENFORCE_GT(size, 0); + PADDLE_ENFORCE_GT(size, 0, + paddle::platform::errors::Fatal( + "The size of output should be greater than 0.")); float *result = static_cast(output[0].data.data()); // output is probability, which is in (0, 1). for (size_t i = 0; i < size; i++) { diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc index 56f706ae56bda8b06eba5dd9e080552aa9785c6e..9f1556cdb871aa3e5bbe613aa98299c162661c42 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc @@ -56,20 +56,26 @@ struct DataRecord { std::vector slot_data; split_to_float(data[1], ' ', &slot_data); std::string name = data[0]; - PADDLE_ENFORCE_EQ(slot_data.size() % 11, 0UL, - "line %d, %s should be divisible", num_lines, name); + PADDLE_ENFORCE_EQ( + slot_data.size() % 11, 0UL, + paddle::platform::errors::Fatal("line %d, %s should be divisible", + num_lines, name)); datasets[name].emplace_back(std::move(slot_data)); } num_samples = num_lines / num_slots; - PADDLE_ENFORCE_EQ(num_samples * num_slots, static_cast(num_lines), - "num samples should be divisible"); - PADDLE_ENFORCE_GT(num_samples, 0UL); + PADDLE_ENFORCE_EQ( + num_samples * num_slots, static_cast(num_lines), + paddle::platform::errors::Fatal("num samples should be divisible")); + PADDLE_ENFORCE_GT(num_samples, 0UL, + paddle::platform::errors::Fatal( + "The num of samples should be greater than 0.")); } void Prepare(int 
bs) { for (auto it = datasets.begin(); it != datasets.end(); ++it) { - PADDLE_ENFORCE_EQ(it->second.size(), num_samples, - "size of each slot should be equal"); + PADDLE_ENFORCE_EQ( + it->second.size(), num_samples, + paddle::platform::errors::Fatal("size of each slot should be equal")); } size_t num_batches = num_samples / bs; EXPECT_GT(num_batches, 0UL); @@ -90,8 +96,10 @@ struct DataRecord { std::copy(datas[id].begin(), datas[id].end(), std::back_inserter(slot.data[k])); size_t len = datas[id].size() / 11; - PADDLE_ENFORCE_EQ(len * 11, datas[id].size(), - "%s %d size should be divisible", slot.name, id); + PADDLE_ENFORCE_EQ( + len * 11, datas[id].size(), + paddle::platform::errors::Fatal("%s %d size should be divisible", + slot.name, id)); lod[k + 1] = lod[k] + len; } slot.shape.assign({static_cast(lod[bs]), 11}); diff --git a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc index 78e500b2ed530d5a1dce8a7927538fdd0bbb6907..ae38bcbc20a9f44eb6ef5c313b318dec38a30550 100644 --- a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc @@ -22,7 +22,9 @@ struct DataReader { : file(new std::ifstream(path)) {} bool NextBatch(std::vector *input, int batch_size) { - PADDLE_ENFORCE_EQ(batch_size, 1); + PADDLE_ENFORCE_EQ(batch_size, 1, + paddle::platform::errors::Fatal( + "The size of batch should be equal to 1.")); std::string line; PaddleTensor tensor; tensor.dtype = PaddleDType::INT64; @@ -81,7 +83,9 @@ TEST(Analyzer_Text_Classification, profile) { if (FLAGS_num_threads == 1) { // Get output - PADDLE_ENFORCE_GT(outputs.size(), 0); + PADDLE_ENFORCE_GT(outputs.size(), 0, + paddle::platform::errors::Fatal( + "The size of output should be greater than 0.")); LOG(INFO) << "get outputs " << outputs.back().size(); for (auto &output : outputs.back()) { LOG(INFO) << "output.shape: " << 
to_string(output.shape); diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc index 65755b7b15ad54e38e398a82db41a0b9d8fc59e3..a2ced21a9ac9ad10c2b067a60597eee9fdff9eeb 100644 --- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc @@ -59,7 +59,9 @@ void SetConfig(AnalysisConfig *cfg) { } void SetInput(std::vector> *inputs) { - PADDLE_ENFORCE_EQ(FLAGS_test_all_data, 0, "Only have single batch of data."); + PADDLE_ENFORCE_EQ( + FLAGS_test_all_data, 0, + paddle::platform::errors::Fatal("Only have single batch of data.")); std::string line; std::ifstream file(FLAGS_infer_data); std::getline(file, line); @@ -99,7 +101,9 @@ void profile(bool use_mkldnn = false) { auto refer = ProcessALine(line); file.close(); - PADDLE_ENFORCE_GT(outputs.size(), 0); + PADDLE_ENFORCE_GT(outputs.size(), 0, + paddle::platform::errors::Fatal( + "The size of output should be greater than 0.")); auto &output = outputs.back().front(); size_t numel = output.data.length() / PaddleDtypeSize(output.dtype); CHECK_EQ(numel, refer.data.size()); diff --git a/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py b/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py index c5610961d65832b455d56c3d5dcc87d9a375f6b9..9f3a389ea344e7e827c5864dff70a1b0eec10f08 100644 --- a/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py +++ b/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py @@ -23,7 +23,7 @@ from PIL import Image import math from paddle.dataset.common import download import tarfile -import StringIO +from six.moves import StringIO import argparse random.seed(0) @@ -152,7 +152,7 @@ def convert_Imagenet_tar2bin(tar_file, output_file): idx = 0 for imagedata in dataset.values(): - img = Image.open(StringIO.StringIO(imagedata)) + img = Image.open(StringIO(imagedata)) img = process_image(img) np_img = np.array(img) 
ofs.write(np_img.astype('float32').tobytes()) diff --git a/paddle/fluid/inference/tests/api/full_pascalvoc_test_preprocess.py b/paddle/fluid/inference/tests/api/full_pascalvoc_test_preprocess.py index 8a098aa1eb4875b9cf016ea649f90c5beb511d79..84c4eb7e5e87ee36692e25c70a93cbc32082db45 100644 --- a/paddle/fluid/inference/tests/api/full_pascalvoc_test_preprocess.py +++ b/paddle/fluid/inference/tests/api/full_pascalvoc_test_preprocess.py @@ -19,7 +19,7 @@ import os import sys from paddle.dataset.common import download import tarfile -import StringIO +from six.moves import StringIO import hashlib import tarfile import argparse @@ -191,7 +191,7 @@ def convert_pascalvoc_tar2bin(tar_path, data_out_path): gt_labels[name_prefix] = tar.extractfile(tarInfo).read() for line_idx, name_prefix in enumerate(lines): - im = Image.open(StringIO.StringIO(images[name_prefix])) + im = Image.open(StringIO(images[name_prefix])) if im.mode == 'L': im = im.convert('RGB') im_width, im_height = im.size diff --git a/paddle/fluid/inference/tests/api/lite_resnet50_test.cc b/paddle/fluid/inference/tests/api/lite_resnet50_test.cc index 31701c59ec33dfced5745f7f16d8f00ffce462ef..9ae073e9e5b142254b32396e0355f59ae1826909 100644 --- a/paddle/fluid/inference/tests/api/lite_resnet50_test.cc +++ b/paddle/fluid/inference/tests/api/lite_resnet50_test.cc @@ -27,7 +27,7 @@ TEST(AnalysisPredictor, use_gpu) { AnalysisConfig config; config.EnableUseGpu(100, 0); config.SetModel(model_dir + "/model", model_dir + "/params"); - config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32); + config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32, true); std::vector inputs; auto predictor = CreatePaddlePredictor(config); diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc index 524e08891f4e90d8a322822e26d75689526d30f5..d49f83b9d38a3d16099c4bc698c47f18a4280da0 100644 --- 
a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc +++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc @@ -12,15 +12,33 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include #include #include #include +#include #include "paddle/fluid/inference/tests/api/trt_test_helper.h" namespace paddle { namespace inference { +int DeleteCache(std::string path) { + DIR* dir = opendir(path.c_str()); + if (dir == NULL) return 0; + struct dirent* ptr; + while ((ptr = readdir(dir)) != NULL) { + if (std::strcmp(ptr->d_name, ".") == 0 || + std::strcmp(ptr->d_name, "..") == 0) { + continue; + } else if (ptr->d_type == 8) { + std::string file_rm = path + "/" + ptr->d_name; + return remove(file_rm.c_str()); + } + } + return 0; +} + void run(const AnalysisConfig& config, std::vector* out_data) { auto predictor = CreatePaddlePredictor(config); auto input_names = predictor->GetInputNames(); @@ -86,6 +104,11 @@ void run(const AnalysisConfig& config, std::vector* out_data) { void trt_ernie(bool with_fp16, std::vector result) { AnalysisConfig config; std::string model_dir = FLAGS_infer_model; + // Delete serialization cache to perform serialization first rather than + // deserialization. 
+ std::string opt_cache_dir = FLAGS_infer_model + "/_opt_cache"; + DeleteCache(opt_cache_dir); + SetConfig(&config, model_dir, true /* use_gpu */); config.SwitchUseFeedFetchOps(false); @@ -128,7 +151,7 @@ void trt_ernie(bool with_fp16, std::vector result) { run(config, &out_data); // serialize run(*config_deser, &out_data); // deserialize for (size_t i = 0; i < out_data.size(); i++) { - EXPECT_NEAR(result[i], out_data[i], 1e-6); + EXPECT_NEAR(result[i], out_data[i], 1e-2); } } @@ -136,13 +159,11 @@ TEST(AnalysisPredictor, no_fp16) { std::vector result = {0.597841, 0.219972, 0.182187}; trt_ernie(false, result); } - -TEST(AnalysisPredictor, fp16) { #ifdef SUPPORTS_CUDA_FP16 - std::vector result = {0.598336, 0.219558, 0.182106}; +TEST(AnalysisPredictor, fp16) { + std::vector result = {0.59923654, 0.21923761, 0.18152587}; trt_ernie(true, result); -#endif } - +#endif // SUPPORTS_CUDA_FP16 } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tests/test.cmake b/paddle/fluid/inference/tests/test.cmake index b9f979f96d4b106642795151fb8e34b025b2caef..9bde2a99db1b75a454b005eec2d237294c7aa815 100644 --- a/paddle/fluid/inference/tests/test.cmake +++ b/paddle/fluid/inference/tests/test.cmake @@ -25,7 +25,8 @@ endfunction() function(inference_download_and_uncompress INSTALL_DIR URL FILENAME) message(STATUS "Download inference test stuff from ${URL}/${FILENAME}") - string(REGEX REPLACE "[-%.]" "_" FILENAME_EX ${FILENAME}) + string(REGEX REPLACE "[-%./\\]" "_" FILENAME_EX ${FILENAME}) + string(REGEX MATCH "[^/\\]+$" DOWNLOAD_NAME ${FILENAME}) set(EXTERNAL_PROJECT_NAME "extern_inference_download_${FILENAME_EX}") set(UNPACK_DIR "${INSTALL_DIR}/src/${EXTERNAL_PROJECT_NAME}") ExternalProject_Add( @@ -38,14 +39,14 @@ function(inference_download_and_uncompress INSTALL_DIR URL FILENAME) DOWNLOAD_NO_PROGRESS 1 CONFIGURE_COMMAND "" BUILD_COMMAND ${CMAKE_COMMAND} -E chdir ${INSTALL_DIR} - ${CMAKE_COMMAND} -E tar xzf ${FILENAME} + ${CMAKE_COMMAND} -E tar xzf 
${DOWNLOAD_NAME} UPDATE_COMMAND "" INSTALL_COMMAND "" ) endfunction() set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec") -if(NOT EXISTS ${WORD2VEC_INSTALL_DIR}) +if(NOT EXISTS ${WORD2VEC_INSTALL_DIR}/word2vec.inference.model.tar.gz) inference_download_and_uncompress(${WORD2VEC_INSTALL_DIR} ${INFERENCE_URL} "word2vec.inference.model.tar.gz") endif() set(WORD2VEC_MODEL_DIR "${WORD2VEC_INSTALL_DIR}/word2vec.inference.model") diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h index 7183cbac71562bfe4092bf78270096996b74c525..1457f5337e3ed05b74d247d65e2f6b2f7f6735d3 100644 --- a/paddle/fluid/inference/tests/test_helper.h +++ b/paddle/fluid/inference/tests/test_helper.h @@ -21,6 +21,7 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/inference/io.h" +#include "paddle/fluid/platform/errors.h" #include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/profiler.h" @@ -162,7 +163,8 @@ void TestInference(const std::string& dirname, // int device_id = place.GetDeviceId(); paddle::platform::SetDeviceId(0); #else - PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); + PADDLE_THROW(paddle::platform::errors::Unavailable( + "'CUDAPlace' is not supported in CPU only device.")); #endif } diff --git a/paddle/fluid/inference/utils/singleton.h b/paddle/fluid/inference/utils/singleton.h index 990bef359499834c3a7cb025c3fb1d94ceea958e..6828924c300fdfec6640e7b19a2c06b0826aa455 100644 --- a/paddle/fluid/inference/utils/singleton.h +++ b/paddle/fluid/inference/utils/singleton.h @@ -46,7 +46,9 @@ struct Registry { template void Register(const std::string& name) { - PADDLE_ENFORCE_EQ(items_.count(name), 0); + PADDLE_ENFORCE_EQ(items_.count(name), 0, + platform::errors::AlreadyExists( + "Item `%s` has beed registered.", name)); items_[name] = new ItemChild; } diff --git a/paddle/fluid/memory/allocation/best_fit_allocator_test.cu 
b/paddle/fluid/memory/allocation/best_fit_allocator_test.cu index eb24ba84c886e3393cf36b6f764d7b33e76defeb..59c14103ca67dbf325928a9aee73d903d7d9e9e3 100644 --- a/paddle/fluid/memory/allocation/best_fit_allocator_test.cu +++ b/paddle/fluid/memory/allocation/best_fit_allocator_test.cu @@ -16,6 +16,7 @@ #include #include // NOLINT #include + #include "gtest/gtest.h" #include "paddle/fluid/memory/allocation/best_fit_allocator.h" #include "paddle/fluid/memory/allocation/cuda_allocator.h" @@ -41,12 +42,14 @@ TEST(BestFitAllocator, concurrent_cuda) { LockedAllocator concurrent_allocator( std::unique_ptr(new BestFitAllocator(cuda_allocation.get()))); + platform::CUDAPlace gpu(0); + platform::CUDADeviceContext dev_ctx(gpu); + auto th_main = [&](std::random_device::result_type seed) { std::default_random_engine engine(seed); std::uniform_int_distribution dist(1U, 1024U); - platform::CUDAPlace gpu(0); - platform::CUDADeviceContext dev_ctx(gpu); std::array buf; + for (size_t i = 0; i < 128; ++i) { size_t allocate_size = dist(engine); diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 6e8ff52ed4a8846f5f6060e10cfd9bec22308e9e..53e6f4aa6e41bb8c02c01b4897e35c103260e167 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -45,7 +45,9 @@ endif() SET(OP_HEADER_DEPS xxhash executor) if (WITH_GPU) - SET(OP_HEADER_DEPS ${OP_HEADER_DEPS} cub) + if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) + SET(OP_HEADER_DEPS ${OP_HEADER_DEPS} cub) + endif() endif() SET(OP_PREFETCH_DEPS "") @@ -90,7 +92,7 @@ cc_library(common_infer_shape_functions SRCS common_infer_shape_functions.cc DEP set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project -sequence_pooling executor device_memory_aligment generator) +sequence_pooling segment_pooling executor device_memory_aligment generator) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} 
dynload_warpctc) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler sample_prob tree2col) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc matrix_inverse) diff --git a/paddle/fluid/operators/add_position_encoding_op.cc b/paddle/fluid/operators/add_position_encoding_op.cc index 629fedba6e3db474869ebddc02470c2ff007e658..e5fcd270eb8b8fa58175e11e955161ebfbb2846c 100644 --- a/paddle/fluid/operators/add_position_encoding_op.cc +++ b/paddle/fluid/operators/add_position_encoding_op.cc @@ -69,12 +69,18 @@ class AddPositionEncodingOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("alpha", "The scale of Original Embedding.") .SetDefault(1.0f) .AddCustomChecker([](const float& alpha) { - PADDLE_ENFORCE(alpha >= 0.0f, "'alpha' must be above 0.0."); + PADDLE_ENFORCE_GE( + alpha, 0.0f, + platform::errors::InvalidArgument( + "Attribute 'alpha' must be greater than or equal to 0.0.")); }); AddAttr("beta", "The scale of Position Embedding.") .SetDefault(1.0f) .AddCustomChecker([](const float& beta) { - PADDLE_ENFORCE(beta >= 0.0f, "'beta' must be between 0.0."); + PADDLE_ENFORCE_GE( + beta, 0.0f, + platform::errors::InvalidArgument( + "Attribute 'beta' must be greater than or equal to 0.0.")); }); AddComment(R"DOC( Add Position Encoding Operator. 
diff --git a/paddle/fluid/operators/affine_grid_op.cu b/paddle/fluid/operators/affine_grid_op.cu index 7aaaa0002c5ab31af72c75e69f5a283c09633ba4..58b56bdcf5614ed9183ce3bf11c1767f92650d20 100644 --- a/paddle/fluid/operators/affine_grid_op.cu +++ b/paddle/fluid/operators/affine_grid_op.cu @@ -62,11 +62,11 @@ __global__ void affine_grid_kernel(const int count, int n, int out_h, int out_w, int theta_offset = n * 6; // 2 * 3; // affine from (h_coor, w_coor) to (x, y) - output[index * 2] = theta[theta_offset] * h_coor + - theta[theta_offset + 1] * w_coor + + output[index * 2] = theta[theta_offset] * w_coor + + theta[theta_offset + 1] * h_coor + theta[theta_offset + 2]; - output[index * 2 + 1] = theta[theta_offset + 3] * h_coor + - theta[theta_offset + 4] * w_coor + + output[index * 2 + 1] = theta[theta_offset + 3] * w_coor + + theta[theta_offset + 4] * h_coor + theta[theta_offset + 5]; } } @@ -86,13 +86,13 @@ __global__ void affine_grid_grad_kernel(const int count, int n, int out_h, int theta_offset = n * 6; // 2 * 3; T out_grad_x = out_grad[index * 2]; - platform::CudaAtomicAdd(theta_grad + theta_offset, out_grad_x * h_coor); - platform::CudaAtomicAdd(theta_grad + theta_offset + 1, out_grad_x * w_coor); + platform::CudaAtomicAdd(theta_grad + theta_offset, out_grad_x * w_coor); + platform::CudaAtomicAdd(theta_grad + theta_offset + 1, out_grad_x * h_coor); platform::CudaAtomicAdd(theta_grad + theta_offset + 2, out_grad_x); T out_grad_y = out_grad[index * 2 + 1]; - platform::CudaAtomicAdd(theta_grad + theta_offset + 3, out_grad_y * h_coor); - platform::CudaAtomicAdd(theta_grad + theta_offset + 4, out_grad_y * w_coor); + platform::CudaAtomicAdd(theta_grad + theta_offset + 3, out_grad_y * w_coor); + platform::CudaAtomicAdd(theta_grad + theta_offset + 4, out_grad_y * h_coor); platform::CudaAtomicAdd(theta_grad + theta_offset + 5, out_grad_y); } } diff --git a/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.cc 
b/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.cc deleted file mode 100644 index 7f0ca1493f712f7f4809a56bf6a23f8757f94c2d..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.cc +++ /dev/null @@ -1,104 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/amp/amp_check_finite_and_scale_op.h" - -#include -#include - -namespace paddle { -namespace operators { - -class AmpCheckFiniteAndScaleOp : public framework::OperatorWithKernel { - public: - AmpCheckFiniteAndScaleOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : OperatorWithKernel(type, inputs, outputs, attrs) {} - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", - "amp_check_finite_and_unscale"); - OP_INOUT_CHECK(ctx->HasOutputs("Out"), "Output", "Out", - "amp_check_finite_and_unscale"); - PADDLE_ENFORCE_EQ( - ctx->Inputs("X").size(), ctx->Outputs("Out").size(), - platform::errors::InvalidArgument( - "The input(X) and output(Out) should have same size in " - "Operator(amp_check_finite_and_unscale), size of input(X) is %d " - "and size of output(Out) is %d.", - ctx->Inputs("X").size(), ctx->Outputs("Out").size())); - auto x_dims = ctx->GetInputsDim("X"); - ctx->SetOutputsDim("Out", x_dims); 
- ctx->SetOutputDim("FoundInfinite", {1}); - } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType( - OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); - } -}; - -class AmpCheckFiniteAndScaleOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput( - "X", - "(Tensors) The input tensors of amp_check_finite_and_scale operator.") - .AsDuplicable(); - AddInput("Scale", - "(Tensor) 1-dim tensor, the scale of amp_check_finite_and_scale " - "operator."); - AddOutput("Out", - "(Tensors) The scaled output tensor of " - "amp_check_finite_and_unscale operator.") - .AsDuplicable(); - AddOutput("FoundInfinite", - "(Tensor) 1-dim tensor, contains a bool scalar, which indicates " - "if there there is infinite or nan item in input X."); - AddComment(R"DOC( -amp_check_finite_and_scale operator. -Check if input X contains all finite data, if yes, scale it by input Scale. - -$$Out = X * scale$$ - -If any tensor in X contains Inf or Nan, the Out will generate a indicator. -FoundInfinite will be 1 (True), and Out will not be scaled. In this case, the data of -Out should not be used, and its data may not be deterministic. -Otherwise, FoundInfinite will be 0 (False). 
- -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OPERATOR( - amp_check_finite_and_scale, ops::AmpCheckFiniteAndScaleOp, - ops::AmpCheckFiniteAndScaleOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); - -REGISTER_OP_CPU_KERNEL( - amp_check_finite_and_scale, - ops::AmpCheckFiniteAndScaleKernel, - ops::AmpCheckFiniteAndScaleKernel); diff --git a/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.h b/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.h deleted file mode 100644 index 6c2c4eb8a615c4c04a98601c25b5de43b4262e6b..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.h +++ /dev/null @@ -1,66 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/fluid/operators/isfinite_op.h" - -namespace paddle { -namespace operators { - -template -class AmpCheckFiniteAndScaleKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto& dev_ctx = ctx.template device_context(); - const auto xs = ctx.MultiInput("X"); - const auto* scale = ctx.Input("Scale"); - auto outs = ctx.MultiOutput("Out"); - auto* found_inf = ctx.Output("FoundInfinite"); - - const T* scale_data = scale->data(); - bool* found_inf_data = found_inf->mutable_data(dev_ctx.GetPlace()); - - *found_inf_data = false; - framework::Tensor is_finite = - ctx.AllocateTmpTensor({1}, dev_ctx); - bool* is_finite_data = is_finite.template data(); - - auto& dev = *ctx.template device_context().eigen_device(); - for (size_t i = 0; i < xs.size(); ++i) { - const auto* x = xs[i]; - auto* out = outs[i]; - out->mutable_data(dev_ctx.GetPlace()); - if (!(*found_inf_data)) { - framework::TensorIsfinite(*x, &is_finite); - if (*is_finite_data) { - auto eigen_out = framework::EigenVector::Flatten(*out); - auto eigen_in = framework::EigenVector::Flatten(*x); - eigen_out.device(dev) = (*scale_data) * eigen_in; - } else { - *found_inf_data = true; - break; - } - } - } - return; - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..51c659d5db1c33d5e2db261b998a0673f5e766cb --- /dev/null +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc @@ -0,0 +1,141 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/amp/check_finite_and_unscale_op.h" +#include "paddle/fluid/framework/tensor_util.h" + +namespace paddle { +namespace operators { + +class CheckFiniteAndUnscaleOp : public framework::OperatorWithKernel { + public: + CheckFiniteAndUnscaleOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", + "check_finite_and_unscale"); + OP_INOUT_CHECK(ctx->HasOutputs("Out"), "Output", "Out", + "check_finite_and_unscale"); + PADDLE_ENFORCE_EQ( + ctx->Inputs("X").size(), ctx->Outputs("Out").size(), + platform::errors::InvalidArgument( + "The input(X) and output(Out) should have same size in " + "Operator(check_finite_and_unscale), size of input(X) is %d " + "and size of output(Out) is %d.", + ctx->Inputs("X").size(), ctx->Outputs("Out").size())); + auto x_dims = ctx->GetInputsDim("X"); + ctx->SetOutputsDim("Out", x_dims); + ctx->SetOutputDim("FoundInfinite", {1}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } +}; + +class CheckFiniteAndUnscaleOpMaker : public 
framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput( + "X", + "(Tensors) The input tensors of check_finite_and_unscale operator.") + .AsDuplicable(); + AddInput("Scale", + "(Tensor) 1-dim tensor, the scale of check_finite_and_unscale " + "operator."); + AddOutput("Out", + "(Tensors) The scaled output tensor of " + "check_finite_and_unscale operator.") + .AsDuplicable(); + AddOutput("FoundInfinite", + "(Tensor) 1-dim tensor, contains a bool scalar, which indicates " + "if there there is infinite or nan item in input X."); + AddComment(R"DOC( +check_finite_and_unscale operator. +Check if input X contains all finite data, if yes, scale it by input Scale. + +$$Out = X / scale$$ + +If any tensor in X contains Inf or Nan, the Out will generate a indicator. +FoundInfinite will be 1 (True), and Out will not be scaled. In this case, the data of +Out should not be used, and its data may not be deterministic. +Otherwise, FoundInfinite will be 0 (False). + +)DOC"); + } +}; + +template +class CheckFiniteAndUnscaleCpuKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto& dev_ctx = ctx.template device_context(); + const auto xs = ctx.MultiInput("X"); + const auto* scale = ctx.Input("Scale"); + auto outs = ctx.MultiOutput("Out"); + auto* found_inf = ctx.Output("FoundInfinite"); + + const T* scale_data = scale->data(); + bool* found_inf_data = found_inf->mutable_data(dev_ctx.GetPlace()); + + *found_inf_data = false; + framework::Tensor is_finite = + ctx.AllocateTmpTensor({1}, dev_ctx); + bool* is_finite_data = is_finite.template data(); + + auto& dev = *ctx.template device_context() + .eigen_device(); + + T inverse_scale = Inverse(*scale_data); + for (size_t i = 0; i < xs.size(); ++i) { + const auto* x = xs[i]; + auto* out = outs[i]; + out->mutable_data(dev_ctx.GetPlace()); + if (!(*found_inf_data)) { + framework::TensorIsfinite(*x, &is_finite); + *found_inf_data = !(*is_finite_data); + } 
+ auto eigen_out = framework::EigenVector::Flatten(*out); + auto eigen_in = framework::EigenVector::Flatten(*x); + if (!(*found_inf_data)) { + eigen_out.device(dev) = eigen_in * inverse_scale; + } else { + eigen_out.device(dev) = eigen_in * static_cast(0); + } + } + return; + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR( + check_finite_and_unscale, ops::CheckFiniteAndUnscaleOp, + ops::CheckFiniteAndUnscaleOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_CPU_KERNEL(check_finite_and_unscale, + ops::CheckFiniteAndUnscaleCpuKernel, + ops::CheckFiniteAndUnscaleCpuKernel); diff --git a/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.cu b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu similarity index 63% rename from paddle/fluid/operators/amp/amp_check_finite_and_scale_op.cu rename to paddle/fluid/operators/amp/check_finite_and_unscale_op.cu index ee00d7c5f4499867c2c706ddcf314c1bfae0a866..cf9df34a2467f8461c4c284b4848c54b76edf452 100644 --- a/paddle/fluid/operators/amp/amp_check_finite_and_scale_op.cu +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu @@ -14,28 +14,31 @@ limitations under the License. */ #include -#include "paddle/fluid/operators/amp/amp_check_finite_and_scale_op.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/operators/amp/check_finite_and_unscale_op.h" namespace paddle { namespace operators { template -__global__ void AmpCheckFiniteAndScale(const T* in, const T* scale, int num, - bool* found_inf, T* out) { +__global__ void GpuInverse(const T* s, T* o) { + *o = Inverse(*s); +} + +template +__global__ void CheckFiniteAndUnscale(const T* in, const T* scale, int num, + bool* found_inf, T* out) { const int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < num) { if (!isfinite(in[idx])) { - *found_inf = 1; + *found_inf = true; } - out[idx] = *found_inf ? 
in[idx] : in[idx] * scale[0]; + out[idx] = *found_inf ? in[idx] : in[idx] * (*scale); } } template -class AmpCheckFiniteAndScaleKernel - : public framework::OpKernel { +class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { auto& dev_ctx = ctx.template device_context(); @@ -48,6 +51,12 @@ class AmpCheckFiniteAndScaleKernel bool* found_inf_data = found_inf->mutable_data(dev_ctx.GetPlace()); cudaMemset(found_inf_data, false, found_inf->numel() * sizeof(bool)); + framework::Tensor inverse_scale = + ctx.AllocateTmpTensor({1}, dev_ctx); + T* inverse_scale_v = inverse_scale.template data(); + + GpuInverse<<<1, 1, 0, dev_ctx.stream()>>>(scale_data, inverse_scale_v); + for (size_t i = 0; i < xs.size(); ++i) { const auto* x = xs[i]; auto* out = outs[i]; @@ -55,11 +64,11 @@ class AmpCheckFiniteAndScaleKernel T* out_data = out->mutable_data(dev_ctx.GetPlace()); int num = x->numel(); - int block = 512; + int block = 1024; int grid = (num + block - 1) / block; VLOG(3) << "launch kernel"; - AmpCheckFiniteAndScale<<>>( - x_data, scale_data, num, found_inf_data, out_data); + CheckFiniteAndUnscale<<>>( + x_data, inverse_scale_v, num, found_inf_data, out_data); VLOG(3) << "finish kernel"; } } @@ -68,9 +77,6 @@ class AmpCheckFiniteAndScaleKernel } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - amp_check_finite_and_scale, - ops::AmpCheckFiniteAndScaleKernel, - ops::AmpCheckFiniteAndScaleKernel); +REGISTER_OP_CUDA_KERNEL(check_finite_and_unscale, + ops::CheckFiniteAndUnscaleGpuKernel, + ops::CheckFiniteAndUnscaleGpuKernel); diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op.h b/paddle/fluid/operators/amp/check_finite_and_unscale_op.h new file mode 100644 index 0000000000000000000000000000000000000000..4fb8744d0eee3c58f2948c5a466e08c2700b4332 --- /dev/null +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op.h @@ -0,0 +1,31 @@ +/* Copyright (c) 
2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/operators/isfinite_op.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { + +template +inline HOSTDEVICE T Inverse(T s) { + return 1.0 / s; +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.cc b/paddle/fluid/operators/amp/update_loss_scaling_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..fca3c531b40550952273f03f41bbc62cbff170fc --- /dev/null +++ b/paddle/fluid/operators/amp/update_loss_scaling_op.cc @@ -0,0 +1,170 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/amp/update_loss_scaling_op.h" +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class UpdateLossScalingOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "update_loss_scaling"); + OP_INOUT_CHECK(ctx->HasInput("FoundInfinite"), "Input", "FoundInfinite", + "update_loss_scaling"); + OP_INOUT_CHECK(ctx->HasInput("PrevLossScaling"), "Input", "PrevLossScaling", + "update_loss_scaling"); + OP_INOUT_CHECK(ctx->HasInput("InGoodSteps"), "Input", "InGoodSteps", + "update_loss_scaling"); + OP_INOUT_CHECK(ctx->HasInput("InBadSteps"), "Input", "InBadSteps", + "update_loss_scaling"); + OP_INOUT_CHECK(ctx->HasOutputs("Out"), "Output", "Out", + "update_loss_scaling"); + OP_INOUT_CHECK(ctx->HasOutput("LossScaling"), "Output", "LossScaling", + "update_loss_scaling"); + OP_INOUT_CHECK(ctx->HasOutput("OutGoodSteps"), "Output", "OutGoodSteps", + "update_loss_scaling"); + OP_INOUT_CHECK(ctx->HasOutput("OutBadSteps"), "Output", "OutBadSteps", + "update_loss_scaling"); + auto x_dims = ctx->GetInputsDim("X"); + ctx->SetOutputsDim("Out", x_dims); + ctx->SetOutputDim("LossScaling", {1}); + ctx->SetOutputDim("OutGoodSteps", {1}); + ctx->SetOutputDim("OutBadSteps", {1}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "PrevLossScaling"), + ctx.device_context()); + } +}; + +class UpdateLossScalingOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(Tensors) The input tensors of update_loss_scaling operator.") + .AsDuplicable(); + AddInput("FoundInfinite", + "(Tensor) 1-dim tensor, 
contains a bool scalar, which indicates " + "whether there is any infinite gradient."); + AddInput("PrevLossScaling", + "(Tensor) 1-dim tensor, previous loss scaling."); + AddInput("InGoodSteps", + "(Tensor) 1-dim tensor, accumulates good steps in which all " + "gradients are finite."); + AddInput("InBadSteps", + "(Tensor) 1-dim tensor, accumulates bad steps in which some " + "gradients are infinite."); + AddOutput("Out", + "(Tensors) The output tensor of update_loss_scaling operator.") + .AsDuplicable(); + AddOutput("LossScaling", "(Tensor) 1-dim tensor, updated loss scaling."); + AddOutput("OutGoodSteps", "(Tensor) 1-dim tensor, pdated good steps."); + AddOutput("OutBadSteps", "(Tensor) 1-dim tensor, updated bad steps."); + AddAttr("incr_every_n_steps", + "A value represents increasing loss scaling every n " + "consecutive steps with finite gradients."); + AddAttr("decr_every_n_nan_or_inf", + "A value represents decreasing loss scaling every n " + "accumulated steps with nan or inf gradients."); + AddAttr("incr_ratio", + "The multiplier to use when increasing the loss scaling.") + .AddCustomChecker([](float incr_ratio) { + PADDLE_ENFORCE_EQ(incr_ratio > 1.0f, true, + platform::errors::InvalidArgument( + "'incr_ratio' should be greater than 1, but " + "the received is %f", + incr_ratio)); + }); + AddAttr( + "decr_ratio", + "The less-than-one-multiplier to use when decreasing loss scaling.") + .AddCustomChecker([](float decr_ratio) { + PADDLE_ENFORCE_EQ(decr_ratio > 0.0f && decr_ratio < 1.0f, true, + platform::errors::InvalidArgument( + "'incr_ratio' should be between 0 and 1, but " + "the received is %f", + decr_ratio)); + }); + AddComment(R"DOC( +Update loss scaling according to overall gradients. If all gradients is +finite after incr_every_n_steps, loss scaling will increase by incr_ratio. +Otherwise, loss scaling will decrease by decr_ratio after +decr_every_n_nan_or_inf steps and each step some gradients are infinite. 
+ +)DOC"); + } +}; + +template +class UpdateLossScalingFunctor { + public: + void operator()(const platform::CPUDeviceContext& ctx, + const bool* found_inf_data, const T* pre_loss_scaling_data, + const int* good_in_data, const int* bad_in_data, + const int incr_every_n_steps, + const int decr_every_n_nan_or_inf, const float incr_ratio, + const float decr_ratio, T* updated_loss_scaling_data, + int* good_out_data, int* bad_out_data) const { + Update(found_inf_data, pre_loss_scaling_data, good_in_data, bad_in_data, + incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, + decr_ratio, updated_loss_scaling_data, good_out_data, + bad_out_data); + } +}; + +template +class LazyZeroInputs { + public: + void operator()(const platform::CPUDeviceContext& dev_ctx, + const bool* found_inf_data, + const std::vector& xs, + const std::vector& outs) const { + if (*found_inf_data) { + VLOG(1) << "-- UpdateLossScaling: Infinite values are found in grads. --"; + for (size_t i = 0; i < xs.size(); ++i) { + auto* out = outs[i]; + T* out_data = out->mutable_data(dev_ctx.GetPlace()); + int num = out->numel(); + std::memset(out_data, 0, num * sizeof(T)); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +using CPU = paddle::platform::CPUDeviceContext; + +REGISTER_OPERATOR( + update_loss_scaling, ops::UpdateLossScalingOp, + ops::UpdateLossScalingOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_CPU_KERNEL(update_loss_scaling, + ops::UpdateLossScalingKernel, + ops::UpdateLossScalingKernel); diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.cu b/paddle/fluid/operators/amp/update_loss_scaling_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..2bc60423d247447adf18eb3ef050ca9b395a2e2f --- /dev/null +++ b/paddle/fluid/operators/amp/update_loss_scaling_op.cu @@ -0,0 +1,84 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/amp/update_loss_scaling_op.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { + +template +__global__ void GpuUpdateLossScaling( + const bool* found_inf_data, const T* pre_loss_scaling_data, + const int* good_in_data, const int* bad_in_data, + const int incr_every_n_steps, const int decr_every_n_nan_or_inf, + const float incr_ratio, const float decr_ratio, + T* updated_loss_scaling_data, int* good_out_data, int* bad_out_data) { + Update(found_inf_data, pre_loss_scaling_data, good_in_data, bad_in_data, + incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, decr_ratio, + updated_loss_scaling_data, good_out_data, bad_out_data); +} + +template +class UpdateLossScalingFunctor { + public: + void operator()(const platform::CUDADeviceContext& dev_ctx, + const bool* found_inf_data, const T* pre_loss_scaling_data, + const int* good_in_data, const int* bad_in_data, + const int incr_every_n_steps, + const int decr_every_n_nan_or_inf, const float incr_ratio, + const float decr_ratio, T* updated_loss_scaling_data, + int* good_out_data, int* bad_out_data) const { + GpuUpdateLossScaling<<<1, 1, 0, dev_ctx.stream()>>>( + found_inf_data, pre_loss_scaling_data, good_in_data, bad_in_data, + incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, decr_ratio, + updated_loss_scaling_data, good_out_data, bad_out_data); + } 
+}; + +template +class LazyZeroInputs { + public: + void operator()(const platform::CUDADeviceContext& dev_ctx, + const bool* found_inf_data, + const std::vector& xs, + const std::vector& outs) const { + const auto gpu_place = + BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()); + bool has_inf{false}; + memory::Copy(platform::CPUPlace(), &has_inf, gpu_place, found_inf_data, + sizeof(bool), dev_ctx.stream()); + if (has_inf) { + VLOG(1) << "-- UpdateLossScaling: Infinite values are found in grads. --"; + for (size_t i = 0; i < xs.size(); ++i) { + auto* out = outs[i]; + T* out_data = out->mutable_data(dev_ctx.GetPlace()); + int num = out->numel(); + cudaMemset(out_data, 0, num * sizeof(T)); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +using GPU = paddle::platform::CUDADeviceContext; + +REGISTER_OP_CUDA_KERNEL(update_loss_scaling, + ops::UpdateLossScalingKernel, + ops::UpdateLossScalingKernel); diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.h b/paddle/fluid/operators/amp/update_loss_scaling_op.h new file mode 100644 index 0000000000000000000000000000000000000000..ca23b72eff0e85ab94c4d1f11e986f69b4e2d776 --- /dev/null +++ b/paddle/fluid/operators/amp/update_loss_scaling_op.h @@ -0,0 +1,123 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/errors.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +HOSTDEVICE void Update(const bool* found_inf_data, + const T* pre_loss_scaling_data, const int* good_in_data, + const int* bad_in_data, const int incr_every_n_steps, + const int decr_every_n_nan_or_inf, + const float incr_ratio, const float decr_ratio, + T* updated_loss_scaling_data, int* good_out_data, + int* bad_out_data) { + if (*found_inf_data) { + *good_out_data = 0; + *bad_out_data = *bad_in_data + 1; + if (*bad_out_data == decr_every_n_nan_or_inf) { + T new_loss_scaling = *pre_loss_scaling_data * decr_ratio; + *updated_loss_scaling_data = new_loss_scaling < static_cast(1) + ? static_cast(1) + : new_loss_scaling; + *bad_out_data = 0; + } + } else { + *bad_out_data = 0; + *good_out_data = *good_in_data + 1; + if (*good_out_data == incr_every_n_steps) { + T new_loss_scaling = *pre_loss_scaling_data * incr_ratio; + *updated_loss_scaling_data = std::isfinite(new_loss_scaling) + ? 
new_loss_scaling + : *pre_loss_scaling_data; + *good_out_data = 0; + } + } +} + +template +class UpdateLossScalingFunctor { + public: + void operator()(const DeviceContext& dev_ctx, const bool* found_inf_data, + const T* pre_loss_scaling_data, const int* good_in_data, + const int* bad_in_data, const int incr_every_n_steps, + const int decr_every_n_nan_or_inf, const float incr_ratio, + const float decr_ratio, T* updated_loss_scaling_data, + int* good_out_data, int* bad_out_data) const; +}; + +template +class LazyZeroInputs { + public: + void operator()(const DeviceContext& dev_ctx, const bool* found_inf_data, + const std::vector& xs, + const std::vector& outs) const; +}; + +template +class UpdateLossScalingKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const auto xs = ctx.MultiInput("X"); + const auto* found_inf = ctx.Input("FoundInfinite"); + const auto* pre_loss_scaling = ctx.Input("PrevLossScaling"); + const auto* good_in = ctx.Input("InGoodSteps"); + const auto* bad_in = ctx.Input("InBadSteps"); + auto outs = ctx.MultiOutput("Out"); + auto* updated_loss_scaling = ctx.Output("LossScaling"); + auto* good_out = ctx.Output("OutGoodSteps"); + auto* bad_out = ctx.Output("OutBadSteps"); + + PADDLE_ENFORCE_EQ(found_inf->numel(), 1, + platform::errors::InvalidArgument( + "FoundInfinite must has only one element.")); + + const bool* found_inf_data = found_inf->data(); + const T* pre_loss_scaling_data = pre_loss_scaling->data(); + const int* good_in_data = good_in->data(); + const int* bad_in_data = bad_in->data(); + + auto& dev_ctx = ctx.template device_context(); + T* updated_loss_scaling_data = + updated_loss_scaling->mutable_data(dev_ctx.GetPlace()); + int* good_out_data = good_out->mutable_data(dev_ctx.GetPlace()); + int* bad_out_data = bad_out->mutable_data(dev_ctx.GetPlace()); + + const int incr_every_n_steps = ctx.Attr("incr_every_n_steps"); + const int decr_every_n_nan_or_inf = + 
ctx.Attr("decr_every_n_nan_or_inf"); + const float incr_ratio = ctx.Attr("incr_ratio"); + const float decr_ratio = ctx.Attr("decr_ratio"); + UpdateLossScalingFunctor{}( + dev_ctx, found_inf_data, pre_loss_scaling_data, good_in_data, + bad_in_data, incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, + decr_ratio, updated_loss_scaling_data, good_out_data, bad_out_data); + LazyZeroInputs{}(dev_ctx, found_inf_data, xs, outs); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/arg_min_max_op_base.h b/paddle/fluid/operators/arg_min_max_op_base.h index 69365357084b660b7c2f90149fe250854ea6a014..57e1c06f73c56334fc93dee7a16d6899f5a6f12a 100644 --- a/paddle/fluid/operators/arg_min_max_op_base.h +++ b/paddle/fluid/operators/arg_min_max_op_base.h @@ -110,10 +110,12 @@ struct VisitDataArgMinMaxFunctor { CALL_ARG_MINMAX_FUNCTOR(6); break; default: - PADDLE_THROW( - "%s operator doesn't supports tensors whose ranks are greater " - "than 6.", - (EnumArgMinMaxValue == kArgMin ? "argmin" : "argmax")); + PADDLE_ENFORCE_LE( + x_dims.size(), 6, + platform::errors::InvalidArgument( + "%s operator doesn't supports tensors whose ranks are greater " + "than 6.", + (EnumArgMinMaxValue == kArgMin ? 
"argmin" : "argmax"))); break; #undef CALL_ARG_MINMAX_FUNCTOR } @@ -164,12 +166,25 @@ class ArgMinMaxOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_LT( axis, x_dims.size(), platform::errors::InvalidArgument( - "'axis'(%d) must be less than Rank(X)(%d).", axis, x_dims.size())); + "'axis'(%d) must be less than Rank(X)(%d) of Input(X).", axis, + x_dims.size())); + + const int& dtype = ctx->Attrs().Get("dtype"); + PADDLE_ENFORCE_EQ( + (dtype < 0 || dtype == 2 || dtype == 3), true, + platform::errors::InvalidArgument( + "The attribute of dtype in argmin/argmax must be [%s] or [%s], but " + "received [%s]", + paddle::framework::DataTypeToString( + framework::proto::VarType::INT32), + paddle::framework::DataTypeToString( + framework::proto::VarType::INT64), + paddle::framework::DataTypeToString( + static_cast(dtype)))); auto x_rank = x_dims.size(); if (axis < 0) axis += x_rank; if (ctx->IsRuntime()) { - const int& dtype = ctx->Attrs().Get("dtype"); if (dtype == framework::proto::VarType::INT32) { int64_t all_element_num = 0; if (flatten) { @@ -180,10 +195,11 @@ class ArgMinMaxOp : public framework::OperatorWithKernel { } PADDLE_ENFORCE_LE( all_element_num, INT_MAX, - "The element num of the argmin/argmax input at axis is " - "%d, is larger than int32 maximum value:%d, you must " - "set the dtype of argmin/argmax to 'int64'.", - all_element_num, INT_MAX); + platform::errors::InvalidArgument( + "The element num of the argmin/argmax input at axis is " + "%d, is larger than int32 maximum value:%d, you must " + "set the dtype of argmin/argmax to 'int64'.", + all_element_num, INT_MAX)); } } std::vector vec; diff --git a/paddle/fluid/operators/argsort_op.cu b/paddle/fluid/operators/argsort_op.cu index cbd7e33bc6b7238eacb29ebab1306802d974a90b..7fc2a92b7d9129b3ab0724832d2e5f72adafb0e3 100644 --- a/paddle/fluid/operators/argsort_op.cu +++ b/paddle/fluid/operators/argsort_op.cu @@ -12,7 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ +#include #include +#include #include #include "cub/cub.cuh" #include "paddle/fluid/framework/op_registry.h" @@ -58,6 +60,16 @@ static __global__ void FillIndex(T* indices, T num_rows, T num_cols) { } } +template +static __global__ void FillFlattenGrad(const T* dO, const IndType* indices, + int64_t size, T* dX) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + int stride = blockDim.x * gridDim.x; + for (int i = index; i < size; i += stride) { + dX[indices[i]] = dO[i]; + } +} + template static __global__ void FillGrad(const T* dO, const IndType* indices, T* dX, IndType num_rows, IndType num_cols) { @@ -193,6 +205,23 @@ void ArgFullAssign(const platform::CUDADeviceContext& ctx, const Tensor* dO, } template +void ArgFlattenAssign(const platform::CUDADeviceContext& ctx, const Tensor* dO, + const Tensor* indices, int64_t size, Tensor* dX) { + auto cu_stream = ctx.stream(); + + const int64_t block_size = + std::min(size, static_cast(ctx.GetMaxThreadsPerBlock())); + int64_t max_threads = ctx.GetMaxPhysicalThreadCount(); + const int64_t max_blocks = + std::max(((max_threads - 1) / block_size + 1), static_cast(1)); + const int64_t grid_size = + std::min(max_blocks, (size + block_size - 1) / block_size); + + FillFlattenGrad<<>>( + dO->data(), indices->data(), size, dX->data()); +} + +template class ArgsortOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -205,8 +234,25 @@ class ArgsortOpCUDAKernel : public framework::OpKernel { auto in_dims = input->dims(); axis = (axis < 0) ? 
(in_dims.size() + axis) : axis; - int64_t numel = input->numel(); - int64_t groups = numel / in_dims[axis]; + const T* in_data = input->data(); + auto size = input->numel(); + T* out_data = output->mutable_data(ctx.GetPlace()); + int64_t* ids_data = indices->mutable_data(ctx.GetPlace()); + + // Use thrust for parallel acceleration when the input size is equal to the + // length of the ‘axis’ dimension. + // Compared to the following 'Special case for full sort', ascending sort is + // 34 times faster and descending sort is 31 times faster. + if (size == in_dims[axis]) { + thrust::sequence(thrust::device, ids_data, ids_data + size); + thrust::copy(thrust::device, in_data, in_data + size, out_data); + thrust::sort_by_key(thrust::device, out_data, out_data + size, ids_data); + if (descending) { + thrust::reverse(thrust::device, out_data, out_data + size); + thrust::reverse(thrust::device, ids_data, ids_data + size); + } + return; + } // Special case for full sort, speedup ~190x. if (axis == -1 || axis + 1 == in_dims.size()) { @@ -276,23 +322,28 @@ class ArgsortGradOpCUDAKernel : public framework::OpKernel { int axis = ctx.Attr("axis"); dX->mutable_data(ctx.GetPlace()); - auto dxt = framework::EigenVector::Flatten(*dX); - auto& place = *ctx.template device_context() - .eigen_device(); - dxt.device(place) = dxt.constant(static_cast(0)); if (dO->numel() == 0) return; - auto in_dims = indices->dims(); + auto in_dims = dX->dims(); axis = (axis < 0) ? (in_dims.size() + axis) : axis; - int64_t numel = indices->numel(); + int64_t size = dX->numel(); + const auto& dev_ctx = ctx.cuda_device_context(); + + // Parallel acceleration when the input size is equal to the length of the + // ‘axis’ dimension. + // Compared to 'special case for full sort' below, the gradient calculation + // is 10 times faster. + if (size == in_dims[axis]) { + ArgFlattenAssign(dev_ctx, dO, indices, size, dX); + return; + } // Special case for full sort, speedup ~190x. 
if (axis == -1 || axis + 1 == in_dims.size()) { const int64_t input_height = framework::product( framework::slice_ddim(in_dims, 0, in_dims.size() - 1)); const int64_t input_width = in_dims[in_dims.size() - 1]; - const auto& dev_ctx = ctx.cuda_device_context(); ArgFullAssign(dev_ctx, dO, indices, dX, input_height, input_width); } else { @@ -316,7 +367,6 @@ class ArgsortGradOpCUDAKernel : public framework::OpKernel { Tensor trans_ind; trans_ind.mutable_data(trans_dims, ctx.GetPlace()); int ndims = trans.size(); - const auto& dev_ctx = ctx.cuda_device_context(); // Do transpose TransCompute(ndims, dev_ctx, *dO, &trans_dO, trans); @@ -345,11 +395,17 @@ class ArgsortGradOpCUDAKernel : public framework::OpKernel { } // namespace paddle REGISTER_OP_CUDA_KERNEL( - argsort, paddle::operators::ArgsortOpCUDAKernel, - paddle::operators::ArgsortOpCUDAKernel, - paddle::operators::ArgsortOpCUDAKernel, - paddle::operators::ArgsortOpCUDAKernel, - paddle::operators::ArgsortOpCUDAKernel); + argsort, + paddle::operators::ArgsortOpCUDAKernel, + paddle::operators::ArgsortOpCUDAKernel, + paddle::operators::ArgsortOpCUDAKernel, + paddle::operators::ArgsortOpCUDAKernel, + paddle::operators::ArgsortOpCUDAKernel); REGISTER_OP_CUDA_KERNEL( argsort_grad, paddle::operators::ArgsortGradOpCUDAKernel, paddle::operators::ArgsortGradOpCUDAKernel, diff --git a/paddle/fluid/operators/assign_op.h b/paddle/fluid/operators/assign_op.h index 6ce04d19fc4376e4263712e2904e480e26590553..c2154f78bbe97418f2c7388a000dc833134d0c84 100644 --- a/paddle/fluid/operators/assign_op.h +++ b/paddle/fluid/operators/assign_op.h @@ -52,7 +52,10 @@ class AssignFunctor { template void operator()(const T &v) const { - PADDLE_THROW("Not support type for assign op %s", typeid(T).name()); + PADDLE_ENFORCE_EQ( + true, false, + platform::errors::PermissionDenied( + "Not support type for assign op with type %s", typeid(T).name())); } private: diff --git a/paddle/fluid/operators/assign_value_op.h 
b/paddle/fluid/operators/assign_value_op.h index b462c43d23a534c3520a2a852252fe0333222d77..1418d96b67b75ea3a2d4b3d95d3e4bdfb17618ee 100644 --- a/paddle/fluid/operators/assign_value_op.h +++ b/paddle/fluid/operators/assign_value_op.h @@ -76,7 +76,10 @@ class AssignValueKernel : public framework::OpKernel { value_name = "int64_values"; break; default: - PADDLE_THROW("Unsupported dtype for assign_value_op: %d", dtype); + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported data type(code %d) for AssignValue operator, only " + "supports bool, int32, float32 and int64.", + dtype)); break; } CopyVecotorToTensor(value_name, out, ctx); diff --git a/paddle/fluid/operators/average_accumulates_op.h b/paddle/fluid/operators/average_accumulates_op.h index 3958d3f685470f2505abf0e8bfd269d3834970ae..338e46111fca83230aca1c7877578e557cef5a31 100644 --- a/paddle/fluid/operators/average_accumulates_op.h +++ b/paddle/fluid/operators/average_accumulates_op.h @@ -54,9 +54,13 @@ class AverageAccumulatesKernel : public framework::OpKernel { float average_window = ctx.Attr("average_window"); int64_t max_average_window = ctx.Attr("max_average_window"); int64_t min_average_window = ctx.Attr("min_average_window"); - PADDLE_ENFORCE_LE(min_average_window, max_average_window, - "min_average_window shouldn't be larger than " - "max_average_window"); + PADDLE_ENFORCE_LE( + min_average_window, max_average_window, + platform::errors::InvalidArgument( + "The min_average_window > " + "max_average_window is not right, min_average_window is %ld, " + "max_average_window is %ld.", + min_average_window, max_average_window)); // Get inputs auto* param = ctx.Input("param"); diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index c92f72e653dbe843d76ec65954d17f3264ed1cc0..dcfe8bb1bb48a505f5526f6471e8ce9ba848b5b3 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -831,6 +831,401 @@ void 
BatchNormGradMaker::Apply(GradOpPtr op) const { op->SetOutput(framework::GradVarName("Bias"), this->InputGrad("Bias")); } +template +void BatchNormDoubleGradMaker::Apply(GradOpPtr op) const { + op->SetType("batch_norm_grad_grad"); + op->SetInput("X", this->Input("X")); + op->SetInput("Scale", this->Input("Scale")); + op->SetInput("SavedMean", this->Input("SavedMean")); + op->SetInput("SavedVariance", this->Input("SavedVariance")); + if (BOOST_GET_CONST(bool, this->GetAttr("use_global_stats"))) { + op->SetInput("Variance", this->Input("Variance")); + } + op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X"))); + op->SetInput("DDScale", this->OutputGrad(framework::GradVarName("Scale"))); + op->SetInput("DDBias", this->OutputGrad(framework::GradVarName("Bias"))); + op->SetInput("DY", this->Input(framework::GradVarName("Y"))); + + op->SetAttrMap(this->Attrs()); + op->SetOutput("DX", this->InputGrad("X")); + op->SetOutput("DScale", this->InputGrad("Scale")); + op->SetOutput("DDY", this->InputGrad(framework::GradVarName("Y"))); +} + +void BatchNormDoubleGradOp::InferShape( + framework::InferShapeContext *ctx) const { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "BatchNormDoubleGrad"); + OP_INOUT_CHECK(ctx->HasInput("Scale"), "Input", "Scale", + "BatchNormDoubleGrad"); + OP_INOUT_CHECK(ctx->HasInput("SavedMean"), "Input", "SavedMean", + "BatchNormDoubleGrad"); + OP_INOUT_CHECK(ctx->HasInput("SavedVariance"), "Input", "SavedVariance", + "BatchNormDoubleGrad"); + + const bool use_global_stats = ctx->Attrs().Get("use_global_stats"); + if (use_global_stats) { + OP_INOUT_CHECK(ctx->HasInput("Variance"), "Input", "VarianceOut", + "BatchNormDoubleGrad"); + } + + OP_INOUT_CHECK(ctx->HasInput("DDX"), "Input", "DDX", "BatchNormDoubleGrad"); + OP_INOUT_CHECK(ctx->HasInput("DY"), "Input", "DY", "BatchNormDoubleGrad"); + + // check output + OP_INOUT_CHECK(ctx->HasOutput("DX"), "Output", "DX", "BatchNormDoubleGrad"); + + const auto x_dims = ctx->GetInputDim("X"); + 
const int C = x_dims[1]; + if (ctx->HasOutput("DX")) { + ctx->SetOutputDim("DX", x_dims); + } + if (ctx->HasOutput("DScale")) { + ctx->SetOutputDim("DScale", {C}); + } + if (ctx->HasOutput("DDY")) { + ctx->ShareDim("X", "DDY"); + } +} + +framework::OpKernelType BatchNormDoubleGradOp::GetExpectedKernelType( + const framework::ExecutionContext &ctx) const { + const auto *var = ctx.InputVar("DY"); + if (var == nullptr) { + PADDLE_THROW( + platform::errors::NotFound("cannot find gradient variable of Y")); + } + const Tensor *t = nullptr; + if (var->IsType()) { + t = &var->Get(); + } else if (var->IsType()) { + t = &var->Get(); + } + if (t == nullptr) { + PADDLE_THROW( + platform::errors::InvalidArgument("gradient variable of Y is empty")); + } + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); +} + +template +class BatchNormDoubleGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const auto *X = ctx.Input("X"); + const auto *Scale = ctx.Input("Scale"); + const auto *dY = ctx.Input("DY"); + const auto *Saved_mean = ctx.Input("SavedMean"); + const auto *Saved_variance = ctx.Input("SavedVariance"); + const float epsilon = ctx.Attr("epsilon"); + const bool use_global_stats = ctx.Attr("use_global_stats"); + const bool is_test = ctx.Attr("is_test"); + + PADDLE_ENFORCE_EQ( + is_test, false, + platform::errors::InvalidArgument( + "`is_test = True` CANNOT be used in train program. 
If " + "you want to use global status in pre_train model, " + "please set `use_global_stats = True`")); + + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); + + const auto *ddX = ctx.Input("DDX"); + const auto *ddScale = ctx.Input("DDScale"); + const auto *ddBias = ctx.Input("DDBias"); + + auto *dX = ctx.Output("DX"); + auto *dScale = ctx.Output("DScale"); + auto *ddY = ctx.Output("DDY"); + dX->mutable_data(ctx.GetPlace()); + ddY->mutable_data(ctx.GetPlace()); + + auto &dev_ctx = ctx.template device_context(); + + const auto &x_dims = X->dims(); + const int C = + (data_layout == DataLayout::kNCHW ? x_dims[1] + : x_dims[x_dims.size() - 1]); + const int sample_size = X->numel() / C; + math::SetConstant set_constant; + + const T *mean_data = Saved_mean->data(); + const T *inv_var_data = Saved_variance->data(); + + Tensor inv_var_tensor; + if (use_global_stats) { + const auto *running_variance = ctx.Input("Variance"); + inv_var_tensor.Resize({C}); + + T *running_inv_var_data = inv_var_tensor.mutable_data(ctx.GetPlace()); + EigenVectorArrayMap inv_var_tmp(running_inv_var_data, C); + ConstEigenVectorArrayMap var_arr(running_variance->data(), C); + + inv_var_tmp = (var_arr + epsilon).sqrt().inverse(); + inv_var_data = running_inv_var_data; + } + + // transpose NCHW -> NHWC for easy calculate + Tensor transformed_x(X->type()); + Tensor transformed_dy(dY->type()); + Tensor transformed_ddx(ddX->type()); + + Tensor transformed_dx(dX->type()); + Tensor transformed_ddy(ddY->type()); + if (data_layout == DataLayout::kNCHW && x_dims.size() > 2) { + VLOG(3) << "Transform batchnorm output from NCHW to NHWC"; + // Input Tensor + ResizeToChannelLast(ctx, X, + &transformed_x); + TransToChannelLast(ctx, X, &transformed_x); + ResizeToChannelLast(ctx, dY, + &transformed_dy); + TransToChannelLast(ctx, dY, + &transformed_dy); + ResizeToChannelLast(ctx, ddX, + &transformed_ddx); + 
TransToChannelLast(ctx, ddX, + &transformed_ddx); + // Output Tensor + ResizeToChannelLast(ctx, dX, + &transformed_dx); + ResizeToChannelLast(ctx, ddY, + &transformed_ddy); + } else { + transformed_x.ShareDataWith(*X); + transformed_dy.ShareDataWith(*dY); + transformed_ddx.ShareDataWith(*ddX); + + transformed_dx.ShareDataWith(*dX); + transformed_ddy.ShareDataWith(*ddY); + } + + ConstEigenArrayMap x_arr(transformed_x.data(), C, sample_size); + ConstEigenVectorArrayMap mean_arr(mean_data, C); + ConstEigenVectorArrayMap inv_var_arr(inv_var_data, C); + + Tensor mean_tile; + mean_tile.Resize({C, sample_size}); + mean_tile.mutable_data(ctx.GetPlace()); + EigenArrayMap mean_tile_data(mean_tile.mutable_data(ctx.GetPlace()), + C, sample_size); + + Tensor inv_var_tile; + inv_var_tile.Resize({C, sample_size}); + inv_var_tile.mutable_data(ctx.GetPlace()); + EigenArrayMap inv_var_tile_data( + inv_var_tile.mutable_data(ctx.GetPlace()), C, sample_size); + + mean_tile_data = mean_arr.replicate(1, sample_size); + inv_var_tile_data = inv_var_arr.replicate(1, sample_size); + + Tensor Scale_data; + if (!Scale) { + Scale_data.mutable_data({C}, ctx.GetPlace()); + set_constant(dev_ctx, &Scale_data, static_cast(1)); + } + ConstEigenVectorArrayMap scale_arr( + Scale ? 
Scale->data() : Scale_data.data(), C); + + Tensor scale_tile; + scale_tile.Resize({C, sample_size}); + scale_tile.mutable_data(ctx.GetPlace()); + EigenArrayMap scale_tile_data(scale_tile.mutable_data(ctx.GetPlace()), + C, sample_size); + scale_tile_data = scale_arr.replicate(1, sample_size); + + ConstEigenArrayMap dy_arr(transformed_dy.data(), C, sample_size); + ConstEigenArrayMap ddx_arr(transformed_ddx.data(), C, sample_size); + + Tensor x_sub_mean_mul_invstd; + x_sub_mean_mul_invstd.Resize({C, sample_size}); + x_sub_mean_mul_invstd.mutable_data(ctx.GetPlace()); + EigenArrayMap x_sub_mean_mul_invstd_arr( + x_sub_mean_mul_invstd.mutable_data(ctx.GetPlace()), C, sample_size); + x_sub_mean_mul_invstd_arr = (x_arr - mean_tile_data) * inv_var_tile_data; + + if (dX) { + dX->mutable_data(ctx.GetPlace()); + EigenArrayMap dx_arr(transformed_dx.mutable_data(ctx.GetPlace()), C, + sample_size); + dx_arr.setZero(); + if (use_global_stats) { + // math: dx = (ddscale * dy) * inv_var + if (ddScale) { + ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); + Tensor ddscale_tile; + ddscale_tile.Resize({C, sample_size}); + EigenArrayMap ddscale_tile_data( + ddscale_tile.mutable_data(ctx.GetPlace()), C, sample_size); + ddscale_tile_data = ddscale_arr.replicate(1, sample_size); + + dx_arr = dy_arr * ddscale_tile_data * inv_var_tile_data; + } + } else { + // math: dx = scale * ((x - mean) * inv_var / NxHxW * (np.mean(ddx, + // axis=(n,h,w)) * + // np.sum(dy, axis=(n,h,w)) - + // np.sum(dy * ddx, axis=(n,h,w)) + 3 * np.mean(dy * (x - + // mean), + // axis=(n,h,w)) * inv_var.pow(2) * + // np.sum(ddx * (x - mean), axis=(n,h,w))) + inv_var.pow(3) / + // NxHxW * + // np.sum(ddx * (x - mean)) * + // (np.mean(dy, axis=(n,h,w)) - dy) + inv_var.pow(3) / NxHxW * + // np.sum(dy, + // axis=(n,h,w)) * (x - mean) * + // (np.mean(ddx, axis=(n,h,w)) - ddx) + ddr * (dy * inv_var - + // inv_var + // * + // np.mean(dy, axis=(n,h,w)) - + // inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean), + // 
axis=(n,h,w)))) + + if (ddX) { + dx_arr += + (x_sub_mean_mul_invstd_arr * inv_var_tile_data * + inv_var_tile_data / sample_size) + .colwise() * + (ddx_arr.rowwise().sum() * dy_arr.rowwise().sum() / sample_size - + (dy_arr * ddx_arr).rowwise().sum() + + 3. * (dy_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() * + (ddx_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() / + sample_size); + + dx_arr += (inv_var_tile_data * inv_var_tile_data).colwise() * + (ddx_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() / + sample_size * + (dy_arr.rowwise().sum() / sample_size - dy_arr); + + dx_arr += (inv_var_tile_data * inv_var_tile_data).colwise() * + (dy_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() / + sample_size * + (ddx_arr.rowwise().sum() / sample_size - ddx_arr); + + dx_arr = scale_tile_data * dx_arr; + } + if (ddScale) { + ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); + Tensor ddscale_tile; + ddscale_tile.Resize({C, sample_size}); + EigenArrayMap ddscale_tile_data( + ddscale_tile.mutable_data(ctx.GetPlace()), C, sample_size); + ddscale_tile_data = ddscale_arr.replicate(1, sample_size); + + dx_arr += (dy_arr * inv_var_tile_data - + (dy_arr.rowwise().sum().replicate(1, sample_size) / + sample_size) * + inv_var_tile_data - + x_sub_mean_mul_invstd_arr * inv_var_tile_data * + (dy_arr * x_sub_mean_mul_invstd_arr) + .rowwise() + .sum() + .replicate(1, sample_size) / + sample_size) * + ddscale_tile_data; + } + } + if (data_layout == DataLayout::kNCHW) { + VLOG(3) << "Transform batchnorm output from NHWC to NCHW"; + TransToChannelFirst( + ctx, &transformed_dx, dX); + } + } + if (dScale) { + dScale->mutable_data(ctx.GetPlace()); + EigenVectorArrayMap dscale_arr(dScale->mutable_data(ctx.GetPlace()), + C); + dscale_arr.setZero(); + if (use_global_stats) { + // math: dscale = np.sum(ddx * dy, axis=(n,h,w)) * inv_var + if (ddX) { + dscale_arr = (ddx_arr * dy_arr * inv_var_tile_data).rowwise().sum(); + } + } else { + // math: dscale = inv_var * (dy - np.mean(dy, 
axis=(n,h,w) - (x-mean) * + // inv_var.pow(2) * np.mean(dy * (x-mean), axis=(n,h,w)))) * + // ddx + if (ddX) { + Tensor first_grad; + first_grad.Resize({C, sample_size}); + EigenArrayMap first_grad_arr( + first_grad.mutable_data(ctx.GetPlace()), C, sample_size); + first_grad_arr.setZero(); + + first_grad_arr += + inv_var_tile_data * + (dy_arr - + dy_arr.rowwise().sum().replicate(1, sample_size) / sample_size - + x_sub_mean_mul_invstd_arr * + (dy_arr * x_sub_mean_mul_invstd_arr) + .rowwise() + .sum() + .replicate(1, sample_size) / + sample_size); + dscale_arr = (first_grad_arr * ddx_arr).rowwise().sum(); + } + } + } + + if (ddY) { + ddY->mutable_data(ctx.GetPlace()); + EigenArrayMap ddy_arr(transformed_ddy.mutable_data(ctx.GetPlace()), + C, sample_size); + ddy_arr.setZero(); + if (use_global_stats) { + // math: ddy = r * ddx * inv_var + if (ddX) { + ddy_arr = scale_tile_data * ddx_arr * inv_var_tile_data; + } + } else { + // math: ddy = (x - mean) * inv_var * ddscale + ddbias + + // scale * inv_var * (ddx - (x - mean) * inv_var.pow(2) * + // np.mean(ddx * (x - mean), axis=(n,h,w))) + if (ddX) { + ddy_arr += + scale_tile_data * inv_var_tile_data * + (ddx_arr - + ddx_arr.rowwise().sum().replicate(1, sample_size) / sample_size - + x_sub_mean_mul_invstd_arr * + (ddx_arr * x_sub_mean_mul_invstd_arr) + .rowwise() + .sum() + .replicate(1, sample_size) / + sample_size); + } + if (ddScale && ddBias) { + ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); + Tensor ddscale_tile; + ddscale_tile.Resize({C, sample_size}); + EigenArrayMap ddscale_tile_data( + ddscale_tile.mutable_data(ctx.GetPlace()), C, sample_size); + ddscale_tile_data = ddscale_arr.replicate(1, sample_size); + + ConstEigenVectorArrayMap ddbias_arr(ddBias->data(), C); + Tensor ddbias_tile; + ddbias_tile.Resize({C, sample_size}); + EigenArrayMap ddbias_tile_data( + ddbias_tile.mutable_data(ctx.GetPlace()), C, sample_size); + ddbias_tile_data = ddbias_arr.replicate(1, sample_size); + + ddy_arr += 
x_sub_mean_mul_invstd_arr * ddscale_tile_data; + ddy_arr += ddbias_tile_data; + } + } + if (data_layout == DataLayout::kNCHW) { + VLOG(3) << "Transform batchnorm output from NHWC to NCHW"; + TransToChannelFirst( + ctx, &transformed_ddy, ddY); + } + } + } +}; + +DECLARE_INPLACE_OP_INFERER(BatchNormDoubleGradOpInplaceInferer, {"DY", "DDY"}); + } // namespace operators } // namespace paddle @@ -839,7 +1234,11 @@ REGISTER_OPERATOR(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker, ops::BatchNormOpInferVarType, ops::BatchNormGradMaker, ops::BatchNormGradMaker); -REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp); +REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp, + ops::BatchNormDoubleGradMaker, + ops::BatchNormDoubleGradMaker); +REGISTER_OPERATOR(batch_norm_grad_grad, ops::BatchNormDoubleGradOp, + ops::BatchNormDoubleGradOpInplaceInferer); REGISTER_OP_CPU_KERNEL( batch_norm, ops::BatchNormKernel, @@ -848,3 +1247,7 @@ REGISTER_OP_CPU_KERNEL( batch_norm_grad, ops::BatchNormGradKernel, ops::BatchNormGradKernel); +REGISTER_OP_CPU_KERNEL( + batch_norm_grad_grad, + ops::BatchNormDoubleGradKernel, + ops::BatchNormDoubleGradKernel); diff --git a/paddle/fluid/operators/batch_norm_op.cu b/paddle/fluid/operators/batch_norm_op.cu index be834772679acb1717ae77e3729822dbdb609db8..2d5b395ac6807dade59d473c9fcffb925e4abe3a 100644 --- a/paddle/fluid/operators/batch_norm_op.cu +++ b/paddle/fluid/operators/batch_norm_op.cu @@ -20,6 +20,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/operators/batch_norm_op.h" #include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/norm_utils.cu.h" #include "paddle/fluid/platform/cudnn_helper.h" #include "paddle/fluid/platform/float16.h" @@ -840,6 +841,45 @@ class BatchNormGradKernel } }; +template +class BatchNormDoubleGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const auto *X = ctx.Input("X"); + const auto *Scale = ctx.Input("Scale"); + const auto *dY = ctx.Input("DY"); + const auto *Saved_mean = ctx.Input("SavedMean"); + const auto *Saved_variance = ctx.Input("SavedVariance"); + const double epsilon = static_cast(ctx.Attr("epsilon")); + const bool use_global_stats = ctx.Attr("use_global_stats"); + const bool is_test = ctx.Attr("is_test"); + + PADDLE_ENFORCE_EQ( + is_test, false, + platform::errors::InvalidArgument( + "`is_test = True` CANNOT be used in train program. 
If " + "you want to use global status in pre_train model, " + "please set `use_global_stats = True`")); + + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); + + const auto *ddX = ctx.Input("DDX"); + const auto *ddScale = ctx.Input("DDScale"); + const auto *ddBias = ctx.Input("DDBias"); + + auto *dX = ctx.Output("DX"); + auto *dScale = ctx.Output("DScale"); + auto *ddY = ctx.Output("DDY"); + + NormDoubleGradFunctor( + ctx, data_layout, X, Scale, dY, Saved_mean, Saved_variance, epsilon, + use_global_stats, ddX, ddScale, ddBias, dX, dScale, ddY); + } +}; + } // namespace operators } // namespace paddle @@ -853,3 +893,7 @@ REGISTER_OP_CUDA_KERNEL( batch_norm_grad, ops::BatchNormGradKernel, ops::BatchNormGradKernel, ops::BatchNormGradKernel); +REGISTER_OP_CUDA_KERNEL( + batch_norm_grad_grad, + ops::BatchNormDoubleGradKernel, + ops::BatchNormDoubleGradKernel); diff --git a/paddle/fluid/operators/batch_norm_op.h b/paddle/fluid/operators/batch_norm_op.h index 9f844b7c078bb7397d98dad57d9fad475283f397..1440b74290ce43a9e30d59ff5ad94e00eb13f9f1 100644 --- a/paddle/fluid/operators/batch_norm_op.h +++ b/paddle/fluid/operators/batch_norm_op.h @@ -103,6 +103,42 @@ inline void TransToChannelFirst(const framework::ExecutionContext& context, } } +template +inline void ResizeToChannelLast(const framework::ExecutionContext& context, + const Tensor* input, + Tensor* transformed_input) { + int dim = input->dims().size() - 2; + if (dim == 3) { + transformed_input->Resize(input->dims()); + + auto in_dims_vec = framework::vectorize(input->dims()); + in_dims_vec[1] = input->dims()[2]; + in_dims_vec[2] = input->dims()[3]; + in_dims_vec[3] = input->dims()[4]; + in_dims_vec[4] = input->dims()[1]; + transformed_input->Resize(framework::make_ddim(in_dims_vec)); + transformed_input->mutable_data(context.GetPlace()); + + } else if (dim == 2) { + transformed_input->Resize(input->dims()); + + auto 
in_dims_vec = framework::vectorize(input->dims()); + in_dims_vec[1] = input->dims()[2]; + in_dims_vec[2] = input->dims()[3]; + in_dims_vec[3] = input->dims()[1]; + transformed_input->Resize(framework::make_ddim(in_dims_vec)); + transformed_input->mutable_data(context.GetPlace()); + } else if (dim == 1) { + transformed_input->Resize(input->dims()); + + auto in_dims_vec = framework::vectorize(input->dims()); + in_dims_vec[1] = input->dims()[2]; + in_dims_vec[2] = input->dims()[1]; + transformed_input->Resize(framework::make_ddim(in_dims_vec)); + transformed_input->mutable_data(context.GetPlace()); + } +} + template inline void TransToChannelLast(const framework::ExecutionContext& context, const Tensor* input, Tensor* transformed_input) { @@ -154,6 +190,16 @@ class BatchNormGradOp : public framework::OperatorWithKernel { const framework::OpKernelType& expected_kernel_type) const override; }; +class BatchNormDoubleGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override; @@ -168,6 +214,15 @@ class BatchNormGradMaker : public framework::SingleGradOpMaker { void Apply(GradOpPtr op) const override; }; +template +class BatchNormDoubleGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override; +}; + class BatchNormOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput { protected: @@ -190,5 +245,11 @@ class BatchNormGradKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override; }; +template +class BatchNormDoubleGradKernel : public framework::OpKernel { + 
public: + void Compute(const framework::ExecutionContext& ctx) const override; +}; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/bernoulli_op.cu b/paddle/fluid/operators/bernoulli_op.cu index f665d2dd0e991847de2ad35bf6b18741fb3a6e26..6565f5a9a2176972e9e5085c6646097e8349f259 100644 --- a/paddle/fluid/operators/bernoulli_op.cu +++ b/paddle/fluid/operators/bernoulli_op.cu @@ -16,7 +16,6 @@ limitations under the License. */ #include #include -#include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/bernoulli_op.h" diff --git a/paddle/fluid/operators/clip_op.h b/paddle/fluid/operators/clip_op.h index 03abfe7eb703b021dac2261dcd9c87d440b04001..68f5d5460efd16a79d6e1553c2fb78da31fc704a 100644 --- a/paddle/fluid/operators/clip_op.h +++ b/paddle/fluid/operators/clip_op.h @@ -66,7 +66,7 @@ template class ClipKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto max = context.Attr("max"); + auto max = static_cast(context.Attr("max")); Tensor max_cpu; if (context.HasInput("Max")) { auto* max_t = context.Input("Max"); @@ -77,8 +77,9 @@ class ClipKernel : public framework::OpKernel { } max = max_data[0]; } + max = static_cast(max); - auto min = context.Attr("min"); + auto min = context.Attr("min"); Tensor min_cpu; if (context.HasInput("Min")) { auto* min_t = context.Input("Min"); @@ -141,7 +142,7 @@ template class ClipGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto max = context.Attr("max"); + auto max = static_cast(context.Attr("max")); Tensor max_cpu; if (context.HasInput("Max")) { auto* max_t = context.Input("Max"); @@ -152,8 +153,9 @@ class ClipGradKernel : public framework::OpKernel { } max = max_data[0]; } + max = static_cast(max); - auto min = context.Attr("min"); + 
auto min = context.Attr("min"); Tensor min_cpu; if (context.HasInput("Min")) { auto* min_t = context.Input("Min"); @@ -164,6 +166,7 @@ class ClipGradKernel : public framework::OpKernel { } min = min_data[0]; } + min = static_cast(min); auto* d_out = context.Input(framework::GradVarName("Out")); diff --git a/paddle/fluid/operators/coalesce_tensor_op.cc b/paddle/fluid/operators/coalesce_tensor_op.cc index 5b7bcde21a99f23b653cc8b822aa3e22539e9d82..d67d90c348e6f1db9fff604b3eff7b6a79141d07 100644 --- a/paddle/fluid/operators/coalesce_tensor_op.cc +++ b/paddle/fluid/operators/coalesce_tensor_op.cc @@ -33,29 +33,37 @@ class CoalesceTensorOpKernel : public framework::OpKernel { auto out_vars = context.MultiOutputVar("Output"); PADDLE_ENFORCE_GT(in_var_names.size(), static_cast(0), - "The CoalesceTensorOp has no input."); - PADDLE_ENFORCE_EQ( - in_var_names.size(), out_var_names.size(), - "The number of CoalesceTensorOp's input and output is not match."); + platform::errors::InvalidArgument( + "The CoalesceTensor operator has no input.")); + PADDLE_ENFORCE_EQ(in_var_names.size(), out_var_names.size(), + platform::errors::InvalidArgument( + "The number of CoalesceTensor operator's input and " + "output is not match, " + "input number is %u, output number is %u.", + in_var_names.size(), out_var_names.size())); // Input & Output check: only support LoDTensor for (size_t i = 0; i < in_var_names.size(); ++i) { PADDLE_ENFORCE_NOT_NULL( in_vars[i], - "The input variable %s of CoalesceTensorOp does not exist.", - in_var_names[i]); + platform::errors::NotFound("The input variable %s of CoalesceTensor " + "operator does not exist.", + in_var_names[i])); PADDLE_ENFORCE_NOT_NULL( out_vars[i], - "The output variable %s of CoalesceTensorOp does not exist.", - out_var_names[i]); - PADDLE_ENFORCE_EQ( - in_vars[i]->IsType(), true, - "The input variable %s of CoalesceTensorOp is not LoDTensor.", - in_var_names[i]); - PADDLE_ENFORCE_EQ( - out_vars[i]->IsType(), true, - "The output variable %s 
of CoalesceTensorOp is not LoDTensor.", - in_var_names[i]); + platform::errors::NotFound("The output variable %s of CoalesceTensor " + "operator does not exist.", + out_var_names[i])); + PADDLE_ENFORCE_EQ(in_vars[i]->IsType(), true, + platform::errors::InvalidArgument( + "The input variable %s of CoalesceTensor operator " + "is not LoDTensor.", + in_var_names[i])); + PADDLE_ENFORCE_EQ(out_vars[i]->IsType(), true, + platform::errors::InvalidArgument( + "The output variable %s of CoalesceTensor operator " + "is not LoDTensor.", + in_var_names[i])); } auto in_tensors = context.MultiInput("Input"); @@ -64,7 +72,10 @@ class CoalesceTensorOpKernel : public framework::OpKernel { for (size_t i = 0; i < in_var_names.size(); ++i) { PADDLE_ENFORCE_EQ( in_var_names[i], out_var_names[i], - "The input and output variable of CoalesceTensorOp is different."); + platform::errors::InvalidArgument( + "The input and output variable of CoalesceTensor operator is " + "different, %dth input is %s, %dth output is %s.", + i, in_var_names[i], i, out_var_names[i])); } } else { // Init the output as input @@ -134,16 +145,25 @@ class CoalesceTensorOpKernel : public framework::OpKernel { const std::vector &lod_tensors, const std::vector var_names, size_t *numel, const size_t &size_of_dtype, const platform::Place &place) const { - PADDLE_ENFORCE_EQ(lod_tensors.size(), var_names.size()); + PADDLE_ENFORCE_EQ( + lod_tensors.size(), var_names.size(), + platform::errors::InvalidArgument( + "The number of input tensor and variable does not match, the " + "number of input tensor is %u, the number of input variable is %u.", + lod_tensors.size(), var_names.size())); *numel = 0; std::stringstream ss; ss << "alloc_space_for_vars: "; for (size_t i = 0; i < var_names.size(); ++i) { PADDLE_ENFORCE_EQ(lod_tensors[i]->IsInitialized(), true, - "%s is not initialized.", var_names[i]); + platform::errors::InvalidArgument( + "Tensor `%s` is not initialized.", var_names[i])); auto size = lod_tensors[i]->numel(); - 
PADDLE_ENFORCE_GT(size, 0); + PADDLE_ENFORCE_GT( + size, 0, + platform::errors::InvalidArgument( + "The number of tensor `%s`'s elements is 0.", var_names[i])); ss << "input(" << var_names[i] << ") dim:(" << lod_tensors[i]->dims() << ") " << " addres:" << lod_tensors[i]->data() << ", "; diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc index 4f337c03599a548ac3d95ddd06c726be30d7c13f..7937e432d22faa3ffd93e46a39b7b1cc5500dbf8 100644 --- a/paddle/fluid/operators/concat_op.cc +++ b/paddle/fluid/operators/concat_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/concat_op.h" + #include #include #include @@ -78,7 +79,8 @@ class ConcatOp : public framework::OperatorWithKernel { } } if (flag == 0) { - PADDLE_THROW("All Inputs of Concat OP are Empty!"); + PADDLE_THROW(platform::errors::InvalidArgument( + "All Inputs of Concat OP are Empty!")); } #ifdef PADDLE_WITH_MKLDNN if (platform::CanMKLDNNBeUsed(ctx)) { diff --git a/paddle/fluid/operators/controlflow/compare_op.cc b/paddle/fluid/operators/controlflow/compare_op.cc index 60f29ba39a8ee64f9fe5d95e685cac1fb52dfd21..4940649c2a32649a068c364081071ac840b4e25a 100644 --- a/paddle/fluid/operators/controlflow/compare_op.cc +++ b/paddle/fluid/operators/controlflow/compare_op.cc @@ -111,8 +111,16 @@ class CompareOp : public framework::OperatorWithKernel { framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx); // CompareOp kernel's device type is decided by input tensor place bool force_cpu = ctx.Attr("force_cpu"); - kt.place_ = force_cpu ? 
platform::CPUPlace() - : ctx.Input("X")->place(); + if (force_cpu) { + kt.place_ = platform::CPUPlace(); + } else { + if (ctx.Input("X")->place().type() != + typeid(platform::CUDAPinnedPlace)) { + kt.place_ = ctx.Input("X")->place(); + } else { + kt.place_ = ctx.GetPlace(); + } + } return kt; } }; diff --git a/paddle/fluid/operators/conv_cudnn_op.cu b/paddle/fluid/operators/conv_cudnn_op.cu index 7f705755915924de4ca6ab4c698e46a437bb649c..00af724ac7fce64b9a210bf43a150acf20f34dce 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu +++ b/paddle/fluid/operators/conv_cudnn_op.cu @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" @@ -287,7 +288,9 @@ class CUDNNConvOpKernel : public framework::OpKernel { #endif // ------------------- cudnn conv forward --------------------- - ScalingParamType alpha = 1.0f, beta = 0.0f; + ScalingParamType alpha = 1.0f; + ScalingParamType beta = ctx.Attr("use_addto") ? 1.0f : 0.0f; + VLOG(4) << "Conv: use_addto = " << ctx.Attr("use_addto"); for (int i = 0; i < groups; i++) { workspace_handle.RunFunc( [&](void* workspace_ptr) { @@ -609,9 +612,13 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { } // ------------------- cudnn conv backward data --------------------- - ScalingParamType alpha = 1.0f, beta = 0.0f; + ScalingParamType alpha = 1.0f; + ScalingParamType beta = ctx.Attr("use_addto") ? 1.0f : 0.0f; + VLOG(4) << "Conv_grad: use_addto = " << ctx.Attr("use_addto"); + if (input_grad) { - // Because beta is zero, it is unnecessary to reset input_grad. + // When beta is 0, it is unnecessary to reset input_grad. + // When beta is 1, the output cannot be reset since addt strategy used. 
for (int i = 0; i < groups; i++) { workspace_handle.RunFunc( [&](void* cudnn_workspace_ptr) { @@ -653,6 +660,9 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { ctx, &transformed_input_grad_channel, input_grad); } } + + // filter_grad do not use inplace addto. + ScalingParamType beta_filter = 0.0f; // ------------------- cudnn conv backward filter --------------------- if (filter_grad) { // Because beta is zero, it is unnecessary to reset filter_grad. @@ -665,7 +675,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { input_data + i * group_offset_in, args2.odesc.desc(), output_grad_data + i * group_offset_out, args2.cdesc.desc(), filter_algo, cudnn_workspace_ptr, - workspace_size, &beta, args2.wdesc.desc(), + workspace_size, &beta_filter, args2.wdesc.desc(), filter_grad_data + i * group_offset_filter)); }, workspace_size); @@ -1017,7 +1027,14 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { int group_offset_out = o_c / groups * o_h * o_w * o_d; int group_offset_filter = W->numel() / groups; - ScalingParamType alpha = 1.0f, beta = 0.0f; + ScalingParamType alpha = 1.0f; + ScalingParamType beta = 0.0f; + + // NOTE(zhiqiu): inplace addto is not supportted in double grad yet. + // ScalingParamType beta = ctx.Attr("use_addto") ? 
1.0f : + // 0.0f; + // VLOG(4) << "Conv_grad_grad: use_addto = " << ctx.Attr("use_addto"); + auto wkspace_handle = dev_ctx.cudnn_workspace_handle(); if (ddO) { diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 9ed169fe3502e0c34b9f37d6520edc1a3fbfa91c..bf97b9d03c455182a8d95b6987896b9a580c84fe 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -305,6 +305,11 @@ void Conv2DOpMaker::Make() { .SetDefault(0.0f); AddAttr("fuse_beta", "(float, default 0.0) Only used in mkldnn kernel") .SetDefault(0.0f); + AddAttr( + "use_addto", + "(bool, default false) If use addto strategy or not, only used in " + "cudnn kernel") + .SetDefault(false); AddAttr("fuse_residual_connection", "(bool, default false) Only used in mkldnn kernel. Used " "whenever convolution output is as an input to residual " @@ -460,6 +465,11 @@ void Conv3DOpMaker::Make() { .SetDefault(0.0f); AddAttr("fuse_beta", "(float, default 0.0) Only used in mkldnn kernel") .SetDefault(0.0f); + AddAttr( + "use_addto", + "(bool, default false) If use addto strategy or not, only used in " + "cudnn kernel") + .SetDefault(false); AddAttr("fuse_residual_connection", "(bool, default false) Only used in mkldnn kernel. Used " "whenever convolution output is as an input to residual " diff --git a/paddle/fluid/operators/correlation_op.cc b/paddle/fluid/operators/correlation_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..a2e6ff214bfa30af2e25b7feac41d22f02ab75a7 --- /dev/null +++ b/paddle/fluid/operators/correlation_op.cc @@ -0,0 +1,181 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +inline std::vector CorrelationOutputSize(int batch, int input_height, + int input_width, int stride1, + int stride2, int kernel_size, + int pad_size, + int max_displacement) { + std::vector output_shape({batch}); + int kernel_radius = (kernel_size - 1) / 2; + int border_radius = kernel_radius + max_displacement; + int padded_input_height = input_height + 2 * pad_size; + int padded_input_width = input_width + 2 * pad_size; + int output_channel = ((max_displacement / stride2) * 2 + 1) * + ((max_displacement / stride2) * 2 + 1); + output_shape.push_back(output_channel); + int output_height = + std::ceil(static_cast(padded_input_height - 2 * border_radius) / + static_cast(stride1)); + int output_width = + std::ceil(static_cast(padded_input_width - 2 * border_radius) / + static_cast(stride1)); + output_shape.push_back(output_height); + output_shape.push_back(output_width); + return output_shape; +} + +class CorrelationOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Input1", "Input is a 4-D Tensor with shape [N, C, H, W]"); + AddInput("Input2", "Input is a 4-D Tensor with shape [N, C, H, W]"); + AddOutput("Output", + "(Tensor) The output tensor of correlation operator. 
" + "It has same data fromat and data type as the Input."); + AddAttr("pad_size", "pad size for input1 and input2"); + AddAttr("kernel_size", "kernel size of input1 and input2"); + AddAttr("max_displacement", "max displacement of input1 and input2"); + AddAttr("stride1", "Input1 stride"); + AddAttr("stride2", "Input2 stride"); + AddAttr("corr_type_multiply", "correlation coefficient").SetDefault(1); + AddComment( + R"DOC(Correlation of two feature map. Only support NCHW data format.)DOC"); + } +}; + +class CorrelationOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("Input1"), "Input", "X", "CorrelationOp"); + OP_INOUT_CHECK(ctx->HasInput("Input2"), "Input", "Y", "CorrelationOp"); + int stride1 = ctx->Attrs().Get("stride1"); + int stride2 = ctx->Attrs().Get("stride2"); + int max_displacement = ctx->Attrs().Get("max_displacement"); + int pad_size = ctx->Attrs().Get("pad_size"); + int kernel_size = ctx->Attrs().Get("kernel_size"); + + auto in_dims = ctx->GetInputDim("Input1"); + auto in2_dims = ctx->GetInputDim("Input2"); + + PADDLE_ENFORCE_EQ(in_dims.size() == 4, true, + platform::errors::InvalidArgument( + "Input(X) of CorrelationOp must be 4 dims." + "But received dims is %d.", + in_dims.size())); + + PADDLE_ENFORCE_EQ(in2_dims.size() == 4, true, + platform::errors::InvalidArgument( + "Input(Y) of CorrelationOp must be 4 dims." 
+ "But received dims is %d.", + in2_dims.size())); + std::vector output_shape = + CorrelationOutputSize(in_dims[0], in_dims[2], in_dims[3], stride1, + stride2, kernel_size, pad_size, max_displacement); + ctx->SetOutputDim("Output", framework::make_ddim(output_shape)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto input_data_type = + OperatorWithKernel::IndicateVarDataType(ctx, "Input1"); + PADDLE_ENFORCE_EQ(input_data_type, ctx.Input("Input2")->type(), + platform::errors::InvalidArgument( + "X and Y shoule have the same datatype")); + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } + + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const Tensor& tensor, + const framework::OpKernelType& expected_kernel_type) const override { + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); + } +}; + +template +class CorrelationOpGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("correlation_grad"); + op->SetInput("Input1", this->Input("Input1")); + op->SetInput("Input2", this->Input("Input2")); + op->SetInput(framework::GradVarName("Output"), this->OutputGrad("Output")); + op->SetOutput(framework::GradVarName("Input1"), this->InputGrad("Input1")); + op->SetOutput(framework::GradVarName("Input2"), this->InputGrad("Input2")); + op->SetAttrMap(this->Attrs()); + } +}; + +class CorrelationOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("Input1"), "Input", "X", "CorrelationOp"); + OP_INOUT_CHECK(ctx->HasInput("Input2"), "Input", "Y", "CorrelationOp"); + 
OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Output")), "Input", + "Output@GRAD", "CorrelationGradOp"); + + auto in1_dims = ctx->GetInputDim("Input1"); + auto in2_dims = ctx->GetInputDim("Input2"); + ctx->SetOutputDim(framework::GradVarName("Input1"), in1_dims); + ctx->SetOutputDim(framework::GradVarName("Input2"), in2_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "Input1"), ctx.GetPlace()); + } +}; + +template +class CorrelationKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx.GetPlace()), true, + platform::errors::Unimplemented("Correlation only supports GPU now.")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(correlation, ops::CorrelationOp, ops::CorrelationOpMaker, + ops::CorrelationOpGradMaker, + ops::CorrelationOpGradMaker); +REGISTER_OPERATOR(correlation_grad, ops::CorrelationOpGrad); +REGISTER_OP_CPU_KERNEL(correlation, ops::CorrelationKernel, + ops::CorrelationKernel); diff --git a/paddle/fluid/operators/correlation_op.cu b/paddle/fluid/operators/correlation_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..0d177f653ec3d5cacbd4d938e6e5da6689b1bc74 --- /dev/null +++ b/paddle/fluid/operators/correlation_op.cu @@ -0,0 +1,483 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +#define THREADS_PER_BLOCK 32 +#define FULL_MASK 0xffffffff + +using framework::Tensor; +using DataLayout = framework::DataLayout; + +template +__forceinline__ __device__ T warpReduceSum(T val) { + for (int offset = 16; offset > 0; offset /= 2) { + val += __shfl_down_sync(FULL_MASK, val, offset); + } + return val; +} + +template +__forceinline__ __device__ T blockReduceSum(T val) { + static __shared__ T shared[32]; + int lane = threadIdx.x % warpSize; + int wid = threadIdx.x / warpSize; + + val = warpReduceSum(val); + if (lane == 0) shared[wid] = val; + + __syncthreads(); + val = (threadIdx.x < blockDim.x / warpSize) ? 
shared[lane] : 0; + + if (wid == 0) val = warpReduceSum(val); + + return val; +} + +template +__global__ void set_zero(T *x, int num) { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num; + i += blockDim.x * gridDim.x) + x[i] = static_cast(0); +} + +template +__global__ void channel_first(const T *input, T *rinput, const int channel, + const int height, const int width, + const int pad_size) { + int n = blockIdx.x; + int h = blockIdx.y; + int w = blockIdx.z; + + int ch_off = threadIdx.x; + T value; + int dimchw = channel * height * width; + int dimhw = height * width; + + int p_dimw = (width + 2 * pad_size); + int p_dimh = (height + 2 * pad_size); + int p_dimchw = channel * p_dimw * p_dimh; + int p_dimcw = channel * p_dimw; + + for (int c = ch_off; c < channel; c += THREADS_PER_BLOCK) { + value = input[n * dimchw + c * dimhw + h * width + w]; + rinput[n * p_dimchw + (h + pad_size) * p_dimcw + (w + pad_size) * channel + + c] = value; + } +} + +template +__global__ void correlation_forward( + T *output, const int output_channel, const int output_height, + const int output_width, const T *rinput1, const int input_channel, + const int input_height, const int input_width, const T *rinput2, + const int pad_size, const int kernel_size, const int max_displacement, + const int stride1, const int stride2) { + int p_input_width = input_width + 2 * pad_size; + int p_input_height = input_height + 2 * pad_size; + + int kernel_rad = (kernel_size - 1) / 2; + int displacement_rad = max_displacement / stride2; + + int displacement_size = 2 * displacement_rad + 1; + + int n = blockIdx.x; + int h1 = blockIdx.y * stride1 + max_displacement; + int w1 = blockIdx.z * stride1 + max_displacement; + int c = threadIdx.x; + + int p_dimchw = p_input_height * p_input_width * input_channel; + int p_dimcw = p_input_width * input_channel; + int p_dimc = input_channel; + + int t_dimchw = output_channel * output_height * output_width; + int t_dimhw = output_height * output_width; + int 
t_dimw = output_width; + + int nelems = kernel_size * kernel_size * p_dimc; + + for (int tj = -displacement_rad; tj <= displacement_rad; ++tj) { + for (int ti = -displacement_rad; ti <= displacement_rad; ++ti) { + int w2 = w1 + ti * stride2; + int h2 = h1 + tj * stride2; + + T acc0 = 0; + for (int j = -kernel_rad; j <= kernel_rad; ++j) { + for (int i = -kernel_rad; i <= kernel_rad; ++i) { + for (int ch = c; ch < p_dimc; ch += blockDim.x) { + int index1 = + n * p_dimchw + (h1 + j) * p_dimcw + (w1 + i) * p_dimc + ch; + int index2 = + n * p_dimchw + (h2 + j) * p_dimcw + (w2 + i) * p_dimc + ch; + acc0 += static_cast(rinput1[index1] * rinput2[index2]); + } + } + } + if (blockDim.x == warpSize) { + __syncwarp(); + acc0 = warpReduceSum(acc0); + } else { + __syncthreads(); + acc0 = blockReduceSum(acc0); + } + + if (threadIdx.x == 0) { + int tc = (tj + displacement_rad) * displacement_size + + (ti + displacement_rad); + const int t_index = + n * t_dimchw + tc * t_dimhw + blockIdx.y * t_dimw + blockIdx.z; + output[t_index] = static_cast(acc0 / nelems); + } + } + } +} + +// class CorrelationKernel +template +class CorrelationCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, + platform::errors::InvalidArgument( + "Correlation only supports GPU now.")); + + auto *input1 = ctx.Input("Input1"); + auto *input2 = ctx.Input("Input2"); + int pad_size = ctx.Attr("pad_size"); + int kernel_size = ctx.Attr("kernel_size"); + int stride1 = ctx.Attr("stride1"); + int stride2 = ctx.Attr("stride2"); + int max_displacement = ctx.Attr("max_displacement"); + int corr_type_multiply = ctx.Attr("corr_type_multiply"); + + auto *output = ctx.Output("Output"); + output->mutable_data(ctx.GetPlace()); + auto &dev_ctx = ctx.template device_context(); + + // base on input1, NCHW + auto in_dims = input1->dims(); + int N = in_dims[0]; + int C = in_dims[1]; + int H = 
in_dims[2]; + int W = in_dims[3]; + + int padded_input_height = H + 2 * pad_size; + int padded_input_width = W + 2 * pad_size; + + Tensor rinput1 = ctx.AllocateTmpTensor( + {N, padded_input_height, padded_input_width, C}, dev_ctx); + rinput1.mutable_data(ctx.GetPlace()); + + Tensor rinput2 = ctx.AllocateTmpTensor( + {N, padded_input_height, padded_input_width, C}, dev_ctx); + rinput2.mutable_data(ctx.GetPlace()); + + set_zero<<<(rinput1.numel() + 512 - 1) / 512, 512, 0, dev_ctx.stream()>>>( + rinput1.data(), rinput1.numel()); + set_zero<<<(rinput2.numel() + 512 - 1) / 512, 512, 0, dev_ctx.stream()>>>( + rinput2.data(), rinput2.numel()); + set_zero<<<(output->numel() + 512 - 1) / 512, 512, 0, dev_ctx.stream()>>>( + output->data(), output->numel()); + + auto out_dims = output->dims(); + int OC = out_dims[1]; + int OH = out_dims[2]; + int OW = out_dims[3]; + + dim3 blocks_grid(N, H, W); + dim3 threads_block(THREADS_PER_BLOCK); + + channel_first<<>>( + input1->data(), rinput1.data(), C, H, W, pad_size); + channel_first<<>>( + input2->data(), rinput2.data(), C, H, W, pad_size); + + dim3 threadsPerBlock(THREADS_PER_BLOCK); + dim3 totalBlocksCorr(N, OH, OW); + + correlation_forward< + T><<>>( + output->data(), OC, OH, OW, rinput1.data(), C, H, W, + rinput2.data(), pad_size, kernel_size, max_displacement, stride1, + stride2); + } +}; + +template +__global__ void correlation_backward_input1( + int item, T *grad_input1, const int input_channel, const int input_height, + const int input_width, const T *grad_output, const int output_channel, + const int output_height, const int output_width, const T *rinput2, + const int pad_size, const int kernel_size, const int max_displacement, + const int stride1, const int stride2) { + int n = item; + int h = blockIdx.x * stride1 + pad_size; + int w = blockIdx.y * stride1 + pad_size; + int c = blockIdx.z; + int tch_off = threadIdx.x; + + int kernel_rad = (kernel_size - 1) / 2; + int displacement_rad = max_displacement / stride2; + int 
displacement_size = 2 * displacement_rad + 1; + + int xmin = (w - kernel_rad - max_displacement) / stride1; + int ymin = (h - kernel_rad - max_displacement) / stride1; + + int xmax = (w + kernel_rad - max_displacement) / stride1; + int ymax = (h + kernel_rad - max_displacement) / stride1; + + if (xmax < 0 || ymax < 0 || xmin >= output_width || ymin >= output_height) { + return; + } + + if (xmin > xmax || ymin > ymax) { + return; + } + + xmin = max(0, xmin); + xmax = min(output_width - 1, xmax); + + ymin = max(0, ymin); + ymax = min(output_height - 1, ymax); + + int p_input_width = input_width + 2 * pad_size; + int p_input_height = input_height + 2 * pad_size; + int p_dimchw = input_channel * p_input_height * p_input_width; + int p_dimcw = input_channel * p_input_width; + int p_dimc = input_channel; + + int t_dimchw = output_channel * output_height * output_width; + int t_dimhw = output_height * output_width; + int t_dimw = output_width; + + int o_dimchw = input_channel * input_height * input_width; + int o_dimhw = input_height * input_width; + int o_dimw = input_width; + + int nelems = kernel_size * kernel_size * input_channel; + + __shared__ T prod_sum[THREADS_PER_BLOCK]; + prod_sum[tch_off] = 0; + + for (int tc = tch_off; tc < output_channel; tc += THREADS_PER_BLOCK) { + int i2 = (tc % displacement_size - displacement_rad) * stride2; + int j2 = (tc / displacement_size - displacement_rad) * stride2; + + int index2 = n * p_dimchw + (h + j2) * p_dimcw + (w + i2) * p_dimc + c; + + T val2 = rinput2[index2]; + for (int j = ymin; j <= ymax; ++j) { + for (int i = xmin; i <= xmax; ++i) { + int t_index = n * t_dimchw + tc * t_dimhw + j * t_dimw + i; + prod_sum[tch_off] += grad_output[t_index] * val2; + } + } + } + + __syncthreads(); + + if (tch_off == 0) { + T reduce_sum = 0; + for (int index = 0; index < THREADS_PER_BLOCK; index++) { + reduce_sum += prod_sum[index]; + } + const int index1 = + n * o_dimchw + c * o_dimhw + (h - pad_size) * o_dimw + (w - pad_size); + 
grad_input1[index1] = static_cast(reduce_sum / nelems); + } +} + +template +__global__ void correlation_backward_input2( + int item, T *grad_input2, const int input_channel, const int input_height, + const int input_width, const T *grad_output, const int output_channel, + const int output_height, const int output_width, const T *rinput1, + const int pad_size, const int kernel_size, const int max_displacement, + const int stride1, const int stride2) { + int n = item; + int h = blockIdx.x * stride1 + pad_size; + int w = blockIdx.y * stride1 + pad_size; + int c = blockIdx.z; + + int tch_off = threadIdx.x; + + int kernel_rad = (kernel_size - 1) / 2; + int displacement_rad = max_displacement / stride2; + int displacement_size = 2 * displacement_rad + 1; + + int p_input_width = input_width + 2 * pad_size; + int p_input_height = input_height + 2 * pad_size; + int p_dimchw = input_channel * p_input_height * p_input_width; + int p_dimcw = input_channel * p_input_width; + int p_dimc = input_channel; + + int t_dimchw = output_channel * output_height * output_width; + int t_dimhw = output_height * output_width; + int t_dimw = output_width; + + int o_dimchw = input_channel * input_height * input_width; + int o_dimhw = input_height * input_width; + int o_dimw = input_width; + + int nelems = kernel_size * kernel_size * input_channel; + + __shared__ T prod_sum[THREADS_PER_BLOCK]; + prod_sum[tch_off] = 0; + + for (int tc = tch_off; tc < output_channel; tc += THREADS_PER_BLOCK) { + int i2 = (tc % displacement_size - displacement_rad) * stride2; + int j2 = (tc / displacement_size - displacement_rad) * stride2; + + int xmin = (w - kernel_rad - max_displacement - i2) / stride1; + int ymin = (h - kernel_rad - max_displacement - j2) / stride1; + + int xmax = (w + kernel_rad - max_displacement - i2) / stride1; + int ymax = (h + kernel_rad - max_displacement - j2) / stride1; + + if (xmax < 0 || ymax < 0 || xmin >= output_width || ymin >= output_height) { + continue; + } + + if (xmin > xmax 
|| ymin > ymax) { + continue; + } + + xmin = max(0, xmin); + xmax = min(output_width - 1, xmax); + + ymin = max(0, ymin); + ymax = min(output_height - 1, ymax); + + int index1 = n * p_dimchw + (h - j2) * p_dimcw + (w - i2) * p_dimc + c; + T val1 = rinput1[index1]; + for (int j = ymin; j <= ymax; ++j) { + for (int i = xmin; i <= xmax; ++i) { + int t_index = n * t_dimchw + tc * t_dimhw + j * t_dimw + i; + prod_sum[tch_off] += grad_output[t_index] * val1; + } + } + } + + __syncthreads(); + + if (tch_off == 0) { + T reduce_sum = 0; + for (int index = 0; index < THREADS_PER_BLOCK; index++) { + reduce_sum += prod_sum[index]; + } + const int index2 = + n * o_dimchw + c * o_dimhw + (h - pad_size) * o_dimw + (w - pad_size); + grad_input2[index2] = static_cast(reduce_sum / nelems); + } +} + +template +class CorrelationCUDAGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, + platform::errors::InvalidArgument( + "Correlation only supports GPU now.")); + const auto *input1 = ctx.Input("Input1"); + const auto *input2 = ctx.Input("Input2"); + const auto *grad_output = + ctx.Input(framework::GradVarName("Output")); + const int pad_size = ctx.Attr("pad_size"); + const int kernel_size = ctx.Attr("kernel_size"); + const int stride1 = ctx.Attr("stride1"); + const int stride2 = ctx.Attr("stride2"); + const int max_displacement = ctx.Attr("max_displacement"); + const int corr_type_multiply = ctx.Attr("corr_type_multiply"); + + auto *grad_input1 = ctx.Output(framework::GradVarName("Input1")); + grad_input1->mutable_data(ctx.GetPlace()); + auto *grad_input2 = ctx.Output(framework::GradVarName("Input2")); + grad_input2->mutable_data(ctx.GetPlace()); + auto &dev_ctx = ctx.template device_context(); + + auto in_dims = input1->dims(); + int N = in_dims[0]; + int C = in_dims[1]; + int H = in_dims[2]; + int W = in_dims[3]; + + int padded_input_height = H + 2 
* pad_size; + int padded_input_width = W + 2 * pad_size; + + Tensor rinput1 = ctx.AllocateTmpTensor( + {N, padded_input_height, padded_input_width, C}, dev_ctx); + rinput1.mutable_data(ctx.GetPlace()); + + Tensor rinput2 = ctx.AllocateTmpTensor( + {N, padded_input_height, padded_input_width, C}, dev_ctx); + rinput2.mutable_data(ctx.GetPlace()); + + set_zero<<<(rinput1.numel() + 512 - 1) / 512, 512, 0, dev_ctx.stream()>>>( + rinput1.data(), rinput1.numel()); + set_zero<<<(rinput2.numel() + 512 - 1) / 512, 512, 0, dev_ctx.stream()>>>( + rinput2.data(), rinput2.numel()); + set_zero<<<(grad_input1->numel() + 512 - 1) / 512, 512, 0, + dev_ctx.stream()>>>(grad_input1->data(), + grad_input1->numel()); + set_zero<<<(grad_input2->numel() + 512 - 1) / 512, 512, 0, + dev_ctx.stream()>>>(grad_input2->data(), + grad_input2->numel()); + + auto grad_out_dims = grad_output->dims(); + int GOC = grad_out_dims[1]; + int GOH = grad_out_dims[2]; + int GOW = grad_out_dims[3]; + + dim3 blocks_grid(N, H, W); + dim3 threads_block(THREADS_PER_BLOCK); + + channel_first<<>>( + input1->data(), rinput1.data(), C, H, W, pad_size); + channel_first<<>>( + input2->data(), rinput2.data(), C, H, W, pad_size); + + dim3 threadsPerBlock(THREADS_PER_BLOCK); + dim3 totalBlocksCorr(H, W, C); + + for (int n = 0; n < N; n++) { + correlation_backward_input1< + T><<>>( + n, grad_input1->data(), C, H, W, grad_output->data(), GOC, GOH, + GOW, rinput2.data(), pad_size, kernel_size, max_displacement, + stride1, stride2); + } + + for (int n = 0; n < N; n++) { + correlation_backward_input2< + T><<>>( + n, grad_input2->data(), C, H, W, grad_output->data(), GOC, GOH, + GOW, rinput1.data(), pad_size, kernel_size, max_displacement, + stride1, stride2); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(correlation, ops::CorrelationCUDAKernel, + ops::CorrelationCUDAKernel); +REGISTER_OP_CUDA_KERNEL(correlation_grad, ops::CorrelationCUDAGradKernel, 
+ ops::CorrelationCUDAGradKernel); diff --git a/paddle/fluid/operators/cudnn_lstm_cache.h b/paddle/fluid/operators/cudnn_lstm_cache.h new file mode 100644 index 0000000000000000000000000000000000000000..3181e4b1d990b775f2c80a5d13f391886f83b080 --- /dev/null +++ b/paddle/fluid/operators/cudnn_lstm_cache.h @@ -0,0 +1,176 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/dynload/cudnn.h" + +namespace paddle { +namespace operators { + +class ScopedRNNBase { + public: + ScopedRNNBase(int seq_length, int batch_size, int input_size, int hidden_size, + int num_layers, float dropout_prob, int seed, int weight_numel, + bool initialized, bool is_bidirec) + : seq_length_(seq_length), + batch_size_(batch_size), + input_size_(input_size), + hidden_size_(hidden_size), + num_layers_(num_layers), + dropout_prob_(dropout_prob), + seed_(seed), + weight_numel_(weight_numel), + initialized_(initialized), + is_bidirec_(is_bidirec) {} + + template + void Create(const cudnnHandle_t& handle, const platform::Place& place, + const std::vector& sequence_length, size_t* workspace_size, + size_t* reserve_size, framework::Tensor* dropout_state) { + int numDirections = is_bidirec_ ? 
2 : 1; + cudnnDataType_t cudnn_type = platform::CudnnDataType::type; + + // ------------------- cudnn x, y descriptors --------------------- + std::vector dims_x = {batch_size_, input_size_, 1}; + std::vector strides_x = {input_size_, 1, 1}; + std::vector dims_y = {batch_size_, hidden_size_ * numDirections, 1}; + std::vector strides_y = {hidden_size_ * numDirections, 1, 1}; + for (int i = 0; i < seq_length_; ++i) { + x_descs_.emplace_back(x_desc_.descriptor(dims_x, strides_x)); + y_descs_.emplace_back(y_desc_.descriptor(dims_y, strides_y)); + } + +#if CUDNN_VERSION >= 7201 + if (!sequence_length.empty()) { + x_seq_desc_.descriptor(seq_length_, batch_size_, input_size_, true, + sequence_length); + y_seq_desc_.descriptor(seq_length_, batch_size_, + hidden_size_ * numDirections, true, + sequence_length); + } +#endif + + // ------------------- cudnn hx, hy, cx, cy descriptors---------- + std::vector dims_hx = {num_layers_ * numDirections, batch_size_, + hidden_size_}; + std::vector strides_hx = {hidden_size_ * batch_size_, hidden_size_, 1}; + init_h_desc_.descriptor(dims_hx, strides_hx); + init_c_desc_.descriptor(dims_hx, strides_hx); + last_h_desc_.descriptor(dims_hx, strides_hx); + last_c_desc_.descriptor(dims_hx, strides_hx); + + // ------------------- cudnn dropout descriptors --------------------- + size_t state_size; + if (!initialized_) { + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnDropoutGetStatesSize(handle, &state_size)); + dropout_state->mutable_data({static_cast(state_size)}, + place); + } + dropout_desc_.descriptor(handle, place, initialized_, dropout_prob_, + dropout_state, seed_, state_size); + +// ------------------- cudnn rnn descriptors --------------------- +#if CUDNN_VERSION >= 6000 + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor_v6( + handle, rnn_desc_.desc(), hidden_size_, num_layers_, + dropout_desc_.desc(), CUDNN_LINEAR_INPUT, + is_bidirec_ ? 
CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, + CUDNN_RNN_ALGO_STANDARD, cudnn_type)); +#else + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor( + rnn_desc_.desc(), hidden_size_, num_layers_, dropout_desc_.desc(), + CUDNN_LINEAR_INPUT, + is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, + cudnn_type)); +#endif + +#if CUDNN_VERSION >= 7201 + if (!sequence_length.empty()) { + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNPaddingMode( + rnn_desc_.desc(), CUDNN_RNN_PADDED_IO_ENABLED)); + } +#endif + + // ------------------- cudnn weights_size --------------------- + size_t weights_size_; + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnGetRNNParamsSize( + handle, rnn_desc_.desc(), x_descs_[0], &weights_size_, cudnn_type)); + PADDLE_ENFORCE_EQ( + weights_size_, sizeof(T) * weight_numel_, + platform::errors::InvalidArgument( + "The cudnn lstm and setting weight size should be same.")); + // ------------------- cudnn weight descriptors --------------------- + platform::DataLayout layout = platform::DataLayout::kNCHW; + int dim_tmp = weights_size_ / sizeof(T); + std::vector dim_w = {dim_tmp, 1, 1}; + weight_desc_.descriptor(layout, dim_w); + // ------------------- cudnn workspace, reserve size --------------------- + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnGetRNNWorkspaceSize( + handle, rnn_desc_.desc(), seq_length_, x_descs_.data(), + workspace_size)); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnGetRNNTrainingReserveSize( + handle, rnn_desc_.desc(), seq_length_, x_descs_.data(), + reserve_size)); + } + cudnnTensorDescriptor_t* x_descs() { return x_descs_.data(); } + cudnnTensorDescriptor_t* y_descs() { return y_descs_.data(); } +#if CUDNN_VERSION >= 7201 + cudnnRNNDataDescriptor_t x_seq_desc() { return x_seq_desc_.desc(); } + cudnnRNNDataDescriptor_t y_seq_desc() { return y_seq_desc_.desc(); } +#endif + cudnnTensorDescriptor_t init_h_desc() { return init_h_desc_.desc(); } + 
cudnnTensorDescriptor_t init_c_desc() { return init_c_desc_.desc(); } + cudnnTensorDescriptor_t last_h_desc() { return last_h_desc_.desc(); } + cudnnTensorDescriptor_t last_c_desc() { return last_c_desc_.desc(); } + cudnnRNNDescriptor_t rnn_desc() { return rnn_desc_.desc(); } + cudnnDropoutDescriptor_t dropout_desc() { return dropout_desc_.desc(); } + cudnnFilterDescriptor_t weight_desc() { return weight_desc_.desc(); } + + private: + int seq_length_; + int batch_size_; + int input_size_; + int hidden_size_; + int num_layers_; + float dropout_prob_; + int seed_; + int weight_numel_; + bool initialized_; + bool is_bidirec_; + std::vector x_descs_; + std::vector y_descs_; + + platform::ScopedTensorDescriptor x_desc_; + platform::ScopedTensorDescriptor y_desc_; +#if CUDNN_VERSION >= 7201 + platform::ScopedRNNTensorDescriptor x_seq_desc_; + platform::ScopedRNNTensorDescriptor y_seq_desc_; +#endif + platform::ScopedTensorDescriptor init_h_desc_; + platform::ScopedTensorDescriptor init_c_desc_; + platform::ScopedTensorDescriptor last_h_desc_; + platform::ScopedTensorDescriptor last_c_desc_; + platform::ScopedDropoutDescriptor dropout_desc_; + platform::ScopedFilterDescriptor weight_desc_; + platform::ScopedRNNDescriptor rnn_desc_; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/cudnn_lstm_op.cc b/paddle/fluid/operators/cudnn_lstm_op.cc index 7081490fd1bf0e26cb8aa90d69a76a5476cef044..82954bc109a740c0fe31ab889eb07bbbe3f52417 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cc @@ -37,41 +37,52 @@ class CudnnLSTMOp : public framework::OperatorWithKernel { OP_INOUT_CHECK(ctx->HasOutput("LastC"), "Output", "LastC", "CudnnLSTM"); auto in_dims = ctx->GetInputDim("Input"); - auto init_dims = ctx->GetInputDim("InitH"); + auto init_h_dims = ctx->GetInputDim("InitH"); + auto init_c_dims = ctx->GetInputDim("InitC"); + PADDLE_ENFORCE_EQ(in_dims.size(), 3, platform::errors::InvalidArgument( "The 
rank of Input in CudnnLSTM must be 3. But " "received Input's rank is %d.", in_dims.size())); - PADDLE_ENFORCE_EQ(init_dims.size(), 3, + PADDLE_ENFORCE_EQ(init_h_dims.size(), 3, platform::errors::InvalidArgument( "The rank of InitH in CudnnLSTM must be 3. But " "received InitH's rank is %d.", - init_dims.size())); - - PADDLE_ENFORCE_EQ(in_dims[1], init_dims[1], + init_h_dims.size())); + + if (ctx->HasInput("SequenceLength")) { + auto seq_dims = ctx->GetInputDim("SequenceLength"); + PADDLE_ENFORCE_EQ( + in_dims[1], seq_dims[0], + platform::errors::InvalidArgument( + "The size of SequenceLength has to equal the batch_size. But " + "received batch_size is %d and the size of SequenceLength is %d.", + in_dims[1], seq_dims[0])); + } + + PADDLE_ENFORCE_EQ( + in_dims[1], init_h_dims[1], + platform::errors::InvalidArgument( + "The in_dims[1] (Input dims) and init_h_dims[1] (InitH " + "dims) should be equal. But " + "received in_dims[1] is %d and init_h_dims[1] is %d.", + in_dims[1], init_h_dims[1])); + + PADDLE_ENFORCE_EQ(init_c_dims, init_h_dims, platform::errors::InvalidArgument( - "The in_dims[1] (Input dims) and init_dims[1] (InitH " - "dims) should be equal. But " - "received in_dims[1] is %d and init_dims[1] is %d.", - in_dims[1], init_dims[1])); - PADDLE_ENFORCE_EQ(in_dims[2], init_dims[2], - platform::errors::InvalidArgument( - "The in_dims[2] (Input dims) and init_dims[2] (InitH " - "dims) should be equal. But " - "received in_dims[2] is %d and init_dims[2] is %d.", - in_dims[2], init_dims[2])); + "The InitC dims and InitH " + "dims should be equal. But " + "received init_c_dims is %d and init_h_dims is %d.", + init_c_dims, init_h_dims)); auto out_dims = in_dims; auto hidden_size = ctx->Attrs().Get("hidden_size"); bool is_bidirec = ctx->Attrs().Get("is_bidirec"); out_dims[2] = is_bidirec ? hidden_size * 2 : hidden_size; - - auto last_dims = init_dims; - last_dims[0] = is_bidirec ? 
last_dims[0] * 2 : last_dims[0]; ctx->SetOutputDim("Out", out_dims); - ctx->SetOutputDim("LastH", last_dims); - ctx->SetOutputDim("LastC", last_dims); + ctx->SetOutputDim("LastH", init_c_dims); + ctx->SetOutputDim("LastC", init_h_dims); } protected: @@ -95,7 +106,7 @@ class CudnnLSTMOpMaker : public framework::OpProtoAndCheckerMaker { "different batch)" "batch_size is the instance number of this batch" "input_size is the hidden size of the input." - "input_hidden_size and the hidden_size in the next may not be same"); + "input_size and the hidden_size in the next may not be same"); AddInput("InitH", "(Tensor) the initial hidden state of the LSTM" "input. This is a tensor with shape (num_layers x batch_size x " @@ -112,6 +123,12 @@ class CudnnLSTMOpMaker : public framework::OpProtoAndCheckerMaker { "(Tensor) the learnable hidden-hidden weights." " The shape is (N), where N is total weight size of the LSTM. " " cudnn concatenate all the weight to one Tensor"); + AddInput("SequenceLength", + "(Tensor) When the input data is padding, " + "set this parameter. This parameter represents " + "the variable sequence lengths in a batch. 
" + "The size of the vector has to equal the batch_size.") + .AsDispensable(); AddOutput("Reserve", "(Tensor, a temporary output Tensor to store the reserve_data " "of cudnn kernel.") @@ -235,6 +252,9 @@ class CudnnLSTMGradOpMaker : public framework::SingleGradOpMaker { op->SetInput("InitH", this->Input("InitH")); op->SetInput("InitC", this->Input("InitC")); op->SetInput("W", this->Input("W")); + if (this->HasInput("SequenceLength")) { + op->SetInput("SequenceLength", this->Input("SequenceLength")); + } op->SetInput("Reserve", this->Output("Reserve")); op->SetInput("StateOut", this->Output("StateOut")); op->SetInput("Out", this->Output("Out")); diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc index 37e5e518ea2af9bb437775c8fa7e86816bb1d8ae..6457d9295dcbfa99d18f63fbda3dae048d7713cd 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc @@ -13,9 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/cudnn_rnn_cache.h" +#include "paddle/fluid/operators/cudnn_lstm_cache.h" #include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/utils.h" #include "paddle/fluid/platform/cudnn_desc.h" +#include "paddle/fluid/platform/cudnn_helper.h" namespace paddle { namespace operators { @@ -23,6 +25,43 @@ namespace operators { using LoDTensor = framework::LoDTensor; using Tensor = framework::Tensor; +template +void LSTMInferece(const bool &has_seq_length, const cudnnHandle_t &handle, + const int &seq_length, ScopedRNNBase *rnn, const T *x_data, + const T *init_h_data, const T *init_c_data, const T *w_data, + T *out_data, T *last_h_data, T *last_c_data, + framework::Tensor *workspace_data, + const size_t &workspace_size) { + if (!has_seq_length) { + // for inference + // This interface is used when the input/output is unpadded. + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardInference( + handle, rnn->rnn_desc(), seq_length, rnn->x_descs(), x_data, + rnn->init_h_desc(), init_h_data, rnn->init_c_desc(), init_c_data, + rnn->weight_desc(), w_data, rnn->y_descs(), out_data, + rnn->last_h_desc(), last_h_data, rnn->last_c_desc(), last_c_data, + workspace_data->data(), workspace_size)); + } else { +#if CUDNN_VERSION >= 7201 + // for inference + // This interface is used when the input/output is padded. 
+ PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardInferenceEx( + handle, rnn->rnn_desc(), rnn->x_seq_desc(), x_data, rnn->init_h_desc(), + init_h_data, rnn->init_c_desc(), init_c_data, rnn->weight_desc(), + w_data, rnn->y_seq_desc(), out_data, rnn->last_h_desc(), last_h_data, + rnn->last_c_desc(), last_c_data, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, workspace_data->data(), + workspace_size)); +#else + // CUDNN VERSION has to >=7.2.1 + PADDLE_THROW(platform::errors::Unavailable( + "The padded input is supported by " + "cudnnRNNForwardInferenceEx, but it only works when " + "the version of cudnn is larger than 7.2.1")); +#endif + } +} + template class CudnnLSTMGPUKernel : public framework::OpKernel { public: @@ -56,49 +95,74 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { bool is_test = ctx.Attr("is_test"); int seed = ctx.Attr("seed"); + bool has_seq_length = ctx.HasInput("SequenceLength"); + std::vector SequenceLength; + if (has_seq_length) { + auto *sequence_length = ctx.Input("SequenceLength"); + SequenceLength = operators::GetDataFromTensor(sequence_length); + } + auto &dev_ctx = ctx.template device_context(); auto handle = dev_ctx.cudnn_handle(); - CudnnRNNCache *cudnn_rnn_cache = new CudnnRNNCache(); + int seq_length = x->dims()[0]; + int batch_size = x->dims()[1]; + int input_size = x->dims()[2]; + int weight_numel = w->numel(); + bool state_initialized = state_out->IsInitialized() ? true : false; - auto input_w_numel = w->numel(); - auto seq_len = x->dims()[0]; - auto batch_size = x->dims()[1]; - auto input_dim = x->dims()[2]; + size_t workspace_size; size_t reserve_size; - bool state_initialized = state_out->IsInitialized() ? 
true : false; - cudnnDataType_t cudnn_type = platform::ToCudnnDataType( - framework::ToDataType(std::type_index(typeid(T)))); - cudnn_rnn_cache->init(handle, ctx.GetPlace(), seq_len, batch_size, - input_dim, hidden_size, num_layers, dropout_prob, - is_bidirec, seed, input_w_numel, &reserve_size, - state_out, state_initialized, cudnn_type); + + ScopedRNNBase rnn(seq_length, batch_size, input_size, hidden_size, + num_layers, dropout_prob, seed, weight_numel, + state_initialized, is_bidirec); + rnn.Create(handle, ctx.GetPlace(), SequenceLength, &workspace_size, + &reserve_size, state_out); + + framework::Tensor workspace_data_; + workspace_data_.mutable_data( + {static_cast(workspace_size)}, ctx.GetPlace()); auto *reserve_data = reserve->mutable_data( {static_cast(reserve_size)}, ctx.GetPlace()); if (is_test) { - // for inference - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardInference( - handle, cudnn_rnn_cache->rnn_desc_, seq_len, cudnn_rnn_cache->x_desc_, - x_data, cudnn_rnn_cache->hx_desc_, init_h_data, - cudnn_rnn_cache->cx_desc_, init_c_data, cudnn_rnn_cache->w_desc_, - w_data, cudnn_rnn_cache->y_desc_, out_data, cudnn_rnn_cache->hy_desc_, - last_h_data, cudnn_rnn_cache->cy_desc_, last_c_data, - cudnn_rnn_cache->workspace_data_.data(), - cudnn_rnn_cache->workspace_size_)); + LSTMInferece(has_seq_length, handle, seq_length, &rnn, x_data, + init_h_data, init_c_data, w_data, out_data, last_h_data, + last_c_data, &workspace_data_, workspace_size); } else { - // for train - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardTraining( - handle, cudnn_rnn_cache->rnn_desc_, seq_len, cudnn_rnn_cache->x_desc_, - x_data, cudnn_rnn_cache->hx_desc_, init_h_data, - cudnn_rnn_cache->cx_desc_, init_c_data, cudnn_rnn_cache->w_desc_, - w_data, cudnn_rnn_cache->y_desc_, out_data, cudnn_rnn_cache->hy_desc_, - last_h_data, cudnn_rnn_cache->cy_desc_, last_c_data, - cudnn_rnn_cache->workspace_data_.data(), - cudnn_rnn_cache->workspace_size_, reserve_data, 
reserve_size)); + if (!has_seq_length) { + // for train + // This interface is used when the input/output is unpadded. + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardTraining( + handle, rnn.rnn_desc(), seq_length, rnn.x_descs(), x_data, + rnn.init_h_desc(), init_h_data, rnn.init_c_desc(), init_c_data, + rnn.weight_desc(), w_data, rnn.y_descs(), out_data, + rnn.last_h_desc(), last_h_data, rnn.last_c_desc(), last_c_data, + workspace_data_.data(), workspace_size, reserve_data, + reserve_size)); + } else { +#if CUDNN_VERSION >= 7201 + // for train + // This interface is used when the input/output is padded. + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnRNNForwardTrainingEx( + handle, rnn.rnn_desc(), rnn.x_seq_desc(), x_data, + rnn.init_h_desc(), init_h_data, rnn.init_c_desc(), init_c_data, + rnn.weight_desc(), w_data, rnn.y_seq_desc(), out_data, + rnn.last_h_desc(), last_h_data, rnn.last_c_desc(), last_c_data, + nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, + nullptr, workspace_data_.data(), workspace_size, + reserve_data, reserve_size)); +#else + PADDLE_THROW(platform::errors::Unavailable( + "The padded input is supported by " + "cudnnRNNForwardTrainingEx, but it only works when " + "the version of cudnn is larger than 7.2.1")); +#endif + } } - delete cudnn_rnn_cache; } }; @@ -157,43 +221,77 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { int num_layers = ctx.Attr("num_layers"); int seed = ctx.Attr("seed"); - CudnnRNNCache *cudnn_rnn_cache = new CudnnRNNCache(); + bool has_seq_length = ctx.HasInput("SequenceLength"); + std::vector SequenceLength; + if (has_seq_length) { + auto *sequence_length = ctx.Input("SequenceLength"); + SequenceLength = operators::GetDataFromTensor(sequence_length); + } + + int seq_length = input_dims[0]; + int batch_size = input->dims()[1]; + int input_size = input->dims()[2]; + int weight_numel = weight->numel(); - auto input_w_numel = weight->numel(); - auto seq_len = input_dims[0]; 
- auto batch_size = input->dims()[1]; - auto input_dim = input->dims()[2]; + size_t workspace_size; size_t reserve_size; - cudnnDataType_t cudnn_type = platform::ToCudnnDataType( - framework::ToDataType(std::type_index(typeid(T)))); - cudnn_rnn_cache->init(handle, ctx.GetPlace(), seq_len, batch_size, - input_dim, hidden_size, num_layers, dropout_prob, - is_bidirec, seed, input_w_numel, &reserve_size, - const_cast(state_out), true, cudnn_type); - - auto work_data = cudnn_rnn_cache->workspace_data_.data(); + + ScopedRNNBase rnn(seq_length, batch_size, input_size, hidden_size, + num_layers, dropout_prob, seed, weight_numel, true, + is_bidirec); + + rnn.Create(handle, ctx.GetPlace(), SequenceLength, &workspace_size, + &reserve_size, const_cast(state_out)); + + framework::Tensor workspace_data_; + workspace_data_.mutable_data( + {static_cast(workspace_size)}, ctx.GetPlace()); const uint8_t *reserve_data = reserve->data(); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardData( - handle, cudnn_rnn_cache->rnn_desc_, seq_len, cudnn_rnn_cache->y_desc_, - out_data, cudnn_rnn_cache->y_desc_, out_grad_data, - cudnn_rnn_cache->hy_desc_, last_h_grad_data, cudnn_rnn_cache->cy_desc_, - last_c_grad_data, cudnn_rnn_cache->w_desc_, weight_data, - cudnn_rnn_cache->hx_desc_, init_h_data, cudnn_rnn_cache->cx_desc_, - init_c_data, cudnn_rnn_cache->x_desc_, in_grad_data, - cudnn_rnn_cache->hx_desc_, init_h_grad_data, cudnn_rnn_cache->cx_desc_, - init_c_grad_data, work_data, cudnn_rnn_cache->workspace_size_, - const_cast(reserve_data), reserve_size)); - - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardWeights( - handle, cudnn_rnn_cache->rnn_desc_, seq_len, cudnn_rnn_cache->x_desc_, - input->data(), cudnn_rnn_cache->hx_desc_, init_h->data(), - cudnn_rnn_cache->y_desc_, out->data(), - cudnn_rnn_cache->workspace_data_.data(), - cudnn_rnn_cache->workspace_size_, cudnn_rnn_cache->w_desc_, - weight_grad->data(), const_cast(reserve_data), - reserve_size)); - delete 
cudnn_rnn_cache; + if (!has_seq_length) { + // This interface is used when the input/output is unpadded. + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardData( + handle, rnn.rnn_desc(), seq_length, rnn.y_descs(), out_data, + rnn.y_descs(), out_grad_data, rnn.last_h_desc(), last_h_grad_data, + rnn.last_c_desc(), last_c_grad_data, rnn.weight_desc(), weight_data, + rnn.init_h_desc(), init_h_data, rnn.init_c_desc(), init_c_data, + rnn.x_descs(), in_grad_data, rnn.init_h_desc(), init_h_grad_data, + rnn.init_c_desc(), init_c_grad_data, workspace_data_.data(), + workspace_size, const_cast(reserve_data), reserve_size)); + + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardWeights( + handle, rnn.rnn_desc(), seq_length, rnn.x_descs(), input->data(), + rnn.init_h_desc(), init_h->data(), rnn.y_descs(), out->data(), + workspace_data_.data(), workspace_size, rnn.weight_desc(), + weight_grad->data(), const_cast(reserve_data), + reserve_size)); + } else { +#if CUDNN_VERSION >= 7201 + // for train + // This interface is used when the input/output is padded. 
+ PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardDataEx( + handle, rnn.rnn_desc(), rnn.y_seq_desc(), out_data, rnn.y_seq_desc(), + out_grad_data, nullptr, nullptr, rnn.last_h_desc(), last_h_grad_data, + rnn.last_c_desc(), last_c_grad_data, rnn.weight_desc(), weight_data, + rnn.init_h_desc(), init_h_data, rnn.init_c_desc(), init_c_data, + rnn.x_seq_desc(), in_grad_data, rnn.init_h_desc(), init_h_grad_data, + rnn.init_c_desc(), init_c_grad_data, nullptr, nullptr, + workspace_data_.data(), workspace_size, + const_cast(reserve_data), reserve_size)); + + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardWeightsEx( + handle, rnn.rnn_desc(), rnn.x_seq_desc(), input->data(), + rnn.init_h_desc(), init_h->data(), rnn.y_seq_desc(), + out->data(), workspace_data_.data(), workspace_size, + rnn.weight_desc(), weight_grad->data(), + const_cast(reserve_data), reserve_size)); +#else + PADDLE_THROW(platform::errors::Unavailable( + "The padded input of rnn is supported by cudnnRNNBackwardDataEx, " + "cudnnRNNBackwardWeightsEx, but it only works when the version " + "of cudnn is larger than 7.2.1")); +#endif + } } }; diff --git a/paddle/fluid/operators/dequantize_abs_max_op.cc b/paddle/fluid/operators/dequantize_abs_max_op.cc index 48743f2e48c8a7686497adff52f23f31346aeda7..0d4d68d9f622fef9df4819d6092411a4d7db65f7 100644 --- a/paddle/fluid/operators/dequantize_abs_max_op.cc +++ b/paddle/fluid/operators/dequantize_abs_max_op.cc @@ -45,10 +45,8 @@ class DequantizeMaxAbsOp : public framework::OperatorWithKernel { : OperatorWithKernel(type, inputs, outputs, attrs) {} void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - "Input(X) of DequantizeMaxAbsOp should not be null."); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - "Output(Out) of DequantizeMaxAbsOp should not be null."); + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "DequantizeMaxAbs"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", 
"Out", "DequantizeMaxAbs"); ctx->ShareDim("X", /*->*/ "Out"); ctx->ShareLoD("X", /*->*/ "Out"); diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index 537063640e4ef6e49f7b991482f0f3122ecef02f..c2b7c27ab4adb5282ad7aa5f7a16c15f81ba5f5e 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -41,9 +41,13 @@ detection_library(sigmoid_focal_loss_op SRCS sigmoid_focal_loss_op.cc sigmoid_fo detection_library(retinanet_detection_output_op SRCS retinanet_detection_output_op.cc) if(WITH_GPU) - detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS memory cub) - detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc distribute_fpn_proposals_op.cu DEPS memory cub) - detection_library(collect_fpn_proposals_op SRCS collect_fpn_proposals_op.cc collect_fpn_proposals_op.cu DEPS memory cub) + set(TMPDEPS memory) + if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) + set(TMPDEPS memory cub) + endif() + detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS ${TMPDEPS}) + detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc distribute_fpn_proposals_op.cu DEPS ${TMPDEPS}) + detection_library(collect_fpn_proposals_op SRCS collect_fpn_proposals_op.cc collect_fpn_proposals_op.cu DEPS ${TMPDEPS}) else() detection_library(generate_proposals_op SRCS generate_proposals_op.cc) detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc) diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc index b3e3332fe3425301a3dede3a3d810697ad4debf3..44f602237da2e2c8fa26e39326f977d10235155d 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc @@ -10,6 +10,7 
@@ See the License for the specific language governing permissions and limitations under the License.*/ #include "paddle/fluid/operators/detection/collect_fpn_proposals_op.h" +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace operators { @@ -54,11 +55,14 @@ class CollectFpnProposalsOp : public framework::OperatorWithKernel { score_dim[1])); } context->SetOutputDim("FpnRois", {post_nms_topN, 4}); + if (context->HasOutput("RoisNum")) { + context->SetOutputDim("RoisNum", {-1}); + } if (!context->IsRuntime()) { // Runtime LoD infershape will be computed // in Kernel. context->ShareLoD("MultiLevelRois", "FpnRois"); } - if (context->IsRuntime()) { + if (context->IsRuntime() && !context->HasInputs("MultiLevelRoIsNum")) { std::vector roi_inputs = context->GetInputVarPtrs("MultiLevelRois"); std::vector score_inputs = @@ -99,7 +103,16 @@ class CollectFpnProposalsOpMaker : public framework::OpProtoAndCheckerMaker { "(LoDTensor) Multiple score LoDTensors from each level in shape" " (N, 1), N is the number of RoIs.") .AsDuplicable(); + AddInput( + "MultiLevelRoIsNum", + "(List of Tensor) The RoIs' number of each image on multiple levels." 
+ "The number on each level has the shape of (N), N is the number of " + "images.") + .AsDuplicable() + .AsDispensable(); AddOutput("FpnRois", "(LoDTensor) All selected RoIs with highest scores"); + AddOutput("RoisNum", "(Tensor), Number of RoIs in each images.") + .AsDispensable(); AddAttr("post_nms_topN", "Select post_nms_topN RoIs from" " all images and all fpn layers"); @@ -123,3 +136,14 @@ REGISTER_OPERATOR( REGISTER_OP_CPU_KERNEL(collect_fpn_proposals, ops::CollectFpnProposalsOpKernel, ops::CollectFpnProposalsOpKernel); +REGISTER_OP_VERSION(collect_fpn_proposals) + .AddCheckpoint( + R"ROC( + Upgrade collect_fpn_proposals add a new input + [MultiLevelRoIsNum] and add a new output [RoisNum].)ROC", + paddle::framework::compatible::OpVersionDesc() + .NewInput("MultiLevelRoIsNum", + "The RoIs' number of each image on multiple levels." + "The number on each level has the shape of (N), " + "N is the number of images.") + .NewOutput("RoisNum", "The number of RoIs in each image.")); diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu index 35222a85cd388f6fef3c61c440be7b36598d9e01..86207052bb2bef4f7bea34c2614fe7686f579de8 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu @@ -80,14 +80,27 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { int lod_size; auto place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()); + auto multi_rois_num = ctx.MultiInput("MultiLevelRoIsNum"); for (size_t i = 0; i < roi_ins.size(); ++i) { auto roi_in = roi_ins[i]; auto score_in = score_ins[i]; - auto roi_lod = roi_in->lod().back(); - lod_size = roi_lod.size() - 1; - for (size_t n = 0; n < lod_size; ++n) { - for (size_t j = roi_lod[n]; j < roi_lod[n + 1]; ++j) { - roi_batch_id_data[index++] = n; + if (multi_rois_num.size() > 0) { + framework::Tensor temp; + 
TensorCopySync(*multi_rois_num[i], platform::CPUPlace(), &temp); + const int* length_in = temp.data(); + lod_size = multi_rois_num[i]->numel(); + for (size_t n = 0; n < lod_size; ++n) { + for (size_t j = 0; j < length_in[n]; ++j) { + roi_batch_id_data[index++] = n; + } + } + } else { + auto length_in = roi_in->lod().back(); + lod_size = length_in.size() - 1; + for (size_t n = 0; n < lod_size; ++n) { + for (size_t j = length_in[n]; j < length_in[n + 1]; ++j) { + roi_batch_id_data[index++] = n; + } } } @@ -190,6 +203,13 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { offset.emplace_back(offset.back() + length_lod_cpu[i]); } + if (ctx.HasOutput("RoisNum")) { + auto* rois_num = ctx.Output("RoisNum"); + int* rois_num_data = rois_num->mutable_data({lod_size}, place); + memory::Copy(place, rois_num_data, place, length_lod_data, + lod_size * sizeof(int), dev_ctx.stream()); + } + framework::LoD lod; lod.emplace_back(offset); fpn_rois->set_lod(lod); diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.h b/paddle/fluid/operators/detection/collect_fpn_proposals_op.h index badd88f0689ba9defcb3f26eb57fef89308aa877..950b8b78933bff6bf1692df61142258dfbc87a8c 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.h +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.h @@ -17,6 +17,7 @@ limitations under the License.*/ #include #include #include +#include #include #include #include "paddle/fluid/framework/op_registry.h" @@ -65,6 +66,8 @@ class CollectFpnProposalsOpKernel : public framework::OpKernel { auto multi_layer_scores = context.MultiInput("MultiLevelScores"); + auto multi_rois_num = context.MultiInput("MultiLevelRoIsNum"); + int num_size = multi_rois_num.size(); auto* fpn_rois = context.Output("FpnRois"); @@ -88,11 +91,21 @@ class CollectFpnProposalsOpKernel : public framework::OpKernel { const int num_fpn_level = multi_layer_rois.size(); std::vector integral_of_all_rois(num_fpn_level + 1, 0); for (int i = 0; 
i < num_fpn_level; ++i) { - auto cur_rois_lod = multi_layer_rois[i]->lod().back(); - integral_of_all_rois[i + 1] = - integral_of_all_rois[i] + cur_rois_lod[cur_rois_lod.size() - 1]; + int all_rois = 0; + if (num_size == 0) { + auto cur_rois_lod = multi_layer_rois[i]->lod().back(); + all_rois = cur_rois_lod[cur_rois_lod.size() - 1]; + } else { + const int* cur_rois_num = multi_rois_num[i]->data(); + all_rois = std::accumulate( + cur_rois_num, cur_rois_num + multi_rois_num[i]->numel(), 0); + } + integral_of_all_rois[i + 1] = integral_of_all_rois[i] + all_rois; } + const int batch_size = (num_size == 0) + ? multi_layer_rois[0]->lod().back().size() - 1 + : multi_rois_num[0]->numel(); // concatenate all fpn rois scores into a list // create a vector to store all scores std::vector> scores_of_all_rois( @@ -100,11 +113,20 @@ class CollectFpnProposalsOpKernel : public framework::OpKernel { for (int i = 0; i < num_fpn_level; ++i) { const T* cur_level_scores = multi_layer_scores[i]->data(); int cur_level_num = integral_of_all_rois[i + 1] - integral_of_all_rois[i]; - auto cur_scores_lod = multi_layer_scores[i]->lod().back(); int cur_batch_id = 0; + int pre_num = 0; for (int j = 0; j < cur_level_num; ++j) { - if (static_cast(j) >= cur_scores_lod[cur_batch_id + 1]) { - cur_batch_id++; + if (num_size == 0) { + auto cur_scores_lod = multi_layer_scores[i]->lod().back(); + if (static_cast(j) >= cur_scores_lod[cur_batch_id + 1]) { + cur_batch_id++; + } + } else { + const int* rois_num_data = multi_rois_num[i]->data(); + if (j >= pre_num + rois_num_data[cur_batch_id]) { + pre_num += rois_num_data[cur_batch_id]; + cur_batch_id++; + } } int cur_index = j + integral_of_all_rois[i]; scores_of_all_rois[cur_index].score = cur_level_scores[j]; @@ -134,6 +156,9 @@ class CollectFpnProposalsOpKernel : public framework::OpKernel { T* fpn_rois_data = fpn_rois->data(); std::vector lod0(1, 0); int cur_batch_id = 0; + std::vector num_per_batch; + int pre_idx = 0; + int cur_num = 0; for (int i = 0; 
i < post_nms_topN; ++i) { int cur_fpn_level = scores_of_all_rois[i].level; int cur_level_index = scores_of_all_rois[i].index; @@ -144,6 +169,18 @@ class CollectFpnProposalsOpKernel : public framework::OpKernel { if (scores_of_all_rois[i].batch_id != cur_batch_id) { cur_batch_id = scores_of_all_rois[i].batch_id; lod0.emplace_back(i); + cur_num = i - pre_idx; + pre_idx = i; + num_per_batch.emplace_back(cur_num); + } + } + num_per_batch.emplace_back(post_nms_topN - pre_idx); + if (context.HasOutput("RoisNum")) { + auto* rois_num = context.Output("RoisNum"); + int* rois_num_data = + rois_num->mutable_data({batch_size}, context.GetPlace()); + for (int i = 0; i < batch_size; i++) { + rois_num_data[i] = num_per_batch[i]; } } lod0.emplace_back(post_nms_topN); diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc index 160d43a917b3c74ff905e070714415d35c5c877c..614b37e703e721337057e04c5611386ff87a1e9e 100644 --- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/detection/distribute_fpn_proposals_op.h" +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace operators { @@ -48,6 +49,14 @@ class DistributeFpnProposalsOp : public framework::OperatorWithKernel { } ctx->SetOutputsDim("MultiFpnRois", outs_dims); ctx->SetOutputDim("RestoreIndex", {-1, 1}); + + if (ctx->HasOutputs("MultiLevelRoIsNum")) { + std::vector outs_num_dims; + for (size_t i = 0; i < num_out_rois; ++i) { + outs_num_dims.push_back({-1}); + } + ctx->SetOutputsDim("MultiLevelRoIsNum", outs_num_dims); + } if (!ctx->IsRuntime()) { for (size_t i = 0; i < num_out_rois; ++i) { ctx->SetLoDLevel("MultiFpnRois", ctx->GetLoDLevel("FpnRois"), i); @@ -66,12 +75,22 @@ class DistributeFpnProposalsOp : public framework::OperatorWithKernel { class DistributeFpnProposalsOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddInput("FpnRois", "(LoDTensor) The rois at all levels in shape (-1, 4)"); + AddInput("FpnRois", "(LoDTensor) The RoIs at all levels in shape (-1, 4)"); + AddInput("RoisNum", + "(Tensor) The number of RoIs in shape (B)," + "B is the number of images") + .AsDispensable(); AddOutput("MultiFpnRois", "(LoDTensor) Output with distribute operator") .AsDuplicable(); AddOutput("RestoreIndex", "(Tensor) An array of positive number which is " "used to restore the order of FpnRois"); + AddOutput("MultiLevelRoIsNum", + "(List of Tensor) The RoIs' number of each image on multiple " + "levels. 
The number on each level has the shape of (B)," + "B is the number of images.") + .AsDuplicable() + .AsDispensable(); AddAttr("min_level", "The lowest level of FPN layer where the" " proposals come from"); @@ -105,3 +124,14 @@ REGISTER_OPERATOR( REGISTER_OP_CPU_KERNEL(distribute_fpn_proposals, ops::DistributeFpnProposalsOpKernel, ops::DistributeFpnProposalsOpKernel); +REGISTER_OP_VERSION(distribute_fpn_proposals) + .AddCheckpoint( + R"ROC( + Upgrade distribute_fpn_proposals add a new input + [RoisNum] and add a new output [MultiLevelRoIsNum].)ROC", + paddle::framework::compatible::OpVersionDesc() + .NewInput("RoIsNum", "The number of RoIs in each image.") + .NewOutput("MultiLevelRoisNum", + "The RoIs' number of each image on multiple " + "levels. The number on each level has the shape of (B)," + "B is the number of images.")); diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu index 1e3cd9f36c595f978f5b5e5f5c5cf5cad6dc9059..27c06a0f8fb207b5dc85c7875ea91428b16e606c 100644 --- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu @@ -76,12 +76,20 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel { int num_level = max_level - min_level + 1; // check that the fpn_rois is not empty - PADDLE_ENFORCE_EQ( - fpn_rois->lod().size(), 1UL, - platform::errors::InvalidArgument("DistributeFpnProposalsOp needs LoD" - "with one level")); + if (!ctx.HasInput("RoisNum")) { + PADDLE_ENFORCE_EQ( + fpn_rois->lod().size(), 1UL, + platform::errors::InvalidArgument("DistributeFpnProposalsOp needs LoD" + "with one level")); + } - auto fpn_rois_lod = fpn_rois->lod().back(); + std::vector fpn_rois_lod; + if (ctx.HasInput("RoisNum")) { + auto* rois_num = ctx.Input("RoisNum"); + fpn_rois_lod = GetLodFromRoisNum(rois_num); + } else { + fpn_rois_lod = fpn_rois->lod().back(); + } int lod_size = 
fpn_rois_lod.size() - 1; int roi_num = fpn_rois_lod[lod_size]; @@ -154,6 +162,8 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel { restore_idx_data, roi_num); int start = 0; + auto multi_rois_num = ctx.MultiOutput("MultiLevelRoIsNum"); + for (int i = 0; i < num_level; ++i) { Tensor sub_lod = sub_lod_list.Slice(i, i + 1); int* sub_lod_data = sub_lod.data(); @@ -180,6 +190,11 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel { multi_fpn_rois[i]->mutable_data({sub_rois_num, kBoxDim}, dev_ctx.GetPlace()); } + if (multi_rois_num.size() > 0) { + Tensor* rois_num_t = multi_rois_num[i]; + TensorCopySync(sub_lod, dev_ctx.GetPlace(), rois_num_t); + rois_num_t->Resize({lod_size}); + } framework::LoD lod; lod.emplace_back(offset); multi_fpn_rois[i]->set_lod(lod); diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h index 0c84b385ccbc1dd26453bd957661c0310b7137e3..79498f01536d2fb2616921a2ef1ffa04f13fae64 100644 --- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h @@ -28,6 +28,21 @@ namespace operators { const int kBoxDim = 4; +inline std::vector GetLodFromRoisNum(const Tensor* rois_num) { + std::vector rois_lod; + auto* rois_num_data = rois_num->data(); + Tensor cpu_tensor; + if (platform::is_gpu_place(rois_num->place())) { + TensorCopySync(*rois_num, platform::CPUPlace(), &cpu_tensor); + rois_num_data = cpu_tensor.data(); + } + rois_lod.push_back(static_cast(0)); + for (int i = 0; i < rois_num->numel(); ++i) { + rois_lod.push_back(rois_lod.back() + static_cast(rois_num_data[i])); + } + return rois_lod; +} + template static inline T BBoxArea(const T* box, bool normalized) { if (box[2] < box[0] || box[3] < box[1]) { @@ -65,13 +80,22 @@ class DistributeFpnProposalsOpKernel : public framework::OpKernel { const int num_level = max_level - min_level + 1; // check that 
the fpn_rois is not empty - PADDLE_ENFORCE_EQ( - fpn_rois->lod().size(), 1UL, - platform::errors::InvalidArgument("DistributeFpnProposalsOp needs LoD " - "with one level.")); + if (!context.HasInput("RoisNum")) { + PADDLE_ENFORCE_EQ(fpn_rois->lod().size(), 1UL, + platform::errors::InvalidArgument( + "DistributeFpnProposalsOp needs LoD " + "with one level.")); + } - auto fpn_rois_lod = fpn_rois->lod().back(); - int fpn_rois_num = fpn_rois_lod[fpn_rois_lod.size() - 1]; + std::vector fpn_rois_lod; + int fpn_rois_num; + if (context.HasInput("RoisNum")) { + auto* rois_num = context.Input("RoisNum"); + fpn_rois_lod = GetLodFromRoisNum(rois_num); + } else { + fpn_rois_lod = fpn_rois->lod().back(); + } + fpn_rois_num = fpn_rois_lod[fpn_rois_lod.size() - 1]; std::vector target_level; // std::vector target_level(fpn_rois_num, -1); // record the number of rois in each level @@ -136,6 +160,18 @@ class DistributeFpnProposalsOpKernel : public framework::OpKernel { for (int i = 0; i < fpn_rois_num; ++i) { restore_index_data[restore_index_inter[i]] = i; } + auto multi_rois_num = context.MultiOutput("MultiLevelRoIsNum"); + if (multi_rois_num.size() > 0) { + int batch_size = fpn_rois_lod.size() - 1; + for (int i = 0; i < num_level; ++i) { + int* rois_num_data = multi_rois_num[i]->mutable_data( + {batch_size}, context.GetPlace()); + for (int j = 0; j < batch_size; ++j) { + rois_num_data[j] = static_cast(multi_fpn_rois_lod0[i][j + 1] - + multi_fpn_rois_lod0[i][j]); + } + } + } // merge lod information into LoDTensor for (int i = 0; i < num_level; ++i) { framework::LoD lod; diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc index 981a368e8564fbcd3d688bc67d2def8664bcfe8d..06e560f86d4e0a74f7ae04b155829618ce634697 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_op.cc @@ -17,6 +17,7 @@ limitations under the License. 
*/ #include #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/gather.h" #include "paddle/fluid/operators/math/math_function.h" @@ -61,6 +62,10 @@ class GenerateProposalsOp : public framework::OperatorWithKernel { ctx->SetOutputDim("RpnRois", {-1, 4}); ctx->SetOutputDim("RpnRoiProbs", {-1, 1}); + if (!ctx->IsRuntime()) { + ctx->SetLoDLevel("RpnRois", std::max(ctx->GetLoDLevel("Scores"), 1)); + ctx->SetLoDLevel("RpnRoiProbs", std::max(ctx->GetLoDLevel("Scores"), 1)); + } } protected: @@ -347,7 +352,7 @@ class GenerateProposalsKernel : public framework::OpKernel { lod0.push_back(0); anchors.Resize({anchors.numel() / 4, 4}); variances.Resize({variances.numel() / 4, 4}); - std::vector tmp_lod; + std::vector tmp_num; int64_t num_proposals = 0; for (int64_t i = 0; i < num; ++i) { @@ -369,16 +374,16 @@ class GenerateProposalsKernel : public framework::OpKernel { AppendProposals(rpn_roi_probs, num_proposals, scores); num_proposals += proposals.dims()[0]; lod0.push_back(num_proposals); - tmp_lod.push_back(num_proposals); + tmp_num.push_back(proposals.dims()[0]); } - if (context.HasOutput("RpnRoisLod")) { - auto *rpn_rois_lod = context.Output("RpnRoisLod"); - rpn_rois_lod->mutable_data({num}, context.GetPlace()); - int64_t *lod_data = rpn_rois_lod->data(); + if (context.HasOutput("RpnRoisNum")) { + auto *rpn_rois_num = context.Output("RpnRoisNum"); + rpn_rois_num->mutable_data({num}, context.GetPlace()); + int *num_data = rpn_rois_num->data(); for (int i = 0; i < num; i++) { - lod_data[i] = tmp_lod[i]; + num_data[i] = tmp_num[i]; } - rpn_rois_lod->Resize({num}); + rpn_rois_num->Resize({num}); } rpn_rois->set_lod(lod); rpn_roi_probs->set_lod(lod); @@ -433,6 +438,16 @@ class GenerateProposalsKernel : public framework::OpKernel { Tensor keep; FilterBoxes(ctx, &proposals, min_size, im_info_slice, &keep); + // Handle the case when there is no keep index left + if (keep.numel() == 0) { 
+ math::SetConstant set_zero; + bbox_sel.mutable_data({1, 4}, ctx.GetPlace()); + set_zero(ctx, &bbox_sel, static_cast(0)); + Tensor scores_filter; + scores_filter.mutable_data({1, 1}, ctx.GetPlace()); + set_zero(ctx, &scores_filter, static_cast(0)); + return std::make_pair(bbox_sel, scores_filter); + } Tensor scores_filter; bbox_sel.mutable_data({keep.numel(), 4}, ctx.GetPlace()); @@ -481,7 +496,8 @@ class GenerateProposalsOpMaker : public framework::OpProtoAndCheckerMaker { "(LoDTensor), Output proposals with shape (rois_num, 4)."); AddOutput("RpnRoiProbs", "(LoDTensor) Scores of proposals with shape (rois_num, 1)."); - AddOutput("RpnRoisLod", "(Tensor), rpn rois's lod info").AsDispensable(); + AddOutput("RpnRoisNum", "(Tensor), The number of Rpn RoIs in each image") + .AsDispensable(); AddAttr("pre_nms_topN", "Number of top scoring RPN proposals to keep before " "applying NMS."); @@ -515,3 +531,11 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker); REGISTER_OP_CPU_KERNEL(generate_proposals, ops::GenerateProposalsKernel, ops::GenerateProposalsKernel); +REGISTER_OP_VERSION(generate_proposals) + .AddCheckpoint( + R"ROC( + Upgrade generate_proposals add a new output [RpnRoisNum])ROC", + paddle::framework::compatible::OpVersionDesc().NewOutput( + "RpnRoisNum", + "The number of Rpn RoIs in each image. 
RpnRoisNum is " + "dispensable.")); diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cu b/paddle/fluid/operators/detection/generate_proposals_op.cu index fa7670f6d680a95da1c1abd5befe1651ccb7265f..485136d8e2f7ab66f6b1c58deb09036ea5d4e1ec 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cu +++ b/paddle/fluid/operators/detection/generate_proposals_op.cu @@ -330,6 +330,15 @@ static std::pair ProposalForOneImage( keep_index.Resize({keep_num}); Tensor scores_filter, proposals_filter; + // Handle the case when there is no keep index left + if (keep_num == 0) { + math::SetConstant set_zero; + proposals_filter.mutable_data({1, 4}, ctx.GetPlace()); + scores_filter.mutable_data({1, 1}, ctx.GetPlace()); + set_zero(ctx, &proposals_filter, static_cast(0)); + set_zero(ctx, &scores_filter, static_cast(0)); + return std::make_pair(proposals_filter, scores_filter); + } proposals_filter.mutable_data({keep_num, 4}, ctx.GetPlace()); scores_filter.mutable_data({keep_num, 1}, ctx.GetPlace()); GPUGather(ctx, proposals, keep_index, &proposals_filter); @@ -421,7 +430,7 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel { int64_t num_proposals = 0; std::vector offset(1, 0); - std::vector tmp_lod; + std::vector tmp_num; for (int64_t i = 0; i < num; ++i) { Tensor im_info_slice = im_info->Slice(i, i + 1); @@ -448,15 +457,15 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel { dev_ctx.Wait(); num_proposals += proposals.dims()[0]; offset.emplace_back(num_proposals); - tmp_lod.push_back(num_proposals); + tmp_num.push_back(proposals.dims()[0]); } - if (context.HasOutput("RpnRoisLod")) { - auto *rpn_rois_lod = context.Output("RpnRoisLod"); - rpn_rois_lod->mutable_data({num}, context.GetPlace()); - int64_t *lod_data = rpn_rois_lod->data(); - memory::Copy(place, lod_data, cpu_place, &tmp_lod[0], - sizeof(int64_t) * num, dev_ctx.stream()); - rpn_rois_lod->Resize({num}); + if (context.HasOutput("RpnRoisNum")) { + auto *rpn_rois_num = 
context.Output("RpnRoisNum"); + rpn_rois_num->mutable_data({num}, context.GetPlace()); + int *num_data = rpn_rois_num->data(); + memory::Copy(place, num_data, cpu_place, &tmp_num[0], sizeof(int) * num, + dev_ctx.stream()); + rpn_rois_num->Resize({num}); } framework::LoD lod; lod.emplace_back(offset); diff --git a/paddle/fluid/operators/detection/gpc.cc b/paddle/fluid/operators/detection/gpc.cc index b46d231d0ff7774c64745b3b77953cf2ed8d82f7..6b1b0cd8b3578a344978afae642b66759589ffde 100644 --- a/paddle/fluid/operators/detection/gpc.cc +++ b/paddle/fluid/operators/detection/gpc.cc @@ -532,7 +532,8 @@ static int count_contours(polygon_node *polygon) { } static void add_left(polygon_node *p, double x, double y) { - PADDLE_ENFORCE_NOT_NULL(p); + PADDLE_ENFORCE_NOT_NULL(p, paddle::platform::errors::InvalidArgument( + "Input polygon node is nullptr.")); vertex_node *nv = NULL; /* Create a new vertex node and set its fields */ @@ -588,7 +589,8 @@ static void add_right(polygon_node *p, double x, double y) { } static void merge_right(polygon_node *p, polygon_node *q, polygon_node *list) { - PADDLE_ENFORCE_NOT_NULL(p); + PADDLE_ENFORCE_NOT_NULL(p, paddle::platform::errors::InvalidArgument( + "Input polygon node is nullptr.")); polygon_node *target = NULL; /* Label contour as external */ @@ -664,7 +666,8 @@ void add_vertex(vertex_node **t, double x, double y) { } void gpc_vertex_create(edge_node *e, int p, int s, double x, double y) { - PADDLE_ENFORCE_NOT_NULL(e); + PADDLE_ENFORCE_NOT_NULL(e, paddle::platform::errors::InvalidArgument( + "Input edge node is nullptr.")); add_vertex(&(e->outp[p]->v[s]), x, y); e->outp[p]->active++; } @@ -693,7 +696,8 @@ static bbox *create_contour_bboxes(gpc_polygon *p) { gpc_malloc(box, p->num_contours * sizeof(bbox), const_cast("Bounding box creation")); - PADDLE_ENFORCE_NOT_NULL(box); + PADDLE_ENFORCE_NOT_NULL(box, paddle::platform::errors::ResourceExhausted( + "Failed to malloc box memory.")); /* Construct contour bounding boxes */ for (c = 0; 
c < p->num_contours; c++) { @@ -857,7 +861,9 @@ void gpc_add_contour(gpc_polygon *p, gpc_vertex_list *new_contour, int hole) { /* Create an extended hole array */ gpc_malloc(extended_hole, (p->num_contours + 1) * sizeof(int), const_cast("contour hole addition")); - PADDLE_ENFORCE_NOT_NULL(extended_hole); + PADDLE_ENFORCE_NOT_NULL(extended_hole, + paddle::platform::errors::ResourceExhausted( + "Failed to malloc extended hole memory.")); /* Create an extended contour array */ gpc_malloc(extended_contour, @@ -975,7 +981,9 @@ void gpc_polygon_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip, /* Build scanbeam table from scanbeam tree */ gpc_malloc(sbt, sbt_entries * sizeof(double), const_cast("sbt creation")); - PADDLE_ENFORCE_NOT_NULL(sbt); + PADDLE_ENFORCE_NOT_NULL(sbt, paddle::platform::errors::ResourceExhausted( + "Failed to malloc scanbeam table memory.")); + build_sbt(&scanbeam, sbt, sbtree); scanbeam = 0; free_sbtree(&sbtree); @@ -1017,7 +1025,9 @@ void gpc_polygon_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip, e0 = aet; e1 = aet; /* Set up bundle fields of first edge */ - PADDLE_ENFORCE_NOT_NULL(aet); + PADDLE_ENFORCE_NOT_NULL(aet, paddle::platform::errors::InvalidArgument( + "Edge node AET is nullptr.")); + aet->bundle[ABOVE][aet->type] = (aet->top.y != yb); aet->bundle[ABOVE][!aet->type] = 0; aet->bstate[ABOVE] = UNBUNDLED; @@ -1612,7 +1622,8 @@ void gpc_tristrip_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip, /* Build scanbeam table from scanbeam tree */ gpc_malloc(sbt, sbt_entries * sizeof(double), const_cast("sbt creation")); - PADDLE_ENFORCE_NOT_NULL(sbt); + PADDLE_ENFORCE_NOT_NULL(sbt, paddle::platform::errors::ResourceExhausted( + "Failed to malloc scanbeam table memory.")); build_sbt(&scanbeam, sbt, sbtree); scanbeam = 0; free_sbtree(&sbtree); @@ -1650,7 +1661,8 @@ void gpc_tristrip_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip, e1 = aet; /* Set up bundle fields of first edge */ - PADDLE_ENFORCE_NOT_NULL(aet); + 
PADDLE_ENFORCE_NOT_NULL(aet, paddle::platform::errors::InvalidArgument( + "Edge node AET is nullptr.")); aet->bundle[ABOVE][aet->type] = (aet->top.y != yb); aet->bundle[ABOVE][!aet->type] = 0; aet->bstate[ABOVE] = UNBUNDLED; diff --git a/paddle/fluid/operators/dist_op.h b/paddle/fluid/operators/dist_op.h index ca03400cfd1ef9a27ba8e725381515d5e4ebc0ba..a2279e40623b4ba2f0421e73a6148b89eb970e71 100644 --- a/paddle/fluid/operators/dist_op.h +++ b/paddle/fluid/operators/dist_op.h @@ -176,14 +176,26 @@ static void DistGradFunction(const framework::ExecutionContext& context) { } else if (p == INFINITY || p == -INFINITY) { // p=inf or -inf, Lp-norm = |z_i|, the j-th element of dz tends to 0 if // j!=i, or equals to sign(z_i) * dout if j=i. - grad_t.device(place) = - (x_minux_y_abs == out_t.broadcast(out_bcast_dims)).template cast() * - sign * out_grad_t.broadcast(out_bcast_dims); + if (platform::is_cpu_place(context.GetPlace())) { + grad_t.device(place) = (x_minux_y_abs == out_t.broadcast(out_bcast_dims)) + .template cast() * + sign.eval() * out_grad_t.broadcast(out_bcast_dims); + } else { + grad_t.device(place) = (x_minux_y_abs == out_t.broadcast(out_bcast_dims)) + .template cast() * + sign * out_grad_t.broadcast(out_bcast_dims); + } } else { // dz = pow(abs(x-y)/out, p-1) * sign(x-y) * dout - grad_t.device(place) = - (x_minux_y_abs / out_t.broadcast(out_bcast_dims)).pow(p - 1) * sign * - out_grad_t.broadcast(out_bcast_dims); + if (platform::is_cpu_place(context.GetPlace())) { + grad_t.device(place) = + (x_minux_y_abs / out_t.broadcast(out_bcast_dims)).pow(p - 1) * + sign.eval() * out_grad_t.broadcast(out_bcast_dims); + } else { + grad_t.device(place) = + (x_minux_y_abs / out_t.broadcast(out_bcast_dims)).pow(p - 1) * sign * + out_grad_t.broadcast(out_bcast_dims); + } } Eigen::DSizes x_reshape_dims; diff --git a/paddle/fluid/operators/distributed/variable_response.cc b/paddle/fluid/operators/distributed/variable_response.cc index 
c9c42e0938d51991c53b74ac6ad59c350f4a3ced..de77121ee3990366771723e3c43e53362c832ef7 100644 --- a/paddle/fluid/operators/distributed/variable_response.cc +++ b/paddle/fluid/operators/distributed/variable_response.cc @@ -62,6 +62,34 @@ bool VariableResponse::ReadRaw(::google::protobuf::io::CodedInputStream* input, gpu_dev_ctx.Wait(); #else PADDLE_THROW("Unexpected branch"); +#endif + return true; + } else if (platform::is_xpu_place(place)) { +#ifdef PADDLE_WITH_XPU + auto& xpu_dev_ctx = static_cast(dev_ctx); + platform::CPUPlace cpu; + char* p = reinterpret_cast(dest); + while (total_written < length) { + if (!input->GetDirectBufferPointer(&data, &size_to_write)) { + return false; + } + + if (total_written + size_to_write > length) { + size_to_write = length - total_written; + } + + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, place), + reinterpret_cast(p), cpu, data, size_to_write); + p += size_to_write; + total_written += size_to_write; + input->Skip(size_to_write); + } + xpu_dev_ctx.Wait(); +#else + PADDLE_ENFORCE_NOT_NULL( + nullptr, + platform::errors::Unimplemented( + "Not supported XPU, please compile with option WITH_XPU=ON.")); #endif return true; } diff --git a/paddle/fluid/operators/distributed_ops/allreduce_op.h b/paddle/fluid/operators/distributed_ops/allreduce_op.h index c77113ad405e991db20c035371550a1eccaa1971..e486faa575847311c2d668ada5519fe9c047f053 100644 --- a/paddle/fluid/operators/distributed_ops/allreduce_op.h +++ b/paddle/fluid/operators/distributed_ops/allreduce_op.h @@ -76,7 +76,8 @@ class AllReduceOpKernel : public framework::OpKernel { PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); } #else - PADDLE_THROW("PaddlePaddle should compile with GPU."); + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with GPU.")); #endif } }; diff --git a/paddle/fluid/operators/distributed_ops/broadcast_op.cc b/paddle/fluid/operators/distributed_ops/broadcast_op.cc index 
535cf7014419292863a684eaaebbf15d367671ab..61e27887b68c75f3d5c5cc48b4f1fac11d5f4eae 100644 --- a/paddle/fluid/operators/distributed_ops/broadcast_op.cc +++ b/paddle/fluid/operators/distributed_ops/broadcast_op.cc @@ -58,7 +58,8 @@ template class BroadcastOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_THROW("Broadcast op can run on gpu place only for now."); + PADDLE_THROW(platform::errors::PreconditionNotMet( + "Broadcast op can run on gpu place only for now.")); } }; diff --git a/paddle/fluid/operators/distributed_ops/broadcast_op.cu.cc b/paddle/fluid/operators/distributed_ops/broadcast_op.cu.cc index f067840e539ac046b53be7d3bc83c783f7c8cf9c..337422f0bd643f131d5044e802851a09d6171c13 100644 --- a/paddle/fluid/operators/distributed_ops/broadcast_op.cu.cc +++ b/paddle/fluid/operators/distributed_ops/broadcast_op.cu.cc @@ -68,10 +68,11 @@ class NCCLBroadcastOpKernel : public framework::OpKernel { << " From " << root_dev_id << " to " << dev_id; if (ctx.Attr("sync_mode")) { - PADDLE_ENFORCE(cudaStreamSynchronize(stream)); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); } #else - PADDLE_THROW("PaddlePaddle should compile with GPU."); + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with GPU.")); #endif } }; diff --git a/paddle/fluid/operators/distributed_ops/fake_init_op.cc b/paddle/fluid/operators/distributed_ops/fake_init_op.cc index 1da164175e1daf8e872964d8fad21c4f3dd614e7..cb27dc75eb2faf15746e596265d1b1e4b3717e52 100644 --- a/paddle/fluid/operators/distributed_ops/fake_init_op.cc +++ b/paddle/fluid/operators/distributed_ops/fake_init_op.cc @@ -43,9 +43,9 @@ class FakeInitOp : public framework::OperatorBase { tensor = out_var.GetMutable()->mutable_value(); tensor->Resize(framework::make_ddim(Attr>("shape"))); } else { - PADDLE_THROW( + PADDLE_THROW(platform::errors::InvalidArgument( "fake init op's output only" - "supports SelectedRows and 
LoDTensor"); + "supports SelectedRows and LoDTensor")); } } }; diff --git a/paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc b/paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc index b064265917b2a36b2261c6c43d355f9891aa9811..c9f9daf3b3c0442e379cd7a22fcf48dbe3acbb5d 100644 --- a/paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc +++ b/paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc @@ -48,7 +48,9 @@ class FetchBarrierOp : public framework::OperatorBase { } for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, "internal error in RPCClient"); + PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, + platform::errors::Unavailable( + "Internal error occurred in RPCClient.")); } } }; diff --git a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc index 5e1e408eb2c28239fded0d0cf037c94783828b50..43de8488a0e4ac24f7e261671488cd88563a805d 100644 --- a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc +++ b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc @@ -134,7 +134,10 @@ void ListenAndServOp::RunSyncLoop( auto optimize_blocks = Attr>(kOptimizeBlocks); PADDLE_ENFORCE_GE(num_blocks, 2, - "server program should have at least 2 blocks"); + platform::errors::PreconditionNotMet( + "Invalid number of blocks in server program. Expected " + "equal or greater than 2. 
Recieved %zu", + num_blocks)); // Prepare all the server block std::vector optimize_blocks_list; @@ -218,7 +221,8 @@ void ListenAndServOp::ResetReceivedVars(framework::Scope *recv_scope, VLOG(3) << "reset sparse var: " << varname; var->GetMutable()->mutable_rows()->clear(); } else { - PADDLE_THROW("The type of sparse var should be SelectedRows"); + PADDLE_THROW(platform::errors::PreconditionNotMet( + "The type of sparse var should be SelectedRows")); } } if (UNLIKELY(reset_all)) { @@ -235,7 +239,8 @@ void ListenAndServOp::ResetReceivedVars(framework::Scope *recv_scope, math::set_constant(*dev_ctx, var->GetMutable(), static_cast(0)); } else { - PADDLE_THROW("The type of dense var should be in [LoDTensor, Tensor]"); + PADDLE_THROW(platform::errors::PreconditionNotMet( + "The type of dense var should be in [LoDTensor, Tensor]")); } } } @@ -254,8 +259,15 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, std::vector pieces; split(grad_and_id, ':', &pieces); VLOG(3) << "after split, key = " << pieces[0] << ", id=" << pieces[1]; - PADDLE_ENFORCE_EQ(pieces.size(), 2); - PADDLE_ENFORCE_EQ(out_map->count(pieces[0]), 0); + PADDLE_ENFORCE_EQ(pieces.size(), 2, + platform::errors::PreconditionNotMet( + "Invalid format of grad_and_id argument. " + "Expected \"grad:block_id\". Recieved %s", + grad_and_id.c_str())); + PADDLE_ENFORCE_EQ(out_map->count(pieces[0]), 0, + platform::errors::AlreadyExists( + "The gradient name %s has already existed in out_map", + pieces[0].c_str())); int block_id = std::stoi(pieces[1]); (*out_map)[pieces[0]] = block_id; @@ -267,7 +279,10 @@ void ListenAndServOp::RunAsyncLoop(framework::Executor *executor, size_t num_blocks = program->Size(); PADDLE_ENFORCE_GE(num_blocks, 2, - "server program should have at least 2 blocks"); + platform::errors::PreconditionNotMet( + "Invalid number of blocks in server program. Expected " + "equal or greater than 2. 
Recieved %zu", + num_blocks)); std::vector block_list; for (size_t blkid = 1; blkid < num_blocks; ++blkid) { block_list.push_back(blkid); @@ -342,9 +357,9 @@ void ListenAndServOp::CacheVarsType(const std::vector &varnames, var->IsType()) { dense_vars_.push_back(varname); } else { - PADDLE_THROW( + PADDLE_THROW(platform::errors::PreconditionNotMet( "The type of received var should be in [SelectedRows, LoDTensor, " - "Tensor]."); + "Tensor].")); } } } @@ -450,7 +465,12 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, split(prefetch_var_name_and_id, ':', &pieces); VLOG(3) << "after split, prefetch_var = " << pieces[0] << ", id=" << pieces[1]; - PADDLE_ENFORCE_EQ(pieces.size(), 2); + PADDLE_ENFORCE_EQ( + pieces.size(), 2, + platform::errors::PreconditionNotMet( + "Invalid format of prefetch_var_name_and_id argument. " + "Expected \"xxx:xxx\". Recieved %s", + prefetch_var_name_and_id.c_str())); int block_id = std::stoi(pieces[1]); prefetch_block_id_list.push_back(block_id); @@ -476,7 +496,12 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, sparse_grad_name_to_param_name_str) { std::vector pieces; split(sparse_grad_name_and_param_name, ':', &pieces); - PADDLE_ENFORCE_EQ(pieces.size(), 2); + PADDLE_ENFORCE_EQ( + pieces.size(), 2, + platform::errors::PreconditionNotMet( + "Invalid format of sparse_grad_name_and_param_name argument. " + "Expected \"xxx:xxx\". 
Recieved %s", + sparse_grad_name_and_param_name.c_str())); VLOG(3) << "after split, sparse_grad_name = " << pieces[0] << ", param_name = " << pieces[1]; sparse_grad_name_to_param_name[pieces[0]] = pieces[1]; diff --git a/paddle/fluid/operators/distributed_ops/send_recv_util.h b/paddle/fluid/operators/distributed_ops/send_recv_util.h index c05a1ff1da8803c1ef3161d0e9d8604f9f1e5f3b..7dc0596ac31e2506ae02de11b33bd0532f02cc7a 100644 --- a/paddle/fluid/operators/distributed_ops/send_recv_util.h +++ b/paddle/fluid/operators/distributed_ops/send_recv_util.h @@ -34,16 +34,16 @@ inline bool NeedSend(const framework::Scope& scope, std::string::npos) return false; auto* var = scope.FindVar(varname); - PADDLE_ENFORCE_NOT_NULL(var, "Can not find variable '%s' in the send side.", - varname); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::NotFound( + "Can not find variable '%s' in the send side.", varname)); if (var->IsType()) { return var->Get().IsInitialized(); } else if (var->IsType()) { return var->Get().rows().size() > 0UL; } else { - PADDLE_THROW( - "Variable type in send side should be in " - "[LodTensor, SelectedRows]"); + PADDLE_THROW(platform::errors::Unimplemented( + "Variable type in send side should be LodTensor or SelectedRows.")); } return false; } diff --git a/paddle/fluid/operators/dropout_op.cu b/paddle/fluid/operators/dropout_op.cu index 4d5e4c4f600314d307125f9b2031026b6aa94f10..49ad67bbca353acc4a79c9e8912d7ae5a70c0021 100644 --- a/paddle/fluid/operators/dropout_op.cu +++ b/paddle/fluid/operators/dropout_op.cu @@ -96,6 +96,42 @@ __global__ void RandomGeneratorWithSeed(const size_t n, const int* seed, } } +template +__global__ void RandomGeneratorWithGenerator(const size_t n, uint64_t seed, + const float dropout_prob, + const T* src, MaskType* mask_data, + T* dst, bool is_upscale_in_train, + uint64_t increment) { + curandStatePhilox4_32_10_t state; + int idx = blockDim.x * blockIdx.x + threadIdx.x; + int step_size = 0; + + MaskType mask; + T dest; + for (; 
idx < n; idx += blockDim.x * gridDim.x) { + T s = src[idx]; + if (step_size == 0) { + curand_init(seed, idx, increment, &state); + step_size = blockDim.x * gridDim.x; + } else { + curand_init(seed, idx, increment, &state); + } + if (curand_uniform(&state) < dropout_prob) { + mask = 0; + dest = 0; + } else { + mask = 1; + if (is_upscale_in_train) { + dest = s / static_cast(1.0f - dropout_prob); + } else { + dest = s; + } + } + mask_data[idx] = mask; + dst[idx] = dest; + } +} + // It seems that Eigen::Tensor::setRandom in GPU will SEGFAULT. // Use std::random and thrust::random(thrust is a std library in CUDA) to // implement uniform random. @@ -150,6 +186,17 @@ class GPUDropoutKernel : public framework::OpKernel { context.Attr("fix_seed") ? context.Attr("seed") : rnd(); } + int device_id = BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()) + .GetDeviceId(); + auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); + if (gen_cuda->GetIsInitPy() && (!context.Attr("fix_seed"))) { + auto seed_offset = gen_cuda->IncrementOffset(1); + RandomGeneratorWithGenerator<<>>( + size, seed_offset.first, dropout_prob, x_data, mask_data, y_data, + upscale_in_train, seed_offset.second); + return; + } + RandomGenerator<<>>( size, seed_data, dropout_prob, x_data, mask_data, y_data, upscale_in_train); diff --git a/paddle/fluid/operators/dropout_op.h b/paddle/fluid/operators/dropout_op.h index 9d9eb4a82a075f27764a73d0e976dbf3f7181cb1..161c4282ec277a19c19921267eaa4cb46b859900 100644 --- a/paddle/fluid/operators/dropout_op.h +++ b/paddle/fluid/operators/dropout_op.h @@ -29,6 +29,10 @@ template using EigenMatrix = framework::EigenMatrix; +template +using EigenVector = framework::EigenVector; + template class CPUDropoutKernel : public framework::OpKernel { public: @@ -116,9 +120,9 @@ class DropoutGradKernel : public framework::OpKernel { auto* mask = context.Input("Mask"); grad_x->mutable_data(context.GetPlace()); - auto M = EigenMatrix::Reshape(*mask, 1); - auto dX = 
EigenMatrix::Reshape(*grad_x, 1); - auto dY = EigenMatrix::Reshape(*grad_y, 1); + auto M = EigenVector::Flatten(*mask); + auto dX = EigenVector::Flatten(*grad_x); + auto dY = EigenVector::Flatten(*grad_y); auto& place = *context.template device_context().eigen_device(); diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cc b/paddle/fluid/operators/elementwise/elementwise_add_op.cc index 534a19bd94a231f0522dd15d2510917be8c71a4b..97624944ca109f27322f151f0742c72447fd5c39 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cc @@ -13,8 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" + #include #include + +#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h" namespace paddle { @@ -129,3 +132,18 @@ REGISTER_OP_CPU_KERNEL( int>, ops::ElementwiseAddDoubleGradKernel); + +// A specialization elementwise_add operator, used in gradient accumulation with +// inplace addto. 
+REGISTER_OPERATOR( + grad_add, paddle::operators::ElementwiseOp, + paddle::operators::ElementwiseAddOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_CPU_KERNEL( + grad_add, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cu b/paddle/fluid/operators/elementwise/elementwise_add_op.cu index 71019872802eaca964373fd58a7ccc6445d9c489..a4cbd14388b4dd5ceab6417db79fafeeff41ccb7 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cu @@ -111,3 +111,10 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseAddDoubleGradKernel, ops::ElementwiseAddDoubleGradKernel); + +REGISTER_OP_CUDA_KERNEL( + grad_add, ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc index 457d9e79d7da171ef526d5cab0e59b021cb64f98..5a398fa50febe2efffd588ce8f3612f1f9cec0b6 100644 --- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc @@ -49,8 +49,6 @@ REGISTER_OP_WITHOUT_GRADIENT(elementwise_floordiv, ops::ElementwiseOp, REGISTER_OP_CPU_KERNEL( elementwise_floordiv, - ops::ElementwiseFloorDivKernel, - ops::ElementwiseFloorDivKernel, ops::ElementwiseFloorDivKernel, ops::ElementwiseFloorDivKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu index f63d6f037632c1a6a05726b933b2258adc113ee3..60846d1e8fee1c7f68ac101f18355750c2c15a4d 100644 --- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu +++ 
b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu @@ -19,7 +19,5 @@ namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( elementwise_floordiv, - ops::ElementwiseFloorDivKernel, - ops::ElementwiseFloorDivKernel, ops::ElementwiseFloorDivKernel, ops::ElementwiseFloorDivKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h index 8afe2133c0488bbe04ec4803aac5dce6573f634d..721c23e38307fddb63d41baddc395afa94f40336 100644 --- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h @@ -14,7 +14,6 @@ limitations under the License. */ #pragma once -#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op.h b/paddle/fluid/operators/elementwise/elementwise_mod_op.h index 47bd6af0b95ace2b9b753e38cfc5f191bc1bb942..87e940e2ed6319c4f2957cd846735adb210cd23d 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mod_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mod_op.h @@ -31,6 +31,15 @@ struct ModFunctor { } }; +template +struct InverseModFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { + T res = b % a; + if ((res != 0) && ((res < 0) != (a < 0))) res += a; + return res; + } +}; + template struct ModFunctorFP { inline HOSTDEVICE T operator()(T a, T b) const { @@ -40,13 +49,29 @@ struct ModFunctorFP { } }; +template +struct InverseModFunctorFP { + inline HOSTDEVICE T operator()(T a, T b) const { + T res = fmod(b, a); + if ((res != 0) && ((a < 0) != (res < 0))) res += a; + return res; + } +}; + template void elementwise_mod(const framework::ExecutionContext &ctx, const framework::Tensor *x, const framework::Tensor *y, framework::Tensor *z) { int axis = ctx.Attr("axis"); - 
ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, - ModFunctor(), z); + auto x_dims = x->dims(); + auto y_dims = y->dims(); + if (x_dims.size() >= y_dims.size()) { + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, + ModFunctor(), z); + } else { + ElementwiseComputeEx, DeviceContext, T>( + ctx, x, y, axis, InverseModFunctor(), z); + } } template @@ -54,8 +79,15 @@ void elementwise_mod_fp(const framework::ExecutionContext &ctx, const framework::Tensor *x, const framework::Tensor *y, framework::Tensor *z) { int axis = ctx.Attr("axis"); - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, - ModFunctorFP(), z); + auto x_dims = x->dims(); + auto y_dims = y->dims(); + if (x_dims.size() >= y_dims.size()) { + ElementwiseComputeEx, DeviceContext, T>( + ctx, x, y, axis, ModFunctorFP(), z); + } else { + ElementwiseComputeEx, DeviceContext, T>( + ctx, x, y, axis, InverseModFunctorFP(), z); + } } template diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index 718321b441b2025afea9d913855b26a82cda8075..e4d3ea6d7291eff8911d8419cda96f2d2738b9a1 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -33,22 +33,7 @@ class ElementwiseMulOp : public ElementwiseOp { auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); #ifdef PADDLE_WITH_MKLDNN - using mkldnn::memory; - auto CanMKLDNNElementwiseMulBeUsed = [&]() { - auto x_dims = ctx.Input("X")->dims(); - auto y_dims = ctx.Input("Y")->dims(); - int rankdiff = x_dims.size() - y_dims.size(); - // TODO(jczaja): Remove this when oneDNN performance for scalar - // broadcasting - // is improved (Ernie large situation) - if (rankdiff != 0 && y_dims.size() == 1 && y_dims[0] == 1) { - return false; - } - - return true; - }; - - if (platform::CanMKLDNNBeUsed(ctx) && CanMKLDNNElementwiseMulBeUsed()) { + if (platform::CanMKLDNNBeUsed(ctx)) { return 
framework::OpKernelType(input_data_type, ctx.GetPlace(), framework::DataLayout::kMKLDNN, framework::LibraryType::kMKLDNN); diff --git a/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc b/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc index b8163169734bd2c64412bab7286aca9cc5e1b830..6ec8f2c2355ee098aed4a6b92410bcc60bca4736 100644 --- a/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc +++ b/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc @@ -33,9 +33,12 @@ namespace operators { static void Memcpy(void *dst, const void *src, size_t n, bool copy_to_gpu) { if (copy_to_gpu) { #ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE(cudaMemcpy(dst, src, n, cudaMemcpyHostToDevice)); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaMemcpy(dst, src, n, cudaMemcpyHostToDevice)); #else - PADDLE_THROW("Not compiled with cuda"); + PADDLE_THROW( + platform::errors::InvalidArgument("Check your paddle version, current " + "version is not compiled with cuda")); #endif } else { std::memcpy(dst, src, n); @@ -88,11 +91,22 @@ bool TestMain(const platform::Place &place, const framework::DDim &dims, framework::LoDTensor cpu_out; auto &out_tensor = scope.FindVar(out_name)->Get(); - PADDLE_ENFORCE(scope.kids().empty()); + PADDLE_ENFORCE_EQ(scope.kids().empty(), true, + platform::errors::InvalidArgument( + "The scope can not have the child scopes," + "please check your code.")); if (inplace) { - PADDLE_ENFORCE_EQ(&out_tensor, x); + PADDLE_ENFORCE_EQ( + &out_tensor, x, + platform::errors::InvalidArgument( + "The output tensor should be same as input x in inplace mode," + " but now is not same.")); } else { - PADDLE_ENFORCE_EQ(&out_tensor, z); + PADDLE_ENFORCE_EQ( + &out_tensor, z, + platform::errors::InvalidArgument( + "The output tensor should be same as output z in normal mode," + " but now is not same.")); } if (is_gpu_place) { diff --git a/paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h 
b/paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h index 89849ef92cd19ff5f83f2b57c65c78610d7c2c69..54e7c7d1b6aa9776f5637359b334e6304d7906ce 100644 --- a/paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h +++ b/paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h @@ -92,7 +92,9 @@ class TestElementwiseOpGradGrad { auto dst_place = BOOST_GET_CONST(platform::CUDAPlace, place_); memory::Copy(dst_place, dst, src_place, src, bytes, nullptr); #else - PADDLE_THROW("Not compiled with cuda"); + PADDLE_THROW(platform::errors::InvalidArgument( + "Check your paddle version, current version is not compiled with " + "cuda")); #endif } } @@ -107,7 +109,10 @@ class TestElementwiseOpGradGrad { op->Run(scope_, place_); platform::DeviceContextPool::Instance().Get(place_)->Wait(); framework::LoDTensor cpu_out; - PADDLE_ENFORCE_EQ(scope_.kids().empty(), true, "scope has child scopes"); + PADDLE_ENFORCE_EQ(scope_.kids().empty(), true, + platform::errors::InvalidArgument( + "The scope can not have the child scopes," + "please check your code.")); // get outputs from scope and compare them with expected_outs bool all_equal = true; diff --git a/paddle/fluid/operators/empty_op.cc b/paddle/fluid/operators/empty_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..3d28ca90a5a15fd53a57034a4722a21842dc4b1c --- /dev/null +++ b/paddle/fluid/operators/empty_op.cc @@ -0,0 +1,139 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/empty_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class EmptyOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("ShapeTensor", + "(Tensor), optional). The shape of the output." + "It has a higher priority than Attr(shape).") + .AsDispensable(); + AddInput("ShapeTensorList", + "(vector>, optional). The shape of the output. " + "It has a higher priority than Attr(shape)." + "The shape of the element in vector must be [1].") + .AsDuplicable() + .AsDispensable(); + AddAttr>("shape", + "(vector) The shape of the output") + .SetDefault({}); + AddAttr("dtype", "The data type of output tensor, Default is float") + .SetDefault(framework::proto::VarType::FP32); + AddOutput("Out", "(Tensor) The output tensor."); + AddComment(R"DOC(empty operator +Returns a tensor filled with uninitialized data. The shape of the tensor is +defined by the variable argument shape. + + +The type of the tensor is specify by `dtype`. 
+)DOC"); + } +}; + +class EmptyOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* context) const override { + OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", "empty"); + + if (context->HasInput("ShapeTensor")) { + auto shape_dims = context->GetInputDim("ShapeTensor"); + int num_ele = 1; + for (int i = 0; i < shape_dims.size(); ++i) { + num_ele *= shape_dims[i]; + } + auto vec_dims = std::vector(num_ele, -1); + context->SetOutputDim("Out", framework::make_ddim(vec_dims)); + } else if (context->HasInputs("ShapeTensorList")) { + std::vector out_dims; + auto dims_list = context->GetInputsDim("ShapeTensorList"); + for (size_t i = 0; i < dims_list.size(); ++i) { + auto& dims = dims_list[i]; + PADDLE_ENFORCE_EQ(dims, framework::make_ddim({1}), + platform::errors::InvalidArgument( + "The shape of Tensor in list must be [1]. " + "But received the shape is [%s]", + dims)); + + out_dims.push_back(-1); + } + + context->SetOutputDim("Out", framework::make_ddim(out_dims)); + } else { + auto& shape = context->Attrs().Get>("shape"); + for (size_t i = 0; i < shape.size(); ++i) { + PADDLE_ENFORCE_GE( + shape[i], 0, + platform::errors::InvalidArgument( + "Each value of attribute 'shape' is expected to be no less " + "than 0. 
But recieved: shape[%u] = %d; shape = [%s].", + i, shape[i], framework::make_ddim(shape))); + } + context->SetOutputDim("Out", framework::make_ddim(shape)); + } + } + + protected: + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const framework::Tensor& tensor, + const framework::OpKernelType& expected_kernel_type) const override { + if (var_name == "ShapeTensor" || var_name == "ShapeTensorList") { + return expected_kernel_type; + } else { + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); + } + } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& context) const override { + return framework::OpKernelType( + framework::proto::VarType::Type(context.Attr("dtype")), + context.GetPlace()); + } +}; + +class EmptyOpVarTypeInference : public framework::VarTypeInference { + public: + void operator()(framework::InferVarTypeContext* context) const override { + auto data_type = static_cast( + BOOST_GET_CONST(int, context->GetAttr("dtype"))); + context->SetOutputDataType("Out", data_type); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OPERATOR( + empty, ops::EmptyOp, ops::EmptyOpMaker, ops::EmptyOpVarTypeInference, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_CPU_KERNEL(empty, ops::EmptyKernel, + ops::EmptyKernel, + ops::EmptyKernel, + ops::EmptyKernel, + ops::EmptyKernel, + ops::EmptyKernel); diff --git a/paddle/fluid/operators/empty_op.cu.cc b/paddle/fluid/operators/empty_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..22799e507aeff7940274f729b174f50bfd9132a5 --- /dev/null +++ b/paddle/fluid/operators/empty_op.cu.cc @@ -0,0 +1,26 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/empty_op.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL( + empty, ops::EmptyKernel, + ops::EmptyKernel, + ops::EmptyKernel, + ops::EmptyKernel, + ops::EmptyKernel, + ops::EmptyKernel); diff --git a/paddle/fluid/operators/empty_op.h b/paddle/fluid/operators/empty_op.h new file mode 100644 index 0000000000000000000000000000000000000000..9c91377683870917db28f6f6a5f3f3b1b4a1962f --- /dev/null +++ b/paddle/fluid/operators/empty_op.h @@ -0,0 +1,45 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/utils.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class EmptyKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto dtype = static_cast( + context.Attr("dtype")); + + Tensor *out_tensor = context.Output("Out"); + + auto shape = GetShape(context); + out_tensor->Resize(shape); + + out_tensor->mutable_data(context.GetPlace(), dtype); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/expand_op.cc b/paddle/fluid/operators/expand_op.cc index 3c898ac29f0cab572d199eaafe951751682d4834..83e205367a7af62c52825297d92571c306be2c42 100644 --- a/paddle/fluid/operators/expand_op.cc +++ b/paddle/fluid/operators/expand_op.cc @@ -228,6 +228,26 @@ class ExpandGradOpMaker : public framework::SingleGradOpMaker { } }; +template +class ExpandDoubleGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetInput("X", this->OutputGrad(framework::GradVarName("X"))); + op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out"))); + if (this->HasInput("expand_times_tensor")) { + op->SetInput("expand_times_tensor", this->Input("expand_times_tensor")); + } + if (this->HasInput("ExpandTimes")) { + op->SetInput("ExpandTimes", this->Input("ExpandTimes")); + } + op->SetAttrMap(this->Attrs()); + op->SetType("expand"); + } +}; + DECLARE_NO_NEED_BUFFER_VARS_INFERER(ExpandGradNoNeedBufVarsInferer, "X"); } // namespace operators @@ -238,6 +258,8 @@ REGISTER_OPERATOR(expand, ops::ExpandOp, ops::ExpandOpMaker, ops::ExpandGradOpMaker, ops::ExpandGradOpMaker); REGISTER_OPERATOR(expand_grad, ops::ExpandGradOp, + ops::ExpandDoubleGradOpMaker, + ops::ExpandDoubleGradOpMaker, 
ops::ExpandGradNoNeedBufVarsInferer); REGISTER_OP_CPU_KERNEL( expand, ops::ExpandKernel, diff --git a/paddle/fluid/operators/expand_v2_op.cc b/paddle/fluid/operators/expand_v2_op.cc index 359d512c341529579a56dbe840e5eef0aa3062a5..a1ee47b7f93910a481c6e0793c306e2b190c774d 100644 --- a/paddle/fluid/operators/expand_v2_op.cc +++ b/paddle/fluid/operators/expand_v2_op.cc @@ -230,6 +230,26 @@ class ExpandV2GradOpMaker : public framework::SingleGradOpMaker { } }; +template +class ExpandV2DoubleGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("expand_v2"); + op->SetInput("X", this->OutputGrad(framework::GradVarName("X"))); + op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out"))); + if (this->HasInput("expand_shapes_tensor")) { + op->SetInput("expand_shapes_tensor", this->Input("expand_shapes_tensor")); + } + if (this->HasInput("Shape")) { + op->SetInput("Shape", this->Input("Shape")); + } + op->SetAttrMap(this->Attrs()); + } +}; + DECLARE_NO_NEED_BUFFER_VARS_INFERER(ExpandV2GradNoNeedBufVarsInferer, "X"); } // namespace operators @@ -240,6 +260,8 @@ REGISTER_OPERATOR(expand_v2, ops::ExpandV2Op, ops::ExpandV2OpMaker, ops::ExpandV2GradOpMaker, ops::ExpandV2GradOpMaker); REGISTER_OPERATOR(expand_v2_grad, ops::ExpandV2GradOp, + ops::ExpandV2DoubleGradOpMaker, + ops::ExpandV2DoubleGradOpMaker, ops::ExpandV2GradNoNeedBufVarsInferer); REGISTER_OP_CPU_KERNEL( expand_v2, ops::ExpandV2Kernel, diff --git a/paddle/fluid/operators/fake_quantize_op.cc b/paddle/fluid/operators/fake_quantize_op.cc index 04ac4a35208a54361a4f434e68095e9519ee12e9..e9b4c7dacf8b4493fcfa0504ecf7421bd50de90c 100644 --- a/paddle/fluid/operators/fake_quantize_op.cc +++ b/paddle/fluid/operators/fake_quantize_op.cc @@ -174,7 +174,64 @@ struct ChannelClipAndFakeQuantFunctor { template struct ChannelClipAndFakeQuantFunctor; +template +struct 
ChannelClipFakeQuantDequantFunctor { + void operator()(const platform::CPUDeviceContext& ctx, + const framework::Tensor& in, const framework::Tensor& scale, + const int bin_cnt, const int quant_axis, + framework::Tensor* out) { + PADDLE_ENFORCE_EQ( + quant_axis == 0 || quant_axis == 1, true, + platform::errors::InvalidArgument("'quant_axis' should be 0 or 1, but " + "the received is %d", + quant_axis)); + auto* scale_data = scale.data(); + auto* in_data = in.data(); + auto* out_data = out->mutable_data(ctx.GetPlace()); + auto in_dims = in.dims(); + const int64_t channel = in_dims[quant_axis]; + platform::Transform trans; + if (quant_axis == 0) { + const int64_t channel_size = in.numel() / channel; + for (int i = 0; i < channel; i++) { + T s = scale_data[i]; + auto* start = in_data + i * channel_size; + auto* end = in_data + (i + 1) * channel_size; + trans(ctx, start, end, out_data + i * channel_size, + ClipFunctor(-s, s)); + } + for (int i = 0; i < channel; i++) { + T s = scale_data[i]; + T inv_s = inverse(s); + framework::Tensor one_channel_out = out->Slice(i, i + 1); + auto out_e = framework::EigenVector::Flatten(one_channel_out); + out_e.device(*ctx.eigen_device()) = + (bin_cnt * inv_s * out_e).round() * s / static_cast(bin_cnt); + } + } else if (quant_axis == 1) { + const int64_t step_i = in.numel() / in_dims[0]; + const int64_t step_j = in.numel() / (in_dims[0] * in_dims[1]); + for (int i = 0; i < in_dims[0]; i++) { + for (int j = 0; j < in_dims[1]; j++) { + T s = scale_data[j]; + T inv_s = inverse(s); + auto* start = in_data + i * step_i + j * step_j; + auto* end = in_data + i * step_i + (j + 1) * step_j; + auto* cur_out_data = out_data + i * step_i + j * step_j; + trans(ctx, start, end, cur_out_data, ClipFunctor(-s, s)); + for (int k = 0; k < step_j; k++) { + cur_out_data[k] = std::round(bin_cnt * inv_s * cur_out_data[k]) * + s / static_cast(bin_cnt); + } + } + } + } + } +}; + +template struct ChannelClipFakeQuantDequantFunctor; template struct 
FindRangeAbsMaxFunctor { void operator()(const platform::CPUDeviceContext& ctx, @@ -360,6 +417,75 @@ $$0 \leq c \lt \ the\ channel\ number\ of\ X$$ } }; +class FakeChannelWiseQuantizeDequantizeAbsMaxOp + : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", + "FakeChannelWiseQuantizeDequantizeAbsMax"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", + "FakeChannelWiseQuantizeDequantizeAbsMax"); + OP_INOUT_CHECK(ctx->HasOutput("OutScale"), "Output", "OutScale", + "FakeChannelWiseQuantizeDequantizeAbsMax"); + int quant_axis = ctx->Attrs().Get("quant_axis"); + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + ctx->SetOutputDim("OutScale", {ctx->GetInputDim("X")[quant_axis]}); + ctx->ShareLoD("X", /*->*/ "Out"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } +}; + +class FakeChannelWiseQuantizeDequantizeAbsMaxOpMaker + : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor) Input is float data type."); + AddOutput("Out", + "(Tensor) Output of quantized and dequantized low level tensor, " + "saved as float data type."); + AddOutput("OutScale", "(Tensor) Current channel wise scale"); + AddAttr("quant_axis", + "(int, default 0) The axis for quantization. 
" + "For conv2d, depthwise_conv2d, conv2d_transpose " + "and mul, the quant_axis is equal to the cout axis.") + .SetDefault(0) + .AddCustomChecker([](const int& quant_axis) { + PADDLE_ENFORCE_EQ(quant_axis == 0 || quant_axis == 1, true, + platform::errors::InvalidArgument( + "'quant_axis' should be 0 or 1, but " + "the received is %d", + quant_axis)); + }); + AddAttr("bit_length", "(int, default 8)") + .SetDefault(8) + .AddCustomChecker([](const int& bit_length) { + PADDLE_ENFORCE_EQ(bit_length >= 1 && bit_length <= 16, true, + platform::errors::InvalidArgument( + "'bit_length' should be between 1 and 16, but " + "the received is %d", + bit_length)); + }); + AddComment(R"DOC( +The scale of FakeChannelWiseQuantize operator is a vector. +In detail, each channel of the input X has a scale value. + +$$scale_c = max(abs(X_c))$$ +$$range = 2^{bit\_length - 1} - 1$$ +$$Out_c = round(\frac{X_c * range} {scale_c}) * \frac{scale_c} {range}$$ +In above three formulas, the range value of c is as follow: +$$0 \leq c \lt \ the\ channel\ number\ of\ X$$ +)DOC"); + } +}; + class FakeQuantizeRangeAbsMaxOp : public framework::OperatorWithKernel { public: FakeQuantizeRangeAbsMaxOp(const std::string& type, @@ -666,3 +792,12 @@ REGISTER_OP_CPU_KERNEL(moving_average_abs_max_scale, REGISTER_OPERATOR(fake_quantize_dequantize_grad, ops::FakeQuantDequantGradOp); REGISTER_OP_CPU_KERNEL(fake_quantize_dequantize_grad, ops::FakeQuantDequantGradKernel); + +REGISTER_OPERATOR(fake_channel_wise_quantize_dequantize_abs_max, + ops::FakeChannelWiseQuantizeDequantizeAbsMaxOp, + ops::FakeChannelWiseQuantizeDequantizeAbsMaxOpMaker, + ops::FakeQuantDequantGradMaker, + ops::FakeQuantDequantGradMaker); +REGISTER_OP_CPU_KERNEL( + fake_channel_wise_quantize_dequantize_abs_max, + ops::FakeChannelWiseQuantizeDequantizeAbsMaxKernel); diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu index 
6ff3c7ec632f236fe4ae6c6504537df3b8a46b7a..8bc14dde8636822354bbaeaf659880ee754dc5b9 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu +++ b/paddle/fluid/operators/fake_quantize_op.cu @@ -417,8 +417,90 @@ struct FindMovingAverageAbsMaxFunctor { } }; -template struct FindMovingAverageAbsMaxFunctor; +// ChannelClipAndQuantDequantKernel for quant_axis is 0 +template +__global__ void ChannelClipAndQuantDequantKernelQuantAxis0( + const T* in, const T* scale, const int bin_cnt, const int n, const int c, + T* out) { + int tid = threadIdx.x; + + int channel_size = n / c; + const T* in_c = in + blockIdx.x * channel_size; + T* out_c = out + blockIdx.x * channel_size; + + T s = scale[blockIdx.x]; + T inv_s = inverse(s); + + for (int i = tid; i < channel_size; i += blockDim.x) { + T x = in_c[i]; + T v = x > s ? s : x; + v = v < -s ? -s : v; + v = bin_cnt * inv_s * v; + out_c[i] = round(v) * s / bin_cnt; + } +} + +// ChannelClipAndQuantDequantKernel for quant_axis is 1 +template +__global__ void ChannelClipAndQuantDequantKernelQuantAxis1( + const T* in, const T* scale, const int bin_cnt, const int n, const int cin, + const int cout, T* out) { + T s = scale[blockIdx.x % cout]; + T inv_s = inverse(s); + + int wh_size = n / (cin * cout); + const T* in_c = in + blockIdx.x * wh_size; + T* out_c = out + blockIdx.x * wh_size; + + for (int i = threadIdx.x; i < wh_size; i += blockDim.x) { + T x = in_c[i]; + T v = x > s ? s : x; + v = v < -s ? 
-s : v; + v = bin_cnt * inv_s * v; + out_c[i] = round(v) * s / bin_cnt; + } +} + +template +struct ChannelClipFakeQuantDequantFunctor { + void operator()(const platform::CUDADeviceContext& ctx, + const framework::Tensor& in, const framework::Tensor& scale, + const int bin_cnt, const int quant_axis, + framework::Tensor* out) { + // At present, channelwise quantization supports conv2d, depthwise_conv2d + // conv2d_transpose and mul + PADDLE_ENFORCE_EQ( + quant_axis == 0 || quant_axis == 1, true, + platform::errors::InvalidArgument("'quant_axis' should be 0 or 1, but " + "the received is %d", + quant_axis)); + + int num = in.numel(); + auto in_dims = in.dims(); + + const T* in_data = in.data(); + const T* scale_data = scale.data(); + T* out_data = out->mutable_data(ctx.GetPlace()); + + if (quant_axis == 0) { + int grid = in_dims[0]; + int block = 1024; + ChannelClipAndQuantDequantKernelQuantAxis0< + T><<>>(in_data, scale_data, bin_cnt, + num, in_dims[0], out_data); + } else if (quant_axis == 1) { + int grid = in_dims[0] * in_dims[1]; + int block = 1024; + + ChannelClipAndQuantDequantKernelQuantAxis1< + T><<>>( + in_data, scale_data, bin_cnt, num, in_dims[0], in_dims[1], out_data); + } + } +}; + +template struct ChannelClipFakeQuantDequantFunctor; } // namespace operators } // namespace paddle @@ -443,3 +525,6 @@ REGISTER_OP_CUDA_KERNEL( ops::FakeQuantizeDequantizeMovingAverageAbsMaxKernel); REGISTER_OP_CUDA_KERNEL(fake_quantize_dequantize_grad, ops::FakeQuantDequantGradKernel); +REGISTER_OP_CUDA_KERNEL( + fake_channel_wise_quantize_dequantize_abs_max, + ops::FakeChannelWiseQuantizeDequantizeAbsMaxKernel); diff --git a/paddle/fluid/operators/fake_quantize_op.h b/paddle/fluid/operators/fake_quantize_op.h index 5c6e0b1f6e26d84462a18da910b412f03b93285d..2f5afbe0eedf98ac7219772a6705d502069f0385 100644 --- a/paddle/fluid/operators/fake_quantize_op.h +++ b/paddle/fluid/operators/fake_quantize_op.h @@ -72,6 +72,13 @@ struct ChannelClipAndFakeQuantFunctor { const int 
quant_axis, framework::Tensor* out); }; +template +struct ChannelClipFakeQuantDequantFunctor { + void operator()(const DeviceContext& ctx, const framework::Tensor& in, + const framework::Tensor& scale, const int bin_cnt, + const int quant_axis, framework::Tensor* out); +}; + template struct FindMovingAverageAbsMaxFunctor { void operator()(const DeviceContext& ctx, const framework::Tensor& in_accum, @@ -154,6 +161,30 @@ class FakeChannelWiseQuantizeAbsMaxKernel : public framework::OpKernel { } }; +template +class FakeChannelWiseQuantizeDequantizeAbsMaxKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in = context.Input("X"); + auto* out = context.Output("Out"); + auto* out_scale = context.Output("OutScale"); + T* out_scale_data = out_scale->mutable_data(context.GetPlace()); + auto& dev_ctx = context.template device_context(); + out->mutable_data(dev_ctx.GetPlace()); + + int bit_length = context.Attr("bit_length"); + int bin_cnt = std::pow(2, bit_length - 1) - 1; + int quant_axis = context.Attr("quant_axis"); + + FindChannelAbsMaxFunctor()(dev_ctx, *in, quant_axis, + out_scale_data); + + ChannelClipFakeQuantDequantFunctor()( + dev_ctx, *in, *out_scale, bin_cnt, quant_axis, out); + } +}; + template class FakeQuantizeRangeAbsMaxKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/fill_constant_op.h b/paddle/fluid/operators/fill_constant_op.h index 74939da08b38dc147c156011759757a605db9444..6fea8fe98bf0e19bbbb023c91f4f9900f5ec1859 100644 --- a/paddle/fluid/operators/fill_constant_op.h +++ b/paddle/fluid/operators/fill_constant_op.h @@ -27,27 +27,6 @@ namespace operators { using Tensor = framework::Tensor; -inline framework::DDim GetShape(const framework::ExecutionContext &ctx, - std::string op_type) { - // 1. 
shape is a Tensor - if (ctx.HasInput("ShapeTensor")) { - auto *shape_tensor = ctx.Input("ShapeTensor"); - auto vec_shape = GetDataFromTensor(shape_tensor); - return framework::make_ddim(vec_shape); - } - - // 2. shape is a list/tuple containing Tensor - auto shape_tensor_list = ctx.MultiInput("ShapeTensorList"); - if (shape_tensor_list.size() > 0) { - auto vec_shape = GetDataFromTensorList(shape_tensor_list); - return framework::make_ddim(vec_shape); - } - - // 3. shape is a list/tuple without containing Tensor - auto vec_shape = ctx.Attr>("shape"); - return framework::make_ddim(vec_shape); -} - template class FillConstantKernel : public framework::OpKernel { public: @@ -93,8 +72,7 @@ class FillConstantKernel : public framework::OpKernel { } value = tensor_data[0]; } - const std::string op_type = "fill_constant"; - auto shape = GetShape(ctx, op_type); + auto shape = GetShape(ctx); if (out_var->IsType()) { tensor = out_var->GetMutable(); diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index 3fc5f3bfc6b1633ffe835606bbac6118e6b32ca6..477a9162fe3f779d4006deb2e20b3a16f70cdf47 100644 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -8,7 +8,8 @@ register_operators(EXCLUDES multihead_matmul_op fused_embedding_eltwise_layernorm_op fusion_group_op - fusion_gru_op) + fusion_gru_op + fused_bn_add_activation_op) # fusion_gru_op does not have CUDA kernel op_library(fusion_gru_op) @@ -47,4 +48,9 @@ if (WITH_GPU) file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fusion_group);\n") cc_test(test_fusion_group_op SRCS fusion_group_op_test.cc DEPS fusion_group_op) endif() + # fused_bn_add_activation + if (NOT ${CUDNN_VERSION} VERSION_LESS 7401) + op_library(fused_bn_add_activation_op) + file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fused_bn_add_activation);\n") + endif() endif() diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc 
b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..5b3ed03bb6419cd3c36f6ee2e856f1816d314c75 --- /dev/null +++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc @@ -0,0 +1,255 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/fused/fused_bn_add_activation_op.h" +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; + +void FusedBatchNormAddActOp::InferShape( + framework::InferShapeContext *ctx) const { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "FusedBatchNormAddActOp"); + OP_INOUT_CHECK(ctx->HasInput("Z"), "Input", "Z", "FusedBatchNormAddActOp"); + OP_INOUT_CHECK(ctx->HasInput("Scale"), "Input", "Scale", + "FusedBatchNormAddActOp"); + OP_INOUT_CHECK(ctx->HasInput("Bias"), "Input", "Bias", + "FusedBatchNormAddActOp"); + + // check output + OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Y", "FusedBatchNormAddActOp"); + OP_INOUT_CHECK(ctx->HasOutput("MeanOut"), "Output", "MeanOut", + "FusedBatchNormAddActOp"); + OP_INOUT_CHECK(ctx->HasOutput("VarianceOut"), "Output", "VarianceOut", + "FusedBatchNormAddActOp"); + OP_INOUT_CHECK(ctx->HasOutput("SavedMean"), "Output", "SavedMean", + "FusedBatchNormAddActOp"); + OP_INOUT_CHECK(ctx->HasOutput("SavedVariance"), "Output", "SavedVariance", 
+ "FusedBatchNormAddActOp"); + + const auto x_dims = ctx->GetInputDim("X"); + const auto z_dims = ctx->GetInputDim("Z"); + PADDLE_ENFORCE_EQ(x_dims, z_dims, + platform::errors::InvalidArgument( + "ShapeError: the shapes of input " + "must be equal. But received: the shape " + "of input X = [%s], and the shape of " + "input Y = [%s]", + x_dims, z_dims)); + PADDLE_ENFORCE_GE(x_dims.size(), 2, platform::errors::InvalidArgument( + "ShapeError: the dimensions of input " + "must greater than or equal to 2." + "But received: the shape of input " + "= [%s], the dimension of input = " + "[%d]", + x_dims, x_dims.size())); + PADDLE_ENFORCE_LE(x_dims.size(), 5, platform::errors::InvalidArgument( + "ShapeError: the dimensions of input " + "must smaller than or equal to 5." + "But received: the shape of input " + "= [%s], the dimension of input = " + "[%d]", + x_dims, x_dims.size())); + + const int64_t C = x_dims[x_dims.size() - 1]; + + auto scale_dim = ctx->GetInputDim("Scale"); + auto bias_dim = ctx->GetInputDim("Bias"); + + PADDLE_ENFORCE_EQ( + scale_dim.size(), 1UL, + platform::errors::InvalidArgument( + "ShapeError: the dimension of scale must equal to 1." + "But received: the shape of scale is [%s], the dimension " + "of scale is [%d]", + scale_dim, scale_dim.size())); + PADDLE_ENFORCE_EQ(bias_dim.size(), 1UL, + platform::errors::InvalidArgument( + "ShapeError: the dimension of bias must equal to 1." 
+ "But received: the shape of bias is [%s],the dimension " + "of bias is [%d]", + bias_dim, bias_dim.size())); + + bool check = true; + if ((!ctx->IsRuntime()) && (framework::product(scale_dim) <= 0 || + framework::product(bias_dim) <= 0)) { + check = false; + } + + if (check) { + PADDLE_ENFORCE_EQ(scale_dim[0], C, + platform::errors::InvalidArgument( + "ShapeError: the shape of scale must equal to [%d]" + "But received: the shape of scale is [%d]", + C, scale_dim[0])); + PADDLE_ENFORCE_EQ(bias_dim[0], C, + platform::errors::InvalidArgument( + "ShapeError: the shape of bias must equal to [%d]" + "But received: the shape of bias is [%d]", + C, bias_dim[0])); + } + ctx->SetOutputDim("Y", x_dims); + ctx->SetOutputDim("MeanOut", {C}); + ctx->SetOutputDim("VarianceOut", {C}); + ctx->SetOutputDim("SavedMean", {C}); + ctx->SetOutputDim("SavedVariance", {C}); + ctx->ShareLoD("X", "Y"); +} + +framework::OpKernelType FusedBatchNormAddActOp::GetExpectedKernelType( + const framework::ExecutionContext &ctx) const { + auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); + // By default, the type of the scale, bias, mean, + // and var tensors should be float when input tensor's dtype is float16. 
+ auto bn_param_type = framework::proto::VarType::FP32; + + PADDLE_ENFORCE_EQ( + bn_param_type, ctx.Input("Scale")->type(), + platform::errors::InvalidArgument("Scale input should be of float type")); + PADDLE_ENFORCE_EQ( + bn_param_type, ctx.Input("Bias")->type(), + platform::errors::InvalidArgument("Bias input should be of float type")); + + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + + return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout, + library); +} + +void FusedBatchNormAddActOpMaker::Make() { + AddInput("X", "The input tensor"); + AddInput("Z", "The input tensor"); + AddInput("Scale", + "Scale is a 1-dimensional tensor of size C " + "that is applied to the output"); + AddInput("Bias", + "Bias is a 1-dimensional tensor of size C " + "that is applied to the output"); + AddOutput("Y", "result after normalization"); + AddOutput("MeanOut", + "Share memory with Mean. " + "Store the global mean when training"); + AddOutput("VarianceOut", + "Share memory with Variance. " + "Store the global Variance when training"); + AddOutput("SavedMean", + "Mean of the current mini batch, " + "will apply to output when training") + .AsIntermediate(); + AddOutput("SavedVariance", + "Variance of the current mini batch, " + "will apply to output when training") + .AsIntermediate(); + AddOutput("ReserveSpace", + "Reserve GPU space for triggering the new semi-persistent " + "NHWC kernel"); + AddAttr("momentum", "").SetDefault(0.9); + AddAttr("epsilon", "") + .SetDefault(1e-5) + .AddCustomChecker([](const float &epsilon) { + PADDLE_ENFORCE_EQ(epsilon >= 0.0f && epsilon <= 0.001f, true, + platform::errors::InvalidArgument( + "'epsilon' should be between 0.0 and 0.001.")); + }); + AddAttr("act_type", "The activation type to be fused.") + .SetDefault("relu"); + AddComment(R"DOC( +Fused Batch Normalization with activation. 
+ +Batch Norm has been implemented as discussed in the paper: +https://arxiv.org/pdf/1502.03167.pdf +Batch Norm can be used as a normalizer function for conv2d and fully_connected operations. +Now, the required data format for FusedBatchNormAddActOp is NHWC `[batch, in_height, in_width, in_channels]`. + +)DOC"); +} + +void FusedBatchNormAddActGradOp::InferShape( + framework::InferShapeContext *ctx) const { + // check input + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", + "FusedBatchNormAddActGradOp"); + OP_INOUT_CHECK(ctx->HasInput("Z"), "Input", "Z", + "FusedBatchNormAddActGradOp"); + OP_INOUT_CHECK(ctx->HasInput("Scale"), "Input", "Scale", + "FusedBatchNormAddActGradOp"); + OP_INOUT_CHECK(ctx->HasInput("SavedMean"), "Input", "SavedMean", + "FusedBatchNormAddActGradOp"); + OP_INOUT_CHECK(ctx->HasInput("SavedVariance"), "Input", "SavedVariance", + "FusedBatchNormAddActGradOp"); + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Y")), "Input", + framework::GradVarName("Y"), "FusedBatchNormAddActGradOp"); + + // check output + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output", + framework::GradVarName("X"), "FusedBatchNormAddActGradOp"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Z")), "Output", + framework::GradVarName("Z"), "FusedBatchNormAddActGradOp"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Scale")), "Output", + framework::GradVarName("Scale"), "FusedBatchNormAddActGradOp"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Bias")), "Output", + framework::GradVarName("Bias"), "FusedBatchNormAddActGradOp"); + + const auto in_dims = ctx->GetInputDim("X"); + const int C = in_dims[in_dims.size() - 1]; + + ctx->SetOutputDim(framework::GradVarName("X"), in_dims); + ctx->SetOutputDim(framework::GradVarName("Z"), in_dims); + ctx->SetOutputDim(framework::GradVarName("Scale"), {C}); + ctx->SetOutputDim(framework::GradVarName("Bias"), {C}); +} + +framework::OpKernelType 
FusedBatchNormAddActGradOp::GetExpectedKernelType( + const framework::ExecutionContext &ctx) const { + const auto *var = ctx.InputVar(framework::GradVarName("Y")); + if (var == nullptr) { + PADDLE_THROW(platform::errors::NotFound( + "Can not find Y@GRAD in the execution context.")); + } + const Tensor *t = nullptr; + if (var->IsType()) { + t = &var->Get(); + } else if (var->IsType()) { + t = &var->Get(); + } + if (t == nullptr) { + PADDLE_THROW( + platform::errors::NotFound("Can not get the tensor value of Y@GRAD.")); + } + + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace(), layout, + library); +} + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR( + fused_bn_add_activation, ops::FusedBatchNormAddActOp, + ops::FusedBatchNormAddActOpMaker, ops::FusedBatchNormAddActOpInferVarType, + ops::FusedBatchNormAddActGradOpMaker, + ops::FusedBatchNormAddActGradOpMaker); +REGISTER_OPERATOR(fused_bn_add_activation_grad, + ops::FusedBatchNormAddActGradOp); diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..7f1d297cda3fae54cdde089f25ccdf6715142c5f --- /dev/null +++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu @@ -0,0 +1,338 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/operators/fused/fused_bn_add_activation_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/norm_utils.h" +#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/float16.h" + +DECLARE_bool(cudnn_batchnorm_spatial_persistent); + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; +template +using CudnnDataType = platform::CudnnDataType; +template +using BatchNormParamType = typename CudnnDataType::BatchNormParamType; + +template +class FusedBatchNormAddActKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx.GetPlace()), true, + platform::errors::PreconditionNotMet("It must use CUDAPlace.")); + double epsilon = static_cast(ctx.Attr("epsilon")); + float momentum = ctx.Attr("momentum"); + std::string act_type = ctx.Attr("act_type"); + + if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { + LOG(ERROR) << "Provided epsilon is smaller than " + << "CUDNN_BN_MIN_EPSILON. Setting it to " + << "CUDNN_BN_MIN_EPSILON instead."; + } + epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); + + // Get the size for each dimension. 
+ // NHWC [batch_size, in_height, in_width, in_channels] + const auto *x = ctx.Input("X"); + const auto *z = ctx.Input("Z"); + const auto &in_dims = x->dims(); + + const auto *scale = ctx.Input("Scale"); + const auto *bias = ctx.Input("Bias"); + + auto *mean_out = ctx.Output("MeanOut"); + auto *variance_out = ctx.Output("VarianceOut"); + mean_out->mutable_data>(ctx.GetPlace()); + variance_out->mutable_data>(ctx.GetPlace()); + + auto *saved_mean = ctx.Output("SavedMean"); + auto *saved_variance = ctx.Output("SavedVariance"); + saved_mean->mutable_data>(ctx.GetPlace()); + saved_variance->mutable_data>(ctx.GetPlace()); + + auto *y = ctx.Output("Y"); + y->mutable_data(ctx.GetPlace()); + + int N, C, H, W, D; + const DataLayout data_layout = DataLayout::kNHWC; + ExtractNCWHD(in_dims, data_layout, &N, &C, &H, &W, &D); + + auto &dev_ctx = ctx.template device_context(); + + // ------------------- cudnn descriptors --------------------- + auto handle = dev_ctx.cudnn_handle(); + cudnnTensorDescriptor_t data_desc_; + cudnnTensorDescriptor_t bn_param_desc_; + cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; + + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); + + std::vector dims = {N, C, H, W, D}; + std::vector strides = {H * W * D * C, 1, W * D * C, D * C, C}; + + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + data_desc_, CudnnDataType::type, + in_dims.size() > 3 ? in_dims.size() : 4, dims.data(), strides.data())); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_, + data_desc_, mode_)); + + double this_factor = 1. 
- momentum; + cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION; + platform::ScopedActivationDescriptor scope_act_desc; + cudnnActivationDescriptor_t activation_desc_ = + scope_act_desc.descriptor(act_type); + size_t workspace_size = 0; + size_t reserve_space_size = 0; + void *reserve_space_ptr = nullptr; + void *workspace_ptr = nullptr; + Tensor workspace_tensor; + // Create reserve space and workspace for batch norm. + // Create tensor for each batchnorm op, it will be used in the + // backward. Thus this tensor shouldn't be temp. + auto *reserve_space = ctx.Output("ReserveSpace"); + PADDLE_ENFORCE_NOT_NULL( + reserve_space, + platform::errors::NotFound( + "The argument ReserveSpace of batch_norm op is not found.")); + + // --------------- cudnn batchnorm workspace --------------- + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload:: + cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( + /*handle=*/handle, + /*mode=*/mode_, + /*bnOps=*/bnOps_, + /*xDesc=*/data_desc_, + /*zDesc=*/data_desc_, + /*yDesc=*/data_desc_, + /*bnScaleBiasMeanVarDesc=*/bn_param_desc_, + /*activationDesc=*/activation_desc_, + /*sizeInBytes=*/&workspace_size)); + + // -------------- cudnn batchnorm reserve space -------------- + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnGetBatchNormalizationTrainingExReserveSpaceSize( + /*handle=*/handle, + /*mode=*/mode_, + /*bnOps=*/bnOps_, + /*activationDesc=*/activation_desc_, + /*xDesc=*/data_desc_, + /*sizeInBytes=*/&reserve_space_size)); + + reserve_space_ptr = reserve_space->mutable_data(ctx.GetPlace(), x->type(), + reserve_space_size); + workspace_ptr = workspace_tensor.mutable_data(ctx.GetPlace(), x->type(), + workspace_size); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnBatchNormalizationForwardTrainingEx( + handle, mode_, bnOps_, CudnnDataType::kOne(), + CudnnDataType::kZero(), data_desc_, x->template data(), + data_desc_, z->template data(), data_desc_, + y->template data(), bn_param_desc_, + 
scale->template data>(), + bias->template data>(), this_factor, + mean_out->template mutable_data>( + ctx.GetPlace()), + variance_out->template mutable_data>( + ctx.GetPlace()), + epsilon, saved_mean->template mutable_data>( + ctx.GetPlace()), + saved_variance->template mutable_data>( + ctx.GetPlace()), + activation_desc_, workspace_ptr, workspace_size, reserve_space_ptr, + reserve_space_size)); + + // clean when exit. + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); + } +}; + +template +class FusedBatchNormAddActGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx.GetPlace()), true, + platform::errors::PreconditionNotMet("It must use CUDAPlace.")); + double epsilon = static_cast(ctx.Attr("epsilon")); + std::string act_type = ctx.Attr("act_type"); + + const auto *x = ctx.Input("X"); + const auto *z = ctx.Input("Z"); + const auto *y = ctx.Input("Y"); + const auto *d_y = ctx.Input(framework::GradVarName("Y")); + const auto *scale = ctx.Input("Scale"); + const auto *bias = ctx.Input("Bias"); + const auto *reserve_space = ctx.Input("ReserveSpace"); + + const auto &in_dims = x->dims(); + + int N, C, H, W, D; + const DataLayout data_layout = DataLayout::kNHWC; + ExtractNCWHD(in_dims, data_layout, &N, &C, &H, &W, &D); + + // init output + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_z = ctx.Output(framework::GradVarName("Z")); + auto *d_scale = ctx.Output(framework::GradVarName("Scale")); + auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + + d_x->mutable_data(ctx.GetPlace()); + d_z->mutable_data(ctx.GetPlace()); + PADDLE_ENFORCE_EQ( + d_scale && d_bias, true, + platform::errors::PreconditionNotMet( + "Both the scale grad and the bias grad must not be null.")); + 
d_scale->mutable_data>(ctx.GetPlace()); + d_bias->mutable_data>(ctx.GetPlace()); + PADDLE_ENFORCE_EQ(scale->dims().size(), 1UL, + platform::errors::PreconditionNotMet( + "The scale only has one dimension.")); + PADDLE_ENFORCE_EQ( + scale->dims()[0], C, + platform::errors::PreconditionNotMet( + "The size of scale is equal to the channel of Input(X).")); + + auto &dev_ctx = ctx.template device_context(); + + std::vector dims = {N, C, H, W, D}; + std::vector strides = {H * W * C * D, 1, W * D * C, D * C, C}; + // ------------------- cudnn descriptors --------------------- + cudnnTensorDescriptor_t data_desc_; + cudnnTensorDescriptor_t bn_param_desc_; + cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; + + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); + if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { + LOG(ERROR) << "Provided epsilon is smaller than " + << "CUDNN_BN_MIN_EPSILON. Setting it to " + << "CUDNN_BN_MIN_EPSILON instead."; + } + epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); + + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( + data_desc_, CudnnDataType::type, + in_dims.size() > 3 ? 
in_dims.size() : 4, dims.data(), strides.data())); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_, + data_desc_, mode_)); + + const auto *saved_mean = ctx.Input("SavedMean"); + const auto *saved_var = ctx.Input("SavedVariance"); + const auto *saved_mean_data = + saved_mean->template data>(); + const auto *saved_var_data = + saved_var->template data>(); + + size_t workspace_size = 0; + void *workspace_ptr = nullptr; + Tensor workspace_tensor; + auto reserve_space_size = reserve_space->memory_size(); + cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION; + platform::ScopedActivationDescriptor scope_act_desc; + cudnnActivationDescriptor_t activation_desc_ = + scope_act_desc.descriptor(act_type); + // --------------- cudnn batchnorm workspace --------------- + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnGetBatchNormalizationBackwardExWorkspaceSize( + /*handle=*/dev_ctx.cudnn_handle(), + /*mode=*/mode_, + /*bnOps=*/bnOps_, + /*xDesc=*/data_desc_, + /*yDesc=*/data_desc_, + /*dyDesc=*/data_desc_, + /*dzDesc=*/data_desc_, + /*dxDesc=*/data_desc_, + /*bnScaleBiasMeanVarDesc=*/bn_param_desc_, + /*activationDesc=*/activation_desc_, + /*sizeInBytes=*/&workspace_size)); + + workspace_ptr = workspace_tensor.mutable_data(ctx.GetPlace(), x->type(), + workspace_size); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnBatchNormalizationBackwardEx( + /*handle=*/dev_ctx.cudnn_handle(), + /*mode=*/mode_, + /*bnOps=*/bnOps_, + /*alphaDataDiff=*/CudnnDataType::kOne(), + /*betaDataDiff=*/CudnnDataType::kZero(), + /*alphaParamDiff=*/CudnnDataType::kOne(), + /*betaParamDiff=*/CudnnDataType::kZero(), + /*xDesc=*/data_desc_, + /*xData=*/x->template data(), + /*yDesc=*/data_desc_, + /*yData=*/y->template data(), + /*dyDesc=*/data_desc_, + /*dyData=*/d_y->template data(), + /*dzDesc=*/data_desc_, + /*dzData=*/d_z->template data(), + /*dxDesc=*/data_desc_, + /*dxData=*/d_x->template data(), + 
/*dBnScaleBiasDesc=*/bn_param_desc_, + /*bnScaleData=*/scale->template data>(), + /*bnBiasData=*/bias->template data>(), + /*dBnScaleData=*/d_scale->template data>(), + /*dBnBiasData=*/d_bias->template data>(), + /*epsilon=*/epsilon, + /*savedMean=*/saved_mean_data, + /*savedInvVariance=*/saved_var_data, + /*activationDesmc=*/activation_desc_, + /*workspace=*/workspace_ptr, + /*workSpaceSizeInBytes=*/workspace_size, + /*reserveSpace=*/const_cast(reserve_space->template data()), + /*reserveSpaceSizeInBytes=*/reserve_space_size)); + + // clean when exit. + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); + } +}; + +} // namespace operators +} // namespace paddle + +#if CUDNN_VERSION >= 7401 +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_CUDA_KERNEL( + fused_bn_add_activation, + ops::FusedBatchNormAddActKernel); +REGISTER_OP_CUDA_KERNEL(fused_bn_add_activation_grad, + ops::FusedBatchNormAddActGradKernel< + plat::CUDADeviceContext, plat::float16>); +#endif diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.h b/paddle/fluid/operators/fused/fused_bn_add_activation_op.h new file mode 100644 index 0000000000000000000000000000000000000000..5c7df96e60dd89b74058ead837bb75555f3674ad --- /dev/null +++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.h @@ -0,0 +1,106 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include "paddle/fluid/framework/grad_op_desc_maker.h" +#include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/var_type_inference.h" + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; + +class FusedBatchNormAddActOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +class FusedBatchNormAddActGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +class FusedBatchNormAddActOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override; +}; + +template +class FusedBatchNormAddActGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType(this->ForwardOpType() + "_grad"); + op->SetInput("X", this->Input("X")); + op->SetInput("Z", this->Input("Z")); + op->SetInput("Y", this->Output("Y")); + op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y")); + + op->SetInput("Scale", this->Input("Scale")); + op->SetInput("Bias", this->Input("Bias")); + op->SetInput("SavedMean", this->Output("SavedMean")); + op->SetInput("SavedVariance", this->Output("SavedVariance")); + op->SetInput("ReserveSpace", 
this->Output("ReserveSpace")); + + op->SetAttrMap(this->Attrs()); + + op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + op->SetOutput(framework::GradVarName("Z"), this->InputGrad("Z")); + op->SetOutput(framework::GradVarName("Scale"), this->InputGrad("Scale")); + op->SetOutput(framework::GradVarName("Bias"), this->InputGrad("Bias")); + } +}; + +class FusedBatchNormAddActOpInferVarType + : public framework::PassInDtypeAndVarTypeToOutput { + protected: + std::unordered_map& GetInputOutputWithSameType() + const override { + static std::unordered_map m{{"X", /*->*/ "Y"}}; + return m; + } +}; + +template +class FusedBatchNormAddActKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override; +}; + +template +class FusedBatchNormAddActGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc index c698cb1405fd6f049e01b23613e175ba39c4976e..79fa268f3884b2710fe08eb2907dbd989479d7e6 100644 --- a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc +++ b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc @@ -367,8 +367,13 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel { auto blas = math::GetBlas(ctx); for (int64_t i = 0; i < ids_numel; ++i) { - PADDLE_ENFORCE_LT(ids_data[i], row_number); - PADDLE_ENFORCE_GE(ids_data[i], 0, "ids %d", i); + PADDLE_ENFORCE_LT( + ids_data[i], row_number, + platform::errors::OutOfRange( + "Value of Ids %d should less than dict size %d.", i, row_number)); + PADDLE_ENFORCE_GE(ids_data[i], 0, + platform::errors::OutOfRange( + "Value of Ids %d should greater than ZERO.", i)); memcpy(xx_data + i * row_width, embeddings_data + ids_data[i] * row_width, row_width * sizeof(T)); } @@ -473,8 +478,13 @@ 
class FusedEmbeddingFCLSTMKernel : public framework::OpKernel { auto blas = math::GetBlas(dev_ctx); for (int64_t i = 0; i < ids_numel; ++i) { - PADDLE_ENFORCE_LT(ids_data[i], row_number); - PADDLE_ENFORCE_GE(ids_data[i], 0, "ids %d", i); + PADDLE_ENFORCE_LT( + ids_data[i], row_number, + platform::errors::OutOfRange( + "Value of Ids %d should less than dict size %d.", i, row_number)); + PADDLE_ENFORCE_GE(ids_data[i], 0, + platform::errors::OutOfRange( + "Value of Ids %d should greater than ZERO.", i)); memcpy(xx_data + i * row_width, embeddings_data + ids_data[i] * row_width, row_width * sizeof(T)); } diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc index d0920098f606e49d4d1a3e4cb6d8a2b6c44ca267..e3776a80b316089891282136022a4e6656360c6e 100644 --- a/paddle/fluid/operators/fused/fusion_gru_op.cc +++ b/paddle/fluid/operators/fused/fusion_gru_op.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/operators/fused/fusion_gru_op.h" #include // for memcpy #include +#include #include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/fc.h" @@ -30,16 +31,18 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const { OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "fusion_gru"); OP_INOUT_CHECK(ctx->HasInput("WeightX"), "Input", "WeightX", "fusion_gru"); OP_INOUT_CHECK(ctx->HasInput("WeightH"), "Input", "WeightH", "fusion_gru"); - OP_INOUT_CHECK(ctx->HasOutput("XX"), "Output", "XX", "fusion_gru"); OP_INOUT_CHECK(ctx->HasOutput("Hidden"), "Output", "Hidden", "fusion_gru"); - auto x_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_EQ(x_dims.size(), 2, - platform::errors::InvalidArgument( - "Input(X)'s rank must be 2, but received input dim " - "size is:%d, input dim is:[%s]", - x_dims.size(), x_dims)); + auto x_mat_dims = (x_dims.size() == 3 && x_dims[1] == 1) + ? 
framework::flatten_to_2d(x_dims, 1) + : x_dims; + PADDLE_ENFORCE_EQ( + x_mat_dims.size(), 2, + platform::errors::InvalidArgument("The size of input X dims should be 2, " + "or 3 with second dimension equal to " + "1, but now Input X dim is:[%s] ", + x_dims)); auto wx_dims = ctx->GetInputDim("WeightX"); PADDLE_ENFORCE_EQ(wx_dims.size(), 2, @@ -47,12 +50,14 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const { "The rank of Input(WeightX) should be 2, but received " "WeightX dim size is:%d, WeightX dim is:[%s] ", wx_dims.size(), wx_dims)); - PADDLE_ENFORCE_EQ(wx_dims[0], x_dims[1], - platform::errors::InvalidArgument( - "The first dimension of Input(WeightX) " - "should equal to second dimension of input x, but " - "received WeightX dimension is:%d, x dimension is:%d", - wx_dims[0], x_dims[1])); + PADDLE_ENFORCE_EQ( + wx_dims[0], x_mat_dims[1], + platform::errors::InvalidArgument( + "The first dimension of flattened WeightX" + "should equal to last dimension of flattened input X, but " + "received fattened WeightX dimension is:%d, flattened X dimension " + "is:%d", + wx_dims[0], x_mat_dims[1])); int frame_size = wx_dims[1] / 3; auto wh_dims = ctx->GetInputDim("WeightH"); @@ -102,24 +107,24 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const { "received bias dim is:[%s], frame size is:%d", b_dims, frame_size)); } - framework::DDim out_dims({x_dims[0], frame_size}); + framework::DDim out_dims({x_mat_dims[0], frame_size}); ctx->SetOutputDim("Hidden", out_dims); ctx->ShareLoD("X", "Hidden"); int xx_width; if (ctx->Attrs().Get("use_seq")) { xx_width = wx_dims[1]; } else { - xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1]; + xx_width = x_mat_dims[1] > wx_dims[1] ? 
wx_dims[1] : x_mat_dims[1]; OP_INOUT_CHECK(ctx->HasOutput("ReorderedH0"), "Output", "ReorderedH0", "fusion_gru"); OP_INOUT_CHECK(ctx->HasOutput("BatchedInput"), "Output", "BatchedInput", "fusion_gru"); OP_INOUT_CHECK(ctx->HasOutput("BatchedOut"), "Output", "BatchedOut", "fusion_gru"); - ctx->SetOutputDim("BatchedInput", {x_dims[0], wx_dims[1]}); + ctx->SetOutputDim("BatchedInput", {x_mat_dims[0], wx_dims[1]}); ctx->SetOutputDim("BatchedOut", out_dims); } - ctx->SetOutputDim("XX", {x_dims[0], xx_width}); + ctx->SetOutputDim("XX", {x_mat_dims[0], xx_width}); ctx->ShareLoD("X", "XX"); } @@ -202,6 +207,27 @@ void FusionGRUOpMaker::Make() { AddAttr("use_mkldnn", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false); + AddAttr( + "mkldnn_data_type", + "(string, default \"float32\"). Data type of mkldnn kernel") + .SetDefault("float32") + .InEnum({"float32", "int8", "bfloat16"}); + AddAttr("Scale_data", + "Scale to be used for int8 input/output data." + "Only used with MKL-DNN INT8.") + .SetDefault(1.0f); + AddAttr("Shift_data", + "Shift to be used for int8 input/output data." + "Only used with MKL-DNN INT8.") + .SetDefault(0.0f); + AddAttr>("Scale_weights", + "Scale_weights to be used for int8 weights data." + "Only used with MKL-DNN INT8.") + .SetDefault({1.0f}); + AddAttr("force_fp32_output", + "(bool, default false) Force INT8 kernel output FP32, only " + "used in MKL-DNN INT8") + .SetDefault(false); AddComment(R"DOC( The Fusion complete GRU Operator. 
This operator fuse the fully-connected operator into GRU, @@ -220,14 +246,17 @@ class FusionGRUKernel : public framework::OpKernel { } } -#define INIT_BASE_DEFINES \ - auto* x = ctx.Input("X"); \ - auto* wh = ctx.Input("WeightH"); \ - auto* xx = ctx.Output("XX"); \ - auto x_lod = x->lod(); \ - auto x_dims = x->dims(); /* T x M*/ \ - auto wh_dims = wh->dims(); /* D x 3D*/ \ - const int total_T = x_dims[0]; \ +#define INIT_BASE_DEFINES \ + auto* x = ctx.Input("X"); \ + auto* wh = ctx.Input("WeightH"); \ + auto* xx = ctx.Output("XX"); \ + auto x_lod = x->lod(); \ + auto x_dims = x->dims(); /* T x M*/ \ + auto x_mat_dims = (x_dims.size() == 3 && x_dims[1] == 1) \ + ? framework::flatten_to_2d(x_dims, 1) \ + : x_dims; \ + auto wh_dims = wh->dims(); /* D x 3D*/ \ + const int total_T = x_mat_dims[0]; \ const int D3 = wh_dims[1] #define INIT_OTHER_DEFINES \ @@ -236,7 +265,7 @@ class FusionGRUKernel : public framework::OpKernel { auto* bias = ctx.Input("Bias"); \ auto* hidden_out = ctx.Output("Hidden"); \ bool is_reverse = ctx.Attr("is_reverse"); \ - const int M = x_dims[1]; \ + const int M = x_mat_dims[1]; \ const int D = wh_dims[0]; \ const int D2 = D * 2; \ const jit::gru_attr_t attr( \ diff --git a/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc b/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc index a6c9a137b5438d840ae283b72fc9e85903c83775..c5a291f10b2eaa32aa4b98d73004008bae89a5c9 100644 --- a/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc @@ -192,6 +192,9 @@ class FusionSeqConvEltAddReluKernel : public framework::OpKernel { copy_size += src_mat_w_sz; } // fill data + if (context_start > 0) { + src_data += context_start * src_mat_w; + } for (int j = 0; j < seq_len - up_pad - down_pad; ++j) { std::memcpy(dst_data, src_data, copy_size); dst_data += col_mat_w; @@ -201,18 +204,15 @@ class FusionSeqConvEltAddReluKernel : public framework::OpKernel { 
std::memset(dst_data, 0, down_pad * col_mat_w_sz); copy_size -= src_mat_w_sz; for (int j = 0; j < down_pad; ++j) { + if (copy_size < 0) { + copy_size = 0; + } std::memcpy(dst_data, src_data, copy_size); dst_data += col_mat_w; src_data += src_mat_w; copy_size -= src_mat_w_sz; } } else { - PADDLE_ENFORCE_GE(context_length, up_pad + down_pad + 1, - platform::errors::InvalidArgument( - "context length must be bigger or equal than " - "up_pad + down_pad + 1, but received context " - "length is: %d, up_pad is: %d, down_pad is: %d.", - context_length, up_pad, down_pad)); std::memset(dst_data, 0, seq_len * col_mat_w_sz); dst_data = dst_data + up_pad * src_mat_w; int zero_sz = up_pad * src_mat_w_sz; @@ -226,9 +226,15 @@ class FusionSeqConvEltAddReluKernel : public framework::OpKernel { // from bottom dst_data = col_data + ed * col_mat_w; src_data = x_data + st * src_mat_w; + if (context_start > 0) { + src_data += context_start * src_mat_w; + } zero_sz = down_pad * src_mat_w_sz; for (int j = 1; j <= std::min(down_pad, seq_len); ++j) { int copy_size = std::min(cur_src_sz, col_mat_w_sz - zero_sz); + if (copy_size < 0) { + copy_size = 0; + } std::memcpy(dst_data - (zero_sz + copy_size) / sizeof(T), src_data + std::max(seq_len - j - up_pad, 0) * src_mat_w, copy_size); diff --git a/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc b/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc index f64e4f134d62f125e3e781ebf43163a566587d58..ecb7db46a9d8159b8da124e941cc69522f64cd57 100644 --- a/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc @@ -24,20 +24,27 @@ void FusionSeqPoolCVMConcatOp::InferShape( framework::InferShapeContext* ctx) const { PADDLE_ENFORCE_GE( ctx->Inputs("X").size(), 1UL, - "Inputs(X) of FusionSeqPoolCVMConcatOp should not be empty."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of FusionSeqPoolCVMConcatOp should not be null."); + 
paddle::platform::errors::InvalidArgument( + "Inputs(X) of FusionSeqPoolCVMConcatOp should not be empty.")); + PADDLE_ENFORCE( + ctx->HasOutput("Out"), + paddle::platform::errors::InvalidArgument( + "Output(Out) of FusionSeqPoolCVMConcatOp should not be null.")); int axis = ctx->Attrs().Get("axis"); PADDLE_ENFORCE_EQ( - axis, 1, "FusionSeqPoolCVMConcatOp only supports concat axis=1 yet."); + axis, 1, + paddle::platform::errors::InvalidArgument( + "FusionSeqPoolCVMConcatOp only supports concat axis=1 yet.")); bool use_cvm = ctx->Attrs().Get("use_cvm"); PADDLE_ENFORCE_EQ( use_cvm, true, - "FusionSeqPoolCVMConcatOp only supports use_cvm is true yet."); + paddle::platform::errors::InvalidArgument( + "FusionSeqPoolCVMConcatOp only supports use_cvm is true yet.")); auto ins_dims = ctx->GetInputsDim("X"); const size_t n = ins_dims.size(); - PADDLE_ENFORCE_GT(n, 0UL, "Input tensors count should > 0."); + PADDLE_ENFORCE_GT(n, 0UL, paddle::platform::errors::InvalidArgument( + "Input tensors count should > 0.")); if (n == 1) { LOG(WARNING) << "Only have one input, may waste memory"; } @@ -45,7 +52,8 @@ void FusionSeqPoolCVMConcatOp::InferShape( // The output height should be confirmed in Compute, // since input lod is not accessible here. 
PADDLE_ENFORCE_EQ(ins_dims[0].size(), 2, - "The dims size of first input should be 2."); + paddle::platform::errors::InvalidArgument( + "The dims size of first input should be 2.")); ctx->SetOutputDim("Out", {-1, ins_dims[0][axis] * static_cast(n)}); } @@ -99,7 +107,8 @@ class FusionSeqPoolCVMConcatKernel : public framework::OpKernel { int w = ins[0]->numel() / x0_dims[0]; PADDLE_ENFORCE_EQ(y_dims[1] % w, 0, - "The output of dims[1] should be dividable of w"); + paddle::platform::errors::InvalidArgument( + "The output of dims[1] should be dividable of w")); jit::seq_pool_attr_t attr(w, jit::SeqPoolType::kSum); if (pooltype == "AVERAGE") { attr.type = jit::SeqPoolType::kAvg; @@ -117,9 +126,11 @@ class FusionSeqPoolCVMConcatKernel : public framework::OpKernel { const T* src = ins[i]->data(); T* dst = y_data + i * w; PADDLE_ENFORCE_EQ(static_cast(ins[i]->numel() / x_dims[0]), w, - "Width of all inputs should be equal."); + paddle::platform::errors::InvalidArgument( + "Width of all inputs should be equal.")); PADDLE_ENFORCE_EQ(x_lod.size(), bs + 1, - "Batchsize of all inputs should be equal."); + paddle::platform::errors::InvalidArgument( + "Batchsize of all inputs should be equal.")); for (size_t j = 0; j < bs; ++j) { attr.h = static_cast(x_lod[j + 1] - x_lod[j]); seqpool(src, dst, &attr); diff --git a/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc index 3940aae53b8ef70c15311305ce13f8929400d405..5fad1b116de6437e62e311318832ad77e24a40cc 100644 --- a/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc +++ b/paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc @@ -21,11 +21,12 @@ namespace operators { using paddle::framework::LoDTensor; using paddle::framework::Tensor; using paddle::platform::CPUDeviceContext; +using paddle::platform::CreateKey; using paddle::platform::MKLDNNGetDataType; using paddle::platform::MKLDNNMemDesc; using platform::to_void_cast; -template +template class 
GRUMKLDNNHandler : public platform::MKLDNNHandlerT { public: GRUMKLDNNHandler(const paddle::framework::ExecutionContext& ctx, @@ -38,7 +39,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT { const std::string& unique_name) : platform::MKLDNNHandlerT( dev_ctx, dev_ctx.GetEngine(), cpu_place, - platform::CreateKey(unique_name, Ti)), + CreateKey(unique_name, MKLDNNGetDataType(), Ti)), N(N), Ti(Ti), IC(IC), @@ -47,9 +48,29 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT { // do not depend on Ti size but primitive and input/output memory do if (platform::MKLDNNDeviceContext::tls().get_cur_mkldnn_session_id() != platform::MKLDNNDeviceContextThreadLocals::kMKLDNNSessionID_Default) { - memory_key_ = unique_name; + memory_key_ = CreateKey(unique_name, MKLDNNGetDataType()); } else { - memory_key_ = unique_name + "-t:" + platform::ThreadIDasStr(); + memory_key_ = CreateKey(unique_name, MKLDNNGetDataType(), "-t:", + platform::ThreadIDasStr()); + } + + // Is it int8 kernel + const bool is_INT8 = std::is_same::value; + + if (is_INT8) { + // Int8 attributes + const float scale_data = ctx.Attr("Scale_data"); + const float shift_data = ctx.Attr("Shift_data"); + const auto scale_weights = ctx.Attr>("Scale_weights"); + + const int weights_scale_mask = + 0 + + (1 << 3) // bit, indicating the unique scales for `g` dim in `ldigo` + + + (1 << 4); // bit, indicating the unique scales for `o` dim in `ldigo` + + attr_.set_rnn_data_qparams(scale_data, shift_data); + attr_.set_rnn_weights_qparams(weights_scale_mask, scale_weights); } if (!this->isCached()) { @@ -63,6 +84,10 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT { platform::errors::Unimplemented( "oneDNN fusion_gru supports only tanh as an activation.")); + // Weights for int8 kernel are of a type s8 + const auto weights_dt = + is_INT8 ? 
dnnl::memory::data_type::s8 : dnnl::memory::data_type::f32; + // oneDNN RNN dimensions const int64_t D = 1; // Directions const int64_t L = 1; // Layers (PP supports only 1 stacked layer) @@ -71,19 +96,16 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT { // Create memory descriptors auto input_md = MKLDNNMemDesc({Ti, N, IC}, MKLDNNGetDataType(), MKLDNNMemoryFormat::any); - auto weight_x_md = MKLDNNMemDesc( - {L, D, IC, G, OC}, MKLDNNGetDataType(), MKLDNNMemoryFormat::any); - auto weight_h_md = MKLDNNMemDesc( - {L, D, OC, G, OC}, MKLDNNGetDataType(), MKLDNNMemoryFormat::any); + auto weight_x_md = + MKLDNNMemDesc({L, D, IC, G, OC}, weights_dt, MKLDNNMemoryFormat::any); + auto weight_h_md = + MKLDNNMemDesc({L, D, OC, G, OC}, weights_dt, MKLDNNMemoryFormat::any); auto bias_md = MKLDNNMemDesc({L, D, G, OC}, MKLDNNGetDataType(), MKLDNNMemoryFormat::ldgo); - auto hidden_md = MKLDNNMemDesc({Ti, N, OC}, MKLDNNGetDataType(), + auto hidden_md = MKLDNNMemDesc({Ti, N, OC}, MKLDNNGetDataType(), MKLDNNMemoryFormat::any); - auto h0_md = dnnl::memory::desc(); - if (h0) { - h0_md = MKLDNNMemDesc({L, D, N, OC}, MKLDNNGetDataType(), - MKLDNNMemoryFormat::ldnc); - } + auto h0_md = MKLDNNMemDesc({L, D, N, OC}, MKLDNNGetDataType(), + MKLDNNMemoryFormat::ldnc); // Create GRU oneDNN primitive const auto direction = @@ -91,7 +113,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT { : dnnl::rnn_direction::unidirectional_left2right; this->AcquireForwardPrimitiveDescriptor( - dnnl::prop_kind::forward_inference, direction, input_md, h0_md, + attr_, dnnl::prop_kind::forward_inference, direction, input_md, h0_md, weight_x_md, weight_h_md, bias_md, hidden_md, dnnl::memory::desc()); } } @@ -101,29 +123,31 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT { dnnl::memory::format_tag::ntc); } - void reorderRNNdata(const T* input_data, T* output_data, + void reorderRNNdata(void* input_data, void* output_data, std::vector lod, const bool is_reverse, 
platform::RNNReorderType reorder_type) { switch (reorder_type) { // Reorder input memory [WORDS, C] + LoD -> [N, T, C] case platform::RNNReorderType::PP_NTC: { - auto* input_data_iter = input_data; + auto* input_data_iter = reinterpret_cast(input_data); + auto* output_data_iter = reinterpret_cast(output_data); for (int n = 0; n < N; ++n) { const auto num_elements = (lod[n + 1] - lod[n]) * IC; const auto offset = is_reverse ? (Ti * IC - num_elements) : 0; - memcpy(output_data + n * Ti * IC + offset, input_data_iter, + memcpy(output_data_iter + n * Ti * IC + offset, input_data_iter, sizeof(T) * num_elements); input_data_iter += num_elements; } } break; // Reorder input memory [WORDS, C] + LoD -> [T, N, C] case platform::RNNReorderType::PP_TNC: { - auto* input_data_iter = input_data; + auto* input_data_iter = reinterpret_cast(input_data); + auto* output_data_iter = reinterpret_cast(output_data); for (int n = 0; n < N; ++n) { const auto num_elements = (lod[n + 1] - lod[n]); const auto offset = is_reverse ? (Ti - num_elements) : 0; for (size_t t = 0; t < num_elements; ++t) { - memcpy(output_data + (t + offset) * N * IC + n * IC, + memcpy(output_data_iter + (t + offset) * N * IC + n * IC, input_data_iter, sizeof(T) * IC); input_data_iter += IC; } @@ -131,24 +155,27 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT { } break; // Reorder output values to PP format [N, T, C] -> [WORDS, C] case platform::RNNReorderType::NTC_PP: { - auto* output_data_iter = output_data; + auto* input_data_iter = reinterpret_cast(input_data); + auto* output_data_iter = reinterpret_cast(output_data); for (int n = 0; n < N; ++n) { const auto num_elements = (lod[n + 1] - lod[n]) * OC; const auto offset = is_reverse ? 
(Ti * OC - num_elements) : 0; - memcpy(output_data_iter, input_data + n * Ti * OC + offset, - sizeof(T) * num_elements); + memcpy(output_data_iter, input_data_iter + n * Ti * OC + offset, + sizeof(T_out) * num_elements); output_data_iter += num_elements; } } break; // Reorder output values to PP format [T, N, C] -> [WORDS, C] case platform::RNNReorderType::TNC_PP: { - auto* output_data_iter = output_data; + auto* input_data_iter = reinterpret_cast(input_data); + auto* output_data_iter = reinterpret_cast(output_data); for (int n = 0; n < N; ++n) { const auto num_elements = lod[n + 1] - lod[n]; const auto offset = is_reverse ? (Ti - num_elements) : 0; for (size_t t = 0; t < num_elements; ++t) { memcpy(output_data_iter, - input_data + (t + offset) * N * OC + n * OC, sizeof(T) * OC); + input_data_iter + (t + offset) * N * OC + n * OC, + sizeof(T_out) * OC); output_data_iter += OC; } } @@ -169,9 +196,9 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT { } const auto& input_lod = input->lod()[0]; - auto* x_data = input->data(); + auto* x_data = to_void_cast(input->data()); - auto* x_onednn_data = reinterpret_cast(memory_p->get_data_handle()); + auto* x_onednn_data = memory_p->get_data_handle(); memset(x_onednn_data, 0, sizeof(T) * N * Ti * IC); if (platform::GetMKLDNNFormat(this->fwd_pd_->src_desc()) == @@ -198,19 +225,35 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT { return memory_p; } + // TODO(grygielski) H0 is for now persistable std::shared_ptr AcquireH0Memory(const Tensor* h0) { const std::string h0_key = memory_key_ + "@h0"; auto memory_p = std::static_pointer_cast(this->dev_ctx_.GetBlob(h0_key)); - auto* h0_data = to_void_cast(h0->data()); - if (!memory_p) { - memory_p = std::make_shared( - this->fwd_pd_->weights_layer_desc(), this->engine_, h0_data); + auto user_h0_memory = dnnl::memory(); + if (h0) { + user_h0_memory = + dnnl::memory({{1, 1, N, OC}, + MKLDNNGetDataType(), + MKLDNNMemoryFormat::ldnc}, + this->engine_, 
to_void_cast(h0->data())); + } else { + user_h0_memory = dnnl::memory({{1, 1, N, OC}, + MKLDNNGetDataType(), + MKLDNNMemoryFormat::ldnc}, + this->engine_); + memset(user_h0_memory.get_data_handle(), 0, sizeof(float) * N * OC); + } + memory_p = std::make_shared(this->fwd_pd_->src_iter_desc(), + this->engine_); + + dnnl::stream astream(this->engine_); + dnnl::reorder(user_h0_memory, *memory_p, attr_) + .execute(astream, user_h0_memory, *memory_p); + this->dev_ctx_.SetBlob(h0_key, memory_p); - } else { - memory_p->set_data_handle(h0_data); } return memory_p; } @@ -245,7 +288,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT { this->fwd_pd_->weights_layer_desc(), this->engine_); dnnl::stream astream(this->engine_); - dnnl::reorder(user_memory, *memory_p) + dnnl::reorder(user_memory, *memory_p, attr_) .execute(astream, user_memory, *memory_p); this->dev_ctx_.SetBlob(wx_key, memory_p); @@ -298,7 +341,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT { this->fwd_pd_->weights_iter_desc(), this->engine_); dnnl::stream astream(this->engine_); - dnnl::reorder(user_memory, *memory_p) + dnnl::reorder(user_memory, *memory_p, attr_) .execute(astream, user_memory, *memory_p); this->dev_ctx_.SetBlob(wh_key, memory_p); @@ -347,12 +390,26 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT { // Memory size of weights, bias and h0 does not depend // on Ti size, thus we need another key to cache them std::string memory_key_; + dnnl::primitive_attr attr_; }; template class FusionGRUMKLDNNKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + const bool is_INT8 = std::is_same::value; + const bool force_fp32_output = ctx.Attr("force_fp32_output"); + + // TODO(grygielski) Add option for bfloat + if (!is_INT8 || force_fp32_output) { + RunKernel(ctx); + } else { + RunKernel(ctx); + } + } + + template + void RunKernel(const framework::ExecutionContext& ctx) const { auto& dev_ctx = ctx.template 
device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); @@ -364,13 +421,16 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel { const auto* weight_h = ctx.Input("WeightH"); const auto* bias = ctx.Input("Bias"); auto* hidden = ctx.Output("Hidden"); - + auto x_dims = input->dims(); + auto x_mat_dims = (x_dims.size() == 3 && x_dims[1] == 1) + ? framework::flatten_to_2d(x_dims, 1) + : x_dims; // Get attributes const bool is_reverse = ctx.Attr("is_reverse"); const bool origin_mode = ctx.Attr("origin_mode"); // Get tensor dimensions - const auto x_dims = framework::vectorize(input->dims()); + const auto x_mat_dims_vec = framework::vectorize(x_mat_dims); const auto weight_h_dims = framework::vectorize(weight_h->dims()); const auto& input_lod = input->lod()[0]; @@ -384,15 +444,17 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel { } return res; }(); - const int64_t IC = x_dims[1]; // Input channels - const int64_t OC = weight_h_dims[0]; // Output channels + const int64_t IC = x_mat_dims_vec[1]; // Input channels + const int64_t OC = weight_h_dims[0]; // Output channels - GRUMKLDNNHandler handler(ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(), - input, weight_h, h0, is_reverse, N, Ti, IC, OC, - ctx.InputName("X") + ctx.InputName("WeightH")); + GRUMKLDNNHandler handler( + ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(), input, weight_h, h0, + is_reverse, N, Ti, IC, OC, + ctx.InputName("X") + ctx.InputName("WeightH")); auto input_memory_p = handler.AcquireInputMemoryWithReorder(input, is_reverse); + auto h0_memory_p = handler.AcquireH0Memory(h0); auto weight_x_memory_p = handler.AcquireWeightXMemory(weight_x, origin_mode); auto weight_h_memory_p = @@ -402,25 +464,21 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel { std::unordered_map gru_args = { {DNNL_ARG_SRC_LAYER, *input_memory_p}, + {DNNL_ARG_SRC_ITER, *h0_memory_p}, {DNNL_ARG_WEIGHTS_LAYER, *weight_x_memory_p}, {DNNL_ARG_WEIGHTS_ITER, *weight_h_memory_p}, {DNNL_ARG_BIAS, 
*bias_memory_p}, {DNNL_ARG_DST_LAYER, *hidden_onednn_memory_p}}; - if (h0) { - auto h0_memory_p = handler.AcquireH0Memory(h0); - gru_args.insert({DNNL_ARG_SRC_ITER, *h0_memory_p}); - } - auto gru_forward_p = handler.AcquireForwardPrimitive(); dnnl::stream astream(mkldnn_engine); gru_forward_p->execute(astream, gru_args); astream.wait(); - auto* hidden_onednn_data = - reinterpret_cast(hidden_onednn_memory_p->get_data_handle()); - auto* hidden_data = hidden->mutable_data(ctx.GetPlace()); + auto* hidden_onednn_data = hidden_onednn_memory_p->get_data_handle(); + auto* hidden_data = + to_void_cast(hidden->mutable_data(ctx.GetPlace())); if (handler.is_NTC()) { handler.reorderRNNdata(hidden_onednn_data, hidden_data, input_lod, is_reverse, platform::RNNReorderType::NTC_PP); @@ -436,4 +494,5 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_KERNEL(fusion_gru, MKLDNN, paddle::platform::CPUPlace, - ops::FusionGRUMKLDNNKernel); + ops::FusionGRUMKLDNNKernel, + ops::FusionGRUMKLDNNKernel); diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc index 28afeb6f541c68fe7e0719a782fd8c9147b15163..a99879316d684ca95e73ce8db43e988efcbab4c4 100644 --- a/paddle/fluid/operators/gather_op.cc +++ b/paddle/fluid/operators/gather_op.cc @@ -37,8 +37,21 @@ class GatherOp : public framework::OperatorWithKernel { "Output(Out) of GatherOp should not be null.")); auto index_dims = ctx->GetInputDim("Index"); - PADDLE_ENFORCE(index_dims.size() == 1 || - (index_dims.size() == 2 && index_dims[1] == 1)); + + if (index_dims.size() == 2) { + PADDLE_ENFORCE_EQ( + index_dims[1], 1, + platform::errors::InvalidArgument( + "The last dim of index should be 1 when it is 2D, but we get %d", + index_dims[1])); + } else { + PADDLE_ENFORCE_EQ( + index_dims.size(), 1, + platform::errors::InvalidArgument( + "The index should be 1D, when it is not 2D, but we get %d", + index_dims.size())); + } + int batch_size = 
ctx->GetInputDim("Index")[0]; framework::DDim output_dims(ctx->GetInputDim("X")); output_dims[0] = batch_size; diff --git a/paddle/fluid/operators/gaussian_random_op.cc b/paddle/fluid/operators/gaussian_random_op.cc index 4f128463375b91803a7a4d02a27dd78157961aac..17a71c67b8a084c114497eb97568e9b536161711 100644 --- a/paddle/fluid/operators/gaussian_random_op.cc +++ b/paddle/fluid/operators/gaussian_random_op.cc @@ -34,8 +34,7 @@ class CPUGaussianRandomKernel : public framework::OpKernel { auto* tensor = context.Output("Out"); std::normal_distribution dist(mean, std); - const std::string op_type = "gaussian_random"; - auto shape = GetShape(context, op_type); + auto shape = GetShape(context); tensor->Resize(shape); int64_t size = tensor->numel(); T* data = tensor->mutable_data(context.GetPlace()); diff --git a/paddle/fluid/operators/gaussian_random_op.cu b/paddle/fluid/operators/gaussian_random_op.cu index c144481f8dedc9317f7657a22ce82e56022d5b89..7a0c93eb1b2eaa7afaae7f0a604a0da5ac0fd75d 100644 --- a/paddle/fluid/operators/gaussian_random_op.cu +++ b/paddle/fluid/operators/gaussian_random_op.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include #include +#include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/fill_constant_op.h" @@ -24,15 +25,20 @@ template struct GaussianGenerator { T mean_, std_; unsigned int seed_; + unsigned int offset_ = 0; __host__ __device__ GaussianGenerator(T mean, T std, int seed) : mean_(mean), std_(std), seed_(seed) {} + __host__ __device__ GaussianGenerator(T mean, T std, int seed, int offset) + : mean_(mean), std_(std), seed_(seed), offset_(offset) {} + __host__ __device__ T operator()(const unsigned int n) const { thrust::minstd_rand rng; rng.seed(seed_); thrust::normal_distribution dist(mean_, std_); - rng.discard(n); + unsigned int new_n = n + offset_; + rng.discard(new_n); return dist(rng); } }; @@ -43,22 +49,37 @@ class GPUGaussianRandomKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { auto* tensor = context.Output("Out"); unsigned int seed = static_cast(context.Attr("seed")); + bool seed_flag = false; if (seed == 0) { std::random_device rd; seed = rd(); + seed_flag = true; } T mean = static_cast(context.Attr("mean")); T std = static_cast(context.Attr("std")); thrust::counting_iterator index_sequence_begin(0); - const std::string op_type = "gaussian_random"; - auto shape = GetShape(context, op_type); + auto shape = GetShape(context); tensor->Resize(shape); T* data = tensor->mutable_data(context.GetPlace()); int64_t size = tensor->numel(); - thrust::transform(index_sequence_begin, index_sequence_begin + size, - thrust::device_ptr(data), - GaussianGenerator(mean, std, seed)); + + int device_id = + BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()).GetDeviceId(); + auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); + + if (gen_cuda->GetIsInitPy() && seed_flag) { + auto seed_offset = gen_cuda->IncrementOffset(1); + int gen_offset = size * seed_offset.second; + 
thrust::transform( + index_sequence_begin, index_sequence_begin + size, + thrust::device_ptr(data), + GaussianGenerator(mean, std, seed_offset.first, gen_offset)); + } else { + thrust::transform(index_sequence_begin, index_sequence_begin + size, + thrust::device_ptr(data), + GaussianGenerator(mean, std, seed)); + } } }; @@ -69,17 +90,33 @@ class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel { auto* tensor = context.Output("Out"); T* data = tensor->mutable_data(context.GetPlace()); unsigned int seed = static_cast(context.Attr("seed")); + bool seed_flag = false; if (seed == 0) { std::random_device rd; seed = rd(); + seed_flag = true; } T mean = static_cast(context.Attr("mean")); T std = static_cast(context.Attr("std")); thrust::counting_iterator index_sequence_begin(0); int64_t size = tensor->numel(); - thrust::transform(index_sequence_begin, index_sequence_begin + size, - thrust::device_ptr(data), - GaussianGenerator(mean, std, seed)); + + int device_id = + BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()).GetDeviceId(); + auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); + + if (gen_cuda->GetIsInitPy() && seed_flag) { + auto seed_offset = gen_cuda->IncrementOffset(1); + int gen_offset = size * seed_offset.second; + thrust::transform(index_sequence_begin, index_sequence_begin + size, + thrust::device_ptr(data), + GaussianGenerator(mean, std, seed_offset.first, + seed_offset.second)); + } else { + thrust::transform(index_sequence_begin, index_sequence_begin + size, + thrust::device_ptr(data), + GaussianGenerator(mean, std, seed)); + } } }; } // namespace operators diff --git a/paddle/fluid/operators/grid_sampler_op.cc b/paddle/fluid/operators/grid_sampler_op.cc index deb71b807128e5c0b173b517e60832894ced41e5..f5224239eb2ded9a156aadc9185eca89f4e3396f 100644 --- a/paddle/fluid/operators/grid_sampler_op.cc +++ b/paddle/fluid/operators/grid_sampler_op.cc @@ -115,7 +115,7 @@ class GridSampleOpMaker : public 
framework::OpProtoAndCheckerMaker { AddAttr( "padding_mode", "(bool, default true) The padding method used when source" - "index is out of input images. It can be 'zeros', 'reflect' and " + "index is out of input images. It can be 'zeros', 'reflection' and " "'border'.") .SetDefault("zeros"); @@ -174,6 +174,10 @@ class GridSampleOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output", + framework::GradVarName("X"), "grid_sampler"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Grid")), "Output", + framework::GradVarName("Grid"), "grid_sampler"); auto input_dims = ctx->GetInputDim("X"); auto grid_dims = ctx->GetInputDim("Grid"); if (ctx->HasOutput(framework::GradVarName("X"))) { diff --git a/paddle/fluid/operators/grid_sampler_op.cu b/paddle/fluid/operators/grid_sampler_op.cu index 999f990448ca6370dadacbdaee5bf3bcadcaca0e..4e61d0c2ea7f91e4199c3e9daa3e93ac45bc0eb8 100644 --- a/paddle/fluid/operators/grid_sampler_op.cu +++ b/paddle/fluid/operators/grid_sampler_op.cu @@ -268,7 +268,7 @@ class GridSampleOpCUDAKernel : public framework::OpKernel { Mode mode; if (padding_mode_s == "border") { padding_mode = PaddingMode::border; - } else if (padding_mode_s == "reflect") { + } else if (padding_mode_s == "reflection") { padding_mode = PaddingMode::reflect; } else { padding_mode = PaddingMode::zeros; @@ -432,7 +432,7 @@ class GridSampleGradOpCUDAKernel : public framework::OpKernel { Mode mode; if (padding_mode_s == "border") { padding_mode = PaddingMode::border; - } else if (padding_mode_s == "reflect") { + } else if (padding_mode_s == "reflection") { padding_mode = PaddingMode::reflect; } else { padding_mode = PaddingMode::zeros; diff --git a/paddle/fluid/operators/grid_sampler_op.h b/paddle/fluid/operators/grid_sampler_op.h index 
eda800e78faf5da2bb379b8101e4823c5bc2d2f8..b8faef759ae90e14d1e83b66130bfe957b51907b 100644 --- a/paddle/fluid/operators/grid_sampler_op.h +++ b/paddle/fluid/operators/grid_sampler_op.h @@ -76,7 +76,7 @@ static inline void clip(const platform::CPUDeviceContext& ctx, if (padding_mode == "border") { grid_slice_t.device(place) = grid_slice_t.cwiseMax(static_cast(0)) .cwiseMin(static_cast(max_val)); - } else if (padding_mode == "reflect") { + } else if (padding_mode == "reflection") { if (align_corners) { auto double_range = static_cast(max_val * 2); auto grid_abs = grid_slice_t.abs(); @@ -117,7 +117,7 @@ static inline void clipWithMask(const platform::CPUDeviceContext& ctx, auto in_bound = (res == grid_slice_t); grid_scale_t.device(place) = grid_scale_t * in_bound.template cast(); grid_slice_t.device(place) = res; - } else if (padding_mode == "reflect") { + } else if (padding_mode == "reflection") { if (align_corners) { auto double_range = static_cast(max_val * 2); auto is_neg = (grid_slice_t < static_cast(0)); diff --git a/paddle/fluid/operators/gru_unit_op.h b/paddle/fluid/operators/gru_unit_op.h index 712ef05d8631ac74b92795321202cb5590286e82..4865a02c5292ffb9d079d0711f0bf7d6e927c441 100644 --- a/paddle/fluid/operators/gru_unit_op.h +++ b/paddle/fluid/operators/gru_unit_op.h @@ -47,7 +47,9 @@ class GRUUnitKernel : public framework::OpKernel { else if (act_type == relu) ReluFunctor()(d, x, y); else - PADDLE_THROW("unsupported activation type"); + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported activation type, only supports identity, sigmoid, tanh " + "and relu.")); } void Compute(const framework::ExecutionContext& context) const override { @@ -137,7 +139,9 @@ class GRUUnitGradKernel : public framework::OpKernel { else if (act_type == relu) ReluGradFunctor()(d, x, y, dy, dx); else - PADDLE_THROW("unsupported activation type"); + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported activation type, only supports identity, sigmoid, tanh " + "and 
relu.")); } void Compute(const framework::ExecutionContext& context) const override { diff --git a/paddle/fluid/operators/instance_norm_op.cc b/paddle/fluid/operators/instance_norm_op.cc index f72f7e8b85b873d9be57c8ff348e6adb2251d65d..a5b270c1dfef14bc92697c29bfeafa0fe08211d7 100644 --- a/paddle/fluid/operators/instance_norm_op.cc +++ b/paddle/fluid/operators/instance_norm_op.cc @@ -595,9 +595,13 @@ class InstanceNormDoubleGradKernel first_grad_arr += inv_var_tile_data * - (dy_arr - dy_arr.colwise().sum() / sample_size - + (dy_arr - + dy_arr.colwise().sum().replicate(sample_size, 1) / sample_size - x_sub_mean_mul_invstd_arr * - (dy_arr * x_sub_mean_mul_invstd_arr).colwise().sum() / + (dy_arr * x_sub_mean_mul_invstd_arr) + .colwise() + .sum() + .replicate(sample_size, 1) / sample_size); first_grad_arr = first_grad_arr * ddx_arr; for (int nc = 0; nc < NxC; ++nc) { diff --git a/paddle/fluid/operators/interpolate_op.cc b/paddle/fluid/operators/interpolate_op.cc index 1e99e22e12b2a23685dad742f175fd2b0684d334..e8a9ed878e9bd502b9bd7e7d82f574fb5740bb5d 100644 --- a/paddle/fluid/operators/interpolate_op.cc +++ b/paddle/fluid/operators/interpolate_op.cc @@ -104,12 +104,13 @@ static void Interpolate2DInferShapeCheck(framework::InferShapeContext* ctx) { auto dim_x = ctx->GetInputDim("X"); auto interp_method = ctx->Attrs().Get("interp_method"); - PADDLE_ENFORCE( - "bilinear" == interp_method || "nearest" == interp_method || - "bicubic" == interp_method, - "Interpolation method can only be \"bilinear\" or \"nearest\" when " - "Input(X) dimension is 4, but got method = %s .", - interp_method); + PADDLE_ENFORCE_EQ("bilinear" == interp_method || "nearest" == interp_method || + "bicubic" == interp_method, + true, platform::errors::InvalidArgument( + "Interpolation method can only be \"bilinear\" " + "or \"nearest\" or \"bicubic\" when " + "Input(X) dimension is 4, but got method is %s.", + interp_method)); const DataLayout data_layout = framework::StringToDataLayout( 
ctx->Attrs().Get("data_layout")); @@ -169,13 +170,13 @@ static void Interpolate2DInferShapeCheck(framework::InferShapeContext* ctx) { auto out_size_dim = ctx->GetInputDim("OutSize"); PADDLE_ENFORCE_EQ( out_size_dim.size(), 1, - platform::errors::InvalidArgument( - "OutSize's dimension size must be 1, but got dimension = %d .", - out_size_dim.size())); + platform::errors::InvalidArgument("OutSize's dimension size must be 1, " + "but got dimension size is %d .", + out_size_dim.size())); PADDLE_ENFORCE_EQ( out_size_dim[0], 2, platform::errors::InvalidArgument( - "OutSize's dim[0] must be 2, but got dimention = %d .", + "OutSize's dimension[0] must be 2, but got dimension[0] is %d .", out_size_dim[0])); ctx->ShareLoD("X", "Out"); return; @@ -264,12 +265,15 @@ static void Interpolate3DInferShapeCheck(framework::InferShapeContext* ctx) { if (ctx->HasInput("OutSize") && ctx->IsRuntime()) { auto out_size_dim = ctx->GetInputDim("OutSize"); - PADDLE_ENFORCE_EQ(out_size_dim.size(), 1, - "OutSize's dimension size must be 1, but got size =%d .", - out_size_dim.size()); + PADDLE_ENFORCE_EQ( + out_size_dim.size(), 1, + platform::errors::InvalidArgument( + "OutSize's dimension size must be 1, but got size is %d.", + out_size_dim.size())); PADDLE_ENFORCE_EQ(out_size_dim[0], 3, - "OutSize's dim[0] must be 3, but got size = %d .", - out_size_dim[0]); + platform::errors::InvalidArgument( + "OutSize's dim[0] must be 3, but got size is %d.", + out_size_dim[0])); ctx->ShareLoD("X", "Out"); return; } @@ -289,10 +293,8 @@ class InterpolateOp : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of InterpolateOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of InterpolationOp should not be null."); + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Interpolate"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Interpolate"); auto dim_x 
= ctx->GetInputDim("X"); // NCHW format PADDLE_ENFORCE( @@ -534,9 +536,10 @@ class InterpolateOpGrad : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), - "Input(Out@GRAD) should not be null"); + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "InterpolateGrad"); + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", + "Out@GRAD", "InterpolateGrad"); + auto dim_x = ctx->GetInputDim("X"); if (ctx->HasOutput(framework::GradVarName("X"))) { ctx->SetOutputDim(framework::GradVarName("X"), dim_x); diff --git a/paddle/fluid/operators/interpolate_v2_op.cc b/paddle/fluid/operators/interpolate_v2_op.cc index 12733a0d9f1689a020f77d23cc31b0d19b412746..1f7dde9b931dafa4b8e0bee211e64461b1c21dc5 100644 --- a/paddle/fluid/operators/interpolate_v2_op.cc +++ b/paddle/fluid/operators/interpolate_v2_op.cc @@ -67,7 +67,7 @@ static void Interpolate1DInferShapeCheck(framework::InferShapeContext* ctx) { scale_tensor[0], 1, platform::errors::InvalidArgument( "Scale's shape must be 1, but got shape = %d .", scale_tensor[0])); - // out_w = -1; + out_w = -1; } else { auto scale = ctx->Attrs().Get>("scale"); if (scale.size() > 0) { @@ -159,8 +159,8 @@ static void Interpolate2DInferShapeCheck(framework::InferShapeContext* ctx) { platform::errors::InvalidArgument( "Scale's shape must be 2 or 1, but got shape = %d .", scale_tensor[0])); - // out_h = -1; - // out_w = -1; + out_h = -1; + out_w = -1; } else { auto scale = ctx->Attrs().Get>("scale"); if (scale.size() > 0) { @@ -264,9 +264,9 @@ static void Interpolate3DInferShapeCheck(framework::InferShapeContext* ctx) { platform::errors::InvalidArgument( "Scale's shape must be 3 or 1, but got shape = %d .", scale_tensor[0])); - // out_d = -1; - // out_h = -1; - // out_w = -1; + out_d = -1; + out_h = -1; + out_w = -1; } else { auto scale 
= ctx->Attrs().Get>("scale"); if (scale.size() > 0) { @@ -633,6 +633,9 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(InterpolateV2GradNoNeedBufferVarsInferer, } // namespace operators } // namespace paddle +// interp_v2 support scale_factor whose input type is list, this operation is +// not +// compatible with interp_op, so a new one is added in paddle2.0 namespace ops = paddle::operators; REGISTER_OPERATOR(bilinear_interp_v2, ops::InterpolateV2Op, ops::InterpolateV2OpMaker, diff --git a/paddle/fluid/operators/interpolate_v2_op.cu b/paddle/fluid/operators/interpolate_v2_op.cu index 6cb8104638dea458743374014e7bef35df2dbfcc..816539c3b5fdb805d16fb8224b7c960f797613cb 100644 --- a/paddle/fluid/operators/interpolate_v2_op.cu +++ b/paddle/fluid/operators/interpolate_v2_op.cu @@ -836,12 +836,12 @@ static void Interpolate1DCUDAFwd(const framework::ExecutionContext& ctx, int out_w = ctx.Attr("out_w"); auto list_new_shape_tensor = ctx.MultiInput("SizeTensor"); + float scale_w = -1; if (list_new_shape_tensor.size() > 0) { // have size tensor auto new_size = get_new_shape(list_new_shape_tensor); out_w = new_size[0]; } else { - float scale_w = -1; auto scale_tensor = ctx.Input("Scale"); auto scale = ctx.Attr>("scale"); if (scale_tensor != nullptr) { @@ -887,8 +887,11 @@ static void Interpolate1DCUDAFwd(const framework::ExecutionContext& ctx, float ratio_w = 0.f; if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; ratio_w = (align_corners) ? 
static_cast(in_w - 1.0) / (out_w - 1.0) - : static_cast(in_w) / out_w; + : static_cast(new_scale_w); } int in_cw = c * in_w; @@ -924,14 +927,14 @@ static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx, int out_w = ctx.Attr("out_w"); auto list_new_shape_tensor = ctx.MultiInput("SizeTensor"); + float scale_w = -1; + float scale_h = -1; if (list_new_shape_tensor.size() > 0) { // have size tensor auto new_size = get_new_shape(list_new_shape_tensor); out_h = new_size[0]; out_w = new_size[1]; } else { - float scale_h = -1; - float scale_w = -1; auto scale_tensor = ctx.Input("Scale"); auto scale = ctx.Attr>("scale"); if (scale_tensor != nullptr) { @@ -993,12 +996,18 @@ static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx, float ratio_h = 0.f; float ratio_w = 0.f; if (out_h > 1) { + float new_scale_h = 0.f; + new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) + : static_cast(in_h) / out_h; ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) - : static_cast(in_h) / out_h; + : static_cast(new_scale_h); } if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; ratio_w = (align_corners) ? 
static_cast(in_w - 1) / (out_w - 1) - : static_cast(in_w) / out_w; + : static_cast(new_scale_w); } int in_hw = in_h * in_w; @@ -1048,6 +1057,9 @@ static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx, int out_w = ctx.Attr("out_w"); auto list_new_shape_tensor = ctx.MultiInput("SizeTensor"); + float scale_w = -1; + float scale_d = -1; + float scale_h = -1; if (list_new_shape_tensor.size() > 0) { // have size tensor auto new_size = get_new_shape(list_new_shape_tensor); @@ -1055,9 +1067,6 @@ static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx, out_h = new_size[1]; out_w = new_size[2]; } else { - float scale_d = -1; - float scale_h = -1; - float scale_w = -1; auto scale_tensor = ctx.Input("Scale"); auto scale = ctx.Attr>("scale"); if (scale_tensor != nullptr) { @@ -1129,16 +1138,25 @@ static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx, float ratio_h = 0.f; float ratio_w = 0.f; if (out_d > 1) { + float new_scale_d = 0.f; + new_scale_d = (scale_d > 0) ? static_cast(1. / scale_d) + : static_cast(in_d) / out_d; ratio_d = (align_corners) ? static_cast(in_d - 1) / (out_d - 1) - : static_cast(in_d) / out_d; + : static_cast(new_scale_d); } if (out_h > 1) { + float new_scale_h = 0.f; + new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) + : static_cast(in_h) / out_h; ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) - : static_cast(in_h) / out_h; + : static_cast(new_scale_h); } if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) - : static_cast(in_w) / out_w; + : static_cast(new_scale_w); } int in_dhw = in_d * in_h * in_w; @@ -1230,8 +1248,11 @@ static void Interpolate1DCUDABwd(const framework::ExecutionContext& ctx, float ratio_w = 0.f; if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. 
/ scale_w) + : static_cast(in_w) / out_w; ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) - : static_cast(in_w) / out_w; + : static_cast(new_scale_w); } int in_cw = c * in_w; int out_cw = c * out_w; @@ -1333,12 +1354,18 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx, float ratio_h = 0.f; float ratio_w = 0.f; if (out_h > 1) { + float new_scale_h = 0.f; + new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) + : static_cast(in_h) / out_h; ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) - : static_cast(in_h) / out_h; + : static_cast(new_scale_h); } if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) - : static_cast(in_w) / out_w; + : static_cast(new_scale_w); } int in_hw = in_h * in_w; @@ -1464,16 +1491,25 @@ static void Interpolate3DCUDABwd(const framework::ExecutionContext& ctx, float ratio_h = 0.f; float ratio_w = 0.f; if (out_d > 1) { + float new_scale_d = 0.f; + new_scale_d = (scale_d > 0) ? static_cast(1. / scale_d) + : static_cast(in_d) / out_d; ratio_d = (align_corners) ? static_cast(in_d - 1) / (out_d - 1) - : static_cast(in_d) / out_d; + : static_cast(new_scale_d); } if (out_h > 1) { + float new_scale_h = 0.f; + new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) + : static_cast(in_h) / out_h; ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) - : static_cast(in_h) / out_h; + : static_cast(new_scale_h); } if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; ratio_w = (align_corners) ? 
static_cast(in_w - 1) / (out_w - 1) - : static_cast(in_w) / out_w; + : static_cast(new_scale_w); } int in_dhw = in_d * in_h * in_w; diff --git a/paddle/fluid/operators/interpolate_v2_op.h b/paddle/fluid/operators/interpolate_v2_op.h index 111766934b8300c0a7b46ae9a065b8c42460e577..4e4fd9ff63ba47b41363a81d6cc527486671d695 100644 --- a/paddle/fluid/operators/interpolate_v2_op.h +++ b/paddle/fluid/operators/interpolate_v2_op.h @@ -783,12 +783,13 @@ static void Interpolate1DCPUFwd(const framework::ExecutionContext& ctx, int out_w = ctx.Attr("out_w"); auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); + float scale_w = -1.; if (list_new_size_tensor.size() > 0) { // have size tensor auto new_size = get_new_shape(list_new_size_tensor); out_w = new_size[0]; } else { - float scale_w = -1; + // float scale_w = -1; auto scale_tensor = ctx.Input("Scale"); auto scale = ctx.Attr>("scale"); if (scale_tensor != nullptr) { @@ -833,8 +834,11 @@ static void Interpolate1DCPUFwd(const framework::ExecutionContext& ctx, float ratio_w = 0.f; if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; ratio_w = (align_corners) ? 
static_cast(in_w - 1) / (out_w - 1) - : static_cast(in_w) / out_w; + : static_cast(new_scale_w); } if ("linear" == interp_method) { LinearInterpolation(input, output, ratio_w, in_w, n, c, out_w, @@ -856,6 +860,8 @@ static void Interpolate2DCPUFwd(const framework::ExecutionContext& ctx, int out_h = ctx.Attr("out_h"); int out_w = ctx.Attr("out_w"); + float scale_h = -1; + float scale_w = -1; auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); if (list_new_size_tensor.size() > 0) { @@ -864,8 +870,6 @@ static void Interpolate2DCPUFwd(const framework::ExecutionContext& ctx, out_h = new_size[0]; out_w = new_size[1]; } else { - float scale_h = -1; - float scale_w = -1; auto scale_tensor = ctx.Input("Scale"); auto scale = ctx.Attr>("scale"); if (scale_tensor != nullptr) { @@ -925,12 +929,18 @@ static void Interpolate2DCPUFwd(const framework::ExecutionContext& ctx, float ratio_h = 0.f; float ratio_w = 0.f; if (out_h > 1) { + float new_scale_h = 0.f; + new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) + : static_cast(in_h) / out_h; ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) - : static_cast(in_h) / out_h; + : static_cast(new_scale_h); } if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; ratio_w = (align_corners) ? 
static_cast(in_w - 1) / (out_w - 1) - : static_cast(in_w) / out_w; + : static_cast(new_scale_w); } if ("bilinear" == interp_method) { @@ -962,6 +972,10 @@ static void Interpolate3DCPUFwd(const framework::ExecutionContext& ctx, int out_h = ctx.Attr("out_h"); int out_w = ctx.Attr("out_w"); + float scale_d = -1; + float scale_h = -1; + float scale_w = -1; + auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); if (list_new_size_tensor.size() > 0) { // have size tensor @@ -970,9 +984,6 @@ static void Interpolate3DCPUFwd(const framework::ExecutionContext& ctx, out_h = new_size[1]; out_w = new_size[2]; } else { - float scale_d = -1; - float scale_h = -1; - float scale_w = -1; auto scale_tensor = ctx.Input("Scale"); auto scale = ctx.Attr>("scale"); if (scale_tensor != nullptr) { @@ -1043,16 +1054,25 @@ static void Interpolate3DCPUFwd(const framework::ExecutionContext& ctx, float ratio_h = 0.f; float ratio_w = 0.f; if (out_d > 1) { + float new_scale_d = 0.f; + new_scale_d = (scale_d > 0) ? static_cast(1. / scale_d) + : static_cast(in_d) / out_d; ratio_d = (align_corners) ? static_cast(in_d - 1) / (out_d - 1) - : static_cast(in_d) / out_d; + : static_cast(new_scale_d); } if (out_h > 1) { + float new_scale_h = 0.f; + new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) + : static_cast(in_h) / out_h; ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) - : static_cast(in_h) / out_h; + : static_cast(new_scale_h); } if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) - : static_cast(in_w) / out_w; + : static_cast(new_scale_w); } if ("trilinear" == interp_method) { @@ -1127,8 +1147,11 @@ static void Interpolate1DCPUBwd(const framework::ExecutionContext& ctx, float ratio_w = 0.f; if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. 
/ scale_w) + : static_cast(in_w) / out_w; ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) - : static_cast(in_w) / out_w; + : static_cast(new_scale_w); } if ("linear" == interp_method) { LinearInterpolationGrad(output_grad, input_grad, ratio_w, in_w, n, c, @@ -1216,12 +1239,18 @@ static void Interpolate2DCPUBwd(const framework::ExecutionContext& ctx, float ratio_h = 0.f; float ratio_w = 0.f; if (out_h > 1) { + float new_scale_h = 0.f; + new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) + : static_cast(in_h) / out_h; ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) - : static_cast(in_h) / out_h; + : static_cast(new_scale_h); } if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) - : static_cast(in_w) / out_w; + : static_cast(new_scale_w); } if ("bilinear" == interp_method) { @@ -1327,16 +1356,25 @@ static void Interpolate3DCPUBwd(const framework::ExecutionContext& ctx, float ratio_h = 0.f; float ratio_w = 0.f; if (out_d > 1) { + float new_scale_d = 0.f; + new_scale_d = (scale_d > 0) ? static_cast(1. / scale_d) + : static_cast(in_d) / out_d; ratio_d = (align_corners) ? static_cast(in_d - 1) / (out_d - 1) - : static_cast(in_d) / out_d; + : static_cast(new_scale_d); } if (out_h > 1) { + float new_scale_h = 0.f; + new_scale_h = (scale_h > 0) ? static_cast(1. / scale_h) + : static_cast(in_h) / out_h; ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) - : static_cast(in_h) / out_h; + : static_cast(new_scale_h); } if (out_w > 1) { + float new_scale_w = 0.f; + new_scale_w = (scale_w > 0) ? static_cast(1. / scale_w) + : static_cast(in_w) / out_w; ratio_w = (align_corners) ? 
static_cast(in_w - 1) / (out_w - 1) - : static_cast(in_w) / out_w; + : static_cast(new_scale_w); } if ("trilinear" == interp_method) { diff --git a/paddle/fluid/operators/isfinite_op.cc b/paddle/fluid/operators/isfinite_op.cc index af737ec42f631c534bb26ad38901e03d804d07b3..9b92ce3e538aa660dedda67de0cabaa4adbdc8c7 100644 --- a/paddle/fluid/operators/isfinite_op.cc +++ b/paddle/fluid/operators/isfinite_op.cc @@ -43,7 +43,11 @@ class OverflowOp : public framework::OperatorWithKernel { } else if (x_var->IsType()) { dtype = x_var->Get().value().type(); } else { - PADDLE_THROW("Cannot find the input data type by all input data"); + PADDLE_ENFORCE_EQ( + true, false, + platform::errors::InvalidArgument( + "The input type mismatch, the type of Input(X) must be Tensor or " + "SelectedRows, please check your input.")); } return framework::OpKernelType(framework::proto::VarType::Type(dtype), ctx.GetPlace()); diff --git a/paddle/fluid/operators/isfinite_op.h b/paddle/fluid/operators/isfinite_op.h index 83b080856366ac3332c5856a19b721893bb80eb3..2fc0d58669bae428d811c7200e025f36f087b905 100644 --- a/paddle/fluid/operators/isfinite_op.h +++ b/paddle/fluid/operators/isfinite_op.h @@ -57,7 +57,11 @@ class OverflowKernel : public framework::OpKernel { auto& in = ctx.Input("X")->value(); functor(in, out); } else { - PADDLE_THROW("Unsupported input type."); + PADDLE_ENFORCE_EQ( + true, false, + platform::errors::InvalidArgument( + "The input type mismatch, the type of Input(X) must be Tensor or " + "SelectedRows, please check your input.")); } } }; diff --git a/paddle/fluid/operators/kldiv_loss_op.h b/paddle/fluid/operators/kldiv_loss_op.h index 369fdb4872b4184d706a5264b58f70f63051fca1..857ecda303c2607b1b6fb9a5d2ec132b335d6c29 100644 --- a/paddle/fluid/operators/kldiv_loss_op.h +++ b/paddle/fluid/operators/kldiv_loss_op.h @@ -72,7 +72,11 @@ class KLDivLossKernel : public framework::OpKernel { loss_t.device(place) = output; } else if ("batchmean" == reduction) { auto output_sum = 
output.sum(); - loss_t.device(place) = output_sum / output_sum.constant(n); + if (n > 0) { + loss_t.device(place) = output_sum / output_sum.constant(n); + } else { + loss_t.device(place) = output_sum; + } } else if ("mean" == reduction) { loss_t.device(place) = output.mean(); } else if ("sum" == reduction) { diff --git a/paddle/fluid/operators/linspace_op.cc b/paddle/fluid/operators/linspace_op.cc index 2c3172d2a1112e2c79a3c1215ccd0d3f08d59451..7cc07383bfa5f67a2404b220cb481d9017b40fd8 100644 --- a/paddle/fluid/operators/linspace_op.cc +++ b/paddle/fluid/operators/linspace_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/linspace_op.h" +#include namespace paddle { namespace operators { @@ -21,9 +22,7 @@ class LinspaceOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("Start"), - "Input(Start) of LinspaceOp should not be null."); + void InferShape(framework::InferShapeContext *ctx) const override { OP_INOUT_CHECK(ctx->HasInput("Start"), "Input", "Start", "linspace"); OP_INOUT_CHECK(ctx->HasInput("Stop"), "Input", "Stop", "linspace"); OP_INOUT_CHECK(ctx->HasInput("Num"), "Input", "Num", "linspace"); @@ -52,11 +51,17 @@ class LinspaceOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { + const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( framework::proto::VarType::Type(ctx.Attr("dtype")), ctx.GetPlace()); } + + framework::OpKernelType GetKernelTypeForVar( + const std::string &var_name, const framework::Tensor &tensor, + const framework::OpKernelType &expected_kernel_type) const override { + return expected_kernel_type; + } }; class LinspaceOpMaker : public 
framework::OpProtoAndCheckerMaker { diff --git a/paddle/fluid/operators/linspace_op.cu b/paddle/fluid/operators/linspace_op.cu index 8aca892a81d41b1e0a9f7f9c14169c2817ae9452..a4f0693323297c286d24b169f1120e4017992a9b 100644 --- a/paddle/fluid/operators/linspace_op.cu +++ b/paddle/fluid/operators/linspace_op.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/data_type_transform.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/linspace_op.h" #include "paddle/fluid/platform/cuda_primitives.h" @@ -19,10 +20,19 @@ limitations under the License. */ namespace paddle { namespace operators { +using Tensor = framework::Tensor; + template -__global__ void LinspaceKernel(T start, double step, int64_t size, T* out) { - CUDA_KERNEL_LOOP(index, size) { - out[index] = static_cast(start + step * index); +__global__ void LinspaceKernel(T start, T stop, double step, int64_t size, + T* out) { + int64_t index = blockIdx.x * blockDim.x + threadIdx.x; + + for (; index < size; index += blockDim.x * gridDim.x) { + if (index < size / 2) { + out[index] = static_cast(start + step * index); + } else { + out[index] = static_cast(stop - step * (size - index - 1)); + } } } @@ -35,33 +45,52 @@ template class CUDALinspaceKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* start_t = context.Input("Start"); - auto* stop_t = context.Input("Stop"); + auto* pre_start = context.Input("Start"); + auto* pre_stop = context.Input("Stop"); auto* num_t = context.Input("Num"); auto* out = context.Output("Out"); + auto dtype = static_cast( + context.Attr("dtype")); - framework::Tensor n; - framework::TensorCopy(*start_t, platform::CPUPlace(), &n); - T start = n.data()[0]; - framework::TensorCopy(*stop_t, platform::CPUPlace(), 
&n); - T stop = n.data()[0]; - framework::TensorCopy(*num_t, platform::CPUPlace(), &n); - int32_t num = n.data()[0]; + Tensor start_t; + Tensor stop_t; + auto start_dtype = + framework::OpKernelType(pre_start->type(), context.GetPlace()); + auto stop_dtype = + framework::OpKernelType(pre_stop->type(), context.GetPlace()); + auto out_dtype = framework::OpKernelType(dtype, context.GetPlace()); + framework::TransDataType(start_dtype, out_dtype, *pre_start, &start_t); + framework::TransDataType(stop_dtype, out_dtype, *pre_stop, &stop_t); - PADDLE_ENFORCE(num > 0, "The num of linspace op should be larger than 0."); + framework::Tensor n_start; + framework::Tensor n_stop; + framework::Tensor n_num; + framework::TensorCopy(start_t, platform::CPUPlace(), &n_start); + T start = n_start.data()[0]; + framework::TensorCopy(stop_t, platform::CPUPlace(), &n_stop); + T stop = n_stop.data()[0]; + framework::TensorCopy(*num_t, platform::CPUPlace(), &n_num); + int64_t num = static_cast(n_num.data()[0]); + + PADDLE_ENFORCE_GT(num, 0, platform::errors::InvalidArgument( + "The num of linspace op should be larger " + "than 0, but received num is %d", + num)); out->Resize(framework::make_ddim({num})); T* out_data = out->mutable_data(context.GetPlace()); double step = 0; - if (num != 1) { - step = (static_cast(stop - start)) / (num - 1); - } - auto stream = context.cuda_device_context().stream(); int block = 512; int grid = (num + block - 1) / block; - LinspaceKernel<<>>(start, step, num, out_data); + if (num != 1) { + step = (static_cast(stop - start)) / (num - 1); + LinspaceKernel<<>>(start, stop, step, num, + out_data); + } else { + LinspaceSpecialKernel<<>>(start, out_data); + } } }; diff --git a/paddle/fluid/operators/linspace_op.h b/paddle/fluid/operators/linspace_op.h index 9fb4960375ed7be60598d558c65310bd4a4b84bc..d8e0fefe175869171cac9c8d3798880e844dbe35 100644 --- a/paddle/fluid/operators/linspace_op.h +++ b/paddle/fluid/operators/linspace_op.h @@ -14,30 +14,57 @@ limitations 
under the License. */ #pragma once #include +#include "paddle/fluid/framework/data_type_transform.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; + template class CPULinspaceKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - T start = context.Input("Start")->data()[0]; - T stop = context.Input("Stop")->data()[0]; + auto* pre_start = context.Input("Start"); + auto* pre_stop = context.Input("Stop"); int32_t num = context.Input("Num")->data()[0]; auto* out = context.Output("Out"); - PADDLE_ENFORCE(num > 0, "The num of linspace op should be larger than 0."); + auto dtype = static_cast( + context.Attr("dtype")); + + Tensor start_t; + Tensor stop_t; + auto start_dtype = + framework::OpKernelType(pre_start->type(), context.GetPlace()); + auto stop_dtype = + framework::OpKernelType(pre_stop->type(), context.GetPlace()); + auto out_dtype = framework::OpKernelType(dtype, context.GetPlace()); + framework::TransDataType(start_dtype, out_dtype, *pre_start, &start_t); + framework::TransDataType(stop_dtype, out_dtype, *pre_stop, &stop_t); + + T start = start_t.data()[0]; + T stop = stop_t.data()[0]; + PADDLE_ENFORCE_GT(num, 0, platform::errors::InvalidArgument( + "The num of linspace op should be larger " + "than 0, but received num is %d", + num)); out->Resize(framework::make_ddim({num})); T* out_data = out->mutable_data(context.GetPlace()); if (num > 1) { + // step should be of double type for all types double step = (static_cast(stop - start)) / (num - 1); + int half_num = num / 2; for (int i = 0; i < num; ++i) { - out_data[i] = static_cast(start + step * i); + if (i < half_num) { + out_data[i] = static_cast(start + step * i); + } else { + out_data[i] = static_cast(stop - step * (num - i - 1)); + } } } else { out_data[0] = static_cast(start); diff --git 
a/paddle/fluid/operators/lite/lite_engine_op.h b/paddle/fluid/operators/lite/lite_engine_op.h index a920bf7c3f505b839f8f1fd252c9f8505393f3a9..f6d65704388e6ec90c9209475e5f4b19061085fa 100644 --- a/paddle/fluid/operators/lite/lite_engine_op.h +++ b/paddle/fluid/operators/lite/lite_engine_op.h @@ -39,7 +39,7 @@ class LiteEngineOp : public framework::OperatorBase { private: std::vector in_names_; std::vector out_names_; - paddle::lite::Predictor *engine_; + paddle::lite_api::PaddlePredictor *engine_; framework::proto::VarType::Type precision_; bool use_gpu_; bool zero_copy_; @@ -78,10 +78,10 @@ class LiteEngineOp : public framework::OperatorBase { framework::LoDTensor src_t = inference::analysis::GetFromScope(scope, in_names_[i]); - paddle::lite::Tensor *dst_t = engine_->GetInput(i); + paddle::lite_api::Tensor dst_t = *(engine_->GetInput(i)); VLOG(3) << "== fluid -> lite (" << in_names_[i] << " -> " << engine_->GetInputNames()[i] << ")"; - inference::lite::utils::TensorCopy(dst_t, &src_t, *ctx, zero_copy_); + inference::lite::utils::TensorCopy(&dst_t, &src_t, *ctx, zero_copy_); } #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(dev_place)) { @@ -93,7 +93,7 @@ class LiteEngineOp : public framework::OperatorBase { engine_->Run(); VLOG(3) << "lite engine run done"; for (size_t i = 0; i < out_names_.size(); i++) { - paddle::lite::Tensor src_t = *(engine_->GetOutput(i)); + paddle::lite_api::Tensor src_t = *(engine_->GetOutput(i)); framework::LoDTensor *dst_t = &inference::analysis::GetFromScope( scope, out_names_[i]); diff --git a/paddle/fluid/operators/lite/lite_engine_op_test.cc b/paddle/fluid/operators/lite/lite_engine_op_test.cc index fb5c0dcb3514de815b97944d0fdbf3bd7853b628..76c963ac652687cb0f65a0497b5c994f82d0d7aa 100644 --- a/paddle/fluid/operators/lite/lite_engine_op_test.cc +++ b/paddle/fluid/operators/lite/lite_engine_op_test.cc @@ -84,10 +84,10 @@ TEST(LiteEngineOp, engine_op) { inference::lite::EngineConfig config; config.valid_places = { #ifdef 
PADDLE_WITH_CUDA - paddle::lite::Place({TARGET(kCUDA), PRECISION(kFloat)}), + paddle::lite_api::Place({TARGET(kCUDA), PRECISION(kFloat)}), #endif - paddle::lite::Place({TARGET(kHost), PRECISION(kAny)}), - paddle::lite::Place({TARGET(kX86), PRECISION(kFloat)}), + paddle::lite_api::Place({TARGET(kX86), PRECISION(kFloat)}), + paddle::lite_api::Place({TARGET(kHost), PRECISION(kAny)}), }; serialize_params(&(config.param), &scope, repetitive_params); config.model = program.Proto()->SerializeAsString(); diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 3a19c7edff3569d503480fd060a6432dc59d2108..24ed4fcf6684980b217aad35dc124acef653c9b9 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -9,7 +9,11 @@ function(math_library TARGET) set(hip_srcs) set(math_common_deps device_context framework_proto enforce) if (WITH_GPU) - list(APPEND math_common_deps cub) + if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) + list(APPEND math_common_deps cub) + else() + list(APPEND math_common_deps) + endif() endif() set(multiValueArgs DEPS) cmake_parse_arguments(math_library "${options}" "${oneValueArgs}" @@ -72,6 +76,7 @@ math_library(prelu) math_library(bert_encoder_functor) math_library(tree2col DEPS math_function) math_library(matrix_inverse) +math_library(segment_pooling) cc_test(math_function_test SRCS math_function_test.cc DEPS math_function) cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor) diff --git a/paddle/fluid/operators/math/beam_search.cc b/paddle/fluid/operators/math/beam_search.cc index 0155ef188ef967fbf67505d28beeeaf956bb3a70..550de1aadde2935fae34226dba78cc06d82cd1f3 100644 --- a/paddle/fluid/operators/math/beam_search.cc +++ b/paddle/fluid/operators/math/beam_search.cc @@ -87,7 +87,10 @@ class BeamSearchFunctor { lod[0].assign(high_level.begin(), high_level.end()); lod[1].assign(low_level.begin(), low_level.end()); if 
(!framework::CheckLoD(lod)) { - PADDLE_THROW("lod %s is not right", framework::LoDToString(lod)); + PADDLE_THROW(platform::errors::InvalidArgument( + "lod %s is not right in" + " beam_search, please check your code.", + framework::LoDToString(lod))); } selected_ids->set_lod(lod); selected_scores->set_lod(lod); diff --git a/paddle/fluid/operators/math/beam_search.cu b/paddle/fluid/operators/math/beam_search.cu index cf6d44c1abc531da9d00738bba22f70a4c68bbab..ed3ead47d171efb4128a294c7d7a24324c7187b7 100644 --- a/paddle/fluid/operators/math/beam_search.cu +++ b/paddle/fluid/operators/math/beam_search.cu @@ -400,7 +400,10 @@ class BeamSearchFunctor { context.Wait(); if (!framework::CheckLoD(selected_lod)) { - PADDLE_THROW("lod %s is not right", framework::LoDToString(selected_lod)); + PADDLE_THROW(platform::errors::InvalidArgument( + "lod %s is not right in" + " beam_search, please check your code.", + framework::LoDToString(selected_lod))); } selected_ids->set_lod(selected_lod); diff --git a/paddle/fluid/operators/math/blas.cc b/paddle/fluid/operators/math/blas.cc index 6a143b3c056455595fdedc131b0c5f4ee756e1e0..2a7ce83967f0f74f4c2178dd4277e6a1687b5ec7 100644 --- a/paddle/fluid/operators/math/blas.cc +++ b/paddle/fluid/operators/math/blas.cc @@ -20,7 +20,11 @@ namespace operators { namespace math { MatDescriptor CreateMatrixDescriptor(const framework::DDim &tensor_dim, int num_flatten_cols, bool trans) { - PADDLE_ENFORCE_GT(tensor_dim.size(), 1); + PADDLE_ENFORCE_GT( + tensor_dim.size(), 1, + platform::errors::InvalidArgument("The tensor dim size should be greater " + "than 1, but reveived dim size is %d", + tensor_dim.size())); MatDescriptor retv; if (num_flatten_cols > 1) { auto flatten_dim = framework::flatten_to_2d(tensor_dim, num_flatten_cols); diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h index d0c5f74d4efb8248b41d8b2af285e8dd7ec4d479..a0464cf70e2dcc44c42fc2ca7440680ef8a53e6e 100644 --- 
a/paddle/fluid/operators/math/blas_impl.cu.h +++ b/paddle/fluid/operators/math/blas_impl.cu.h @@ -60,7 +60,8 @@ struct CUBlas { PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cublasSgemmStridedBatched(args...)); #else - PADDLE_THROW("SgemmStridedBatched is not supported on cuda <= 7.5"); + PADDLE_THROW(platform::errors::Unimplemented( + "SgemmStridedBatched is not supported on cuda <= 7.5")); #endif } @@ -85,7 +86,8 @@ struct CUBlas { beta, C, Ctype, ldc)); }); #else - PADDLE_THROW("cublasSgemmEx is supported on cuda >= 8.0"); + PADDLE_THROW(platform::errors::Unimplemented( + "cublasSgemmEx is not supported on cuda <= 7.5")); #endif } @@ -146,13 +148,15 @@ struct CUBlas { PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cublasDgemmStridedBatched(args...)); #else - PADDLE_THROW("DgemmStridedBatched is not supported on cuda <= 7.5"); + PADDLE_THROW(platform::errors::Unimplemented( + "DgemmStridedBatched is not supported on cuda <= 7.5")); #endif } template static void GEMM_EX(ARGS... args) { - PADDLE_THROW("Currently there are not cublasDgemmEx."); + PADDLE_THROW(platform::errors::Unimplemented( + "Currently there are not cublasDgemmEx.")); } template @@ -216,7 +220,8 @@ struct CUBlas { reinterpret_cast(beta), reinterpret_cast<__half *>(C), ldc, strideC, batchCount)); #else - PADDLE_THROW("HgemmStridedBatched is not supported on cuda <= 7.5"); + PADDLE_THROW(platform::errors::Unimplemented( + "HgemmStridedBatched is not supported on cuda <= 7.5")); #endif } @@ -247,7 +252,8 @@ struct CUBlas { beta, C, Ctype, ldc, computeType, algo)); }); #else - PADDLE_THROW("cublasGemmEx is supported on cuda >= 8.0"); + PADDLE_THROW(platform::errors::Unimplemented( + "cublasGemmEx is not supported on cuda <= 7.5")); #endif } }; @@ -302,8 +308,12 @@ inline void Blas::GEMM( (transB == CblasNoTrans) ? 
CUBLAS_OP_N : CUBLAS_OP_T; // TODO(kexinzhao): add processing code for compute capability < 53 case - PADDLE_ENFORCE_GE(context_.GetComputeCapability(), 53, - "cublas fp16 gemm requires GPU compute capability >= 53"); + PADDLE_ENFORCE_GE( + context_.GetComputeCapability(), 53, + platform::errors::InvalidArgument( + "cublas fp16 gemm requires GPU compute capability >= 53," + "but received %d", + context_.GetComputeCapability())); float h_alpha = static_cast(alpha); float h_beta = static_cast(beta); diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h index 892bf15738141bfbb7e75fa6b37c0cda53a8e098..515d6a2435e86fe07ffe1309628ef2fbeefdc6f0 100644 --- a/paddle/fluid/operators/math/blas_impl.h +++ b/paddle/fluid/operators/math/blas_impl.h @@ -29,7 +29,8 @@ template <> struct CBlas { template static void VCOPY(ARGS... args) { - PADDLE_THROW("Blas VCOPY don't support int8_t"); + PADDLE_THROW(platform::errors::Unimplemented( + "Blas VCOPY do not supported on CPU, please check your code")); } }; @@ -347,22 +348,47 @@ struct CBlas { template <> struct CBlas { - static void GEMM(...) { PADDLE_THROW("float16 GEMM not supported on CPU"); } + static void GEMM(...) { + PADDLE_THROW(platform::errors::Unimplemented( + "float16 GEMM not supported on CPU, please check your code")); + } + static void SMM_GEMM(...) { - PADDLE_THROW("float16 SMM_GEMM not supported on CPU"); + PADDLE_THROW(platform::errors::Unimplemented( + "float16 SMM_GEMM not supported on CPU, please check your code")); } - static void VMUL(...) { PADDLE_THROW("float16 VMUL not supported on CPU"); } - static void VEXP(...) { PADDLE_THROW("float16 VEXP not supported on CPU"); } - static void VSQUARE(...) { - PADDLE_THROW("float16 VSQUARE not supported on CPU"); + static void VMUL(...) { + PADDLE_THROW(platform::errors::Unimplemented( + "float16 VMUL not supported on CPU, please check your code")); } - static void VPOW(...) 
{ PADDLE_THROW("float16 VPOW not supported on CPU"); } - static void DOT(...) { PADDLE_THROW("float16 DOT not supported on CPU"); }; - static void SCAL(...) { PADDLE_THROW("float16 SCAL not supported on CPU"); }; - static void ASUM(...) { PADDLE_THROW("float16 ASUM not supported on CPU"); }; + static void VEXP(...) { + PADDLE_THROW(platform::errors::Unimplemented( + "float16 VEXP not supported on CPU, please check your code")); + } + static void VSQUARE(...) { + PADDLE_THROW(platform::errors::Unimplemented( + "float16 VSQUARE not supported on CPU, please check your code")); + } + static void VPOW(...) { + PADDLE_THROW(platform::errors::Unimplemented( + "float16 VPOW not supported on CPU, please check your code")); + } + static void DOT(...) { + PADDLE_THROW(platform::errors::Unimplemented( + "float16 DOT not supported on CPU, please check your code")); + }; + static void SCAL(...) { + PADDLE_THROW(platform::errors::Unimplemented( + "float16 SCAL not supported on CPU, please check your code")); + }; + static void ASUM(...) { + PADDLE_THROW(platform::errors::Unimplemented( + "float16 ASUM not supported on CPU, please check your code")); + }; #ifdef PADDLE_WITH_MKLML static void GEMM_BATCH(...) 
{ - PADDLE_THROW("float16 GEMM_BATCH not supported on CPU"); + PADDLE_THROW(platform::errors::Unimplemented( + "float16 GEMM_BATCH not supported on CPU, please check your code")); } #endif }; @@ -446,11 +472,18 @@ void Blas::MatMul(const framework::Tensor &mat_a, bool trans_a, auto dim_a = mat_a.dims(); auto dim_b = mat_b.dims(); auto dim_out = mat_out->dims(); - PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, - "The input and output of matmul be matrix"); - PADDLE_ENFORCE( - mat_a.place() == mat_b.place() && mat_a.place() == mat_out->place(), - "The places of matrices must be same"); + PADDLE_ENFORCE_EQ( + dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, true, + platform::errors::InvalidArgument( + "The input and output of matmul should be matrix, the dim size must " + "be 2," + "but received dim size input_a:%d, input_b:%d, output:%d", + dim_a.size(), dim_b.size(), dim_out.size())); + PADDLE_ENFORCE_EQ( + mat_a.place() == mat_b.place() && mat_a.place() == mat_out->place(), true, + platform::errors::InvalidArgument("The places of matrices in the matmul " + "should be same, please check your " + "code.")); int M = dim_out[0]; int N = dim_out[1]; @@ -715,7 +748,13 @@ void Blas::BatchedGEMMWithHead( } } else { - PADDLE_ENFORCE_EQ(W1, H2); + PADDLE_ENFORCE_EQ( + W1, H2, + platform::errors::InvalidArgument( + "The fisrt matrix width should be same as second matrix height," + "but received fisrt matrix width %d" + ", second matrix height %d", + W1, H2)); int ldc = W2 * head_number; int sub_width = W1 / head_number; @@ -785,7 +824,14 @@ void Blas::MatMul(const framework::Tensor &mat_a, const framework::Tensor &mat_b, const MatDescriptor &dim_b, T alpha, framework::Tensor *mat_out, T beta) const { - PADDLE_ENFORCE_EQ(dim_a.width_, dim_b.height_); + PADDLE_ENFORCE_EQ( + dim_a.width_, dim_b.height_, + platform::errors::InvalidArgument( + "The fisrt matrix width should be same as second matrix height," + "but received fisrt matrix 
width %d" + ", second matrix height %d", + dim_a.width_, dim_b.height_)); + CBLAS_TRANSPOSE transA = !dim_a.trans_ ? CblasNoTrans : CblasTrans; CBLAS_TRANSPOSE transB = !dim_b.trans_ ? CblasNoTrans : CblasTrans; if (dim_a.batch_size_ == 0 && dim_b.batch_size_ == 0) { @@ -793,12 +839,14 @@ void Blas::MatMul(const framework::Tensor &mat_a, dim_a.width_, alpha, mat_a.data(), mat_b.data(), beta, mat_out->data()); } else { - PADDLE_ENFORCE(dim_a.batch_size_ == dim_b.batch_size_ || - dim_a.batch_size_ == 0 || dim_b.batch_size_ == 0, - "dim_a.batch_size should be equal to dim_b.batch_size, or " - "one of dim_a.batch_size and dim_b.batch_size should be 0. " - "But got dim_a.batch_size = %d, dim_b.batch_size = %d.", - dim_a.batch_size_, dim_b.batch_size_); + PADDLE_ENFORCE_EQ( + dim_a.batch_size_ == dim_b.batch_size_ || dim_a.batch_size_ == 0 || + dim_b.batch_size_ == 0, + true, platform::errors::InvalidArgument( + "dim_a.batch_size should be equal to dim_b.batch_size, or " + "one of dim_a.batch_size and dim_b.batch_size should be 0. 
" + "But got dim_a.batch_size = %d, dim_b.batch_size = %d.", + dim_a.batch_size_, dim_b.batch_size_)); this->template BatchedGEMM( transA, transB, dim_a.height_, dim_b.width_, dim_a.width_, alpha, mat_a.data(), mat_b.data(), beta, mat_out->data(), @@ -834,15 +882,42 @@ void Blas::MatMulWithHead(const framework::Tensor &mat_a, int head_number, framework::Tensor *mat_out, T beta, bool mat_b_split_vertical) const { - PADDLE_ENFORCE_EQ(dim_a.width_ % head_number, 0); - PADDLE_ENFORCE_GE(head_number, 1); - PADDLE_ENFORCE_LE(head_number, dim_a.width_); + PADDLE_ENFORCE_EQ( + dim_a.width_ % head_number, 0, + platform::errors::InvalidArgument( + "The first input width must be some times the head number" + "but received first input width %d" + ", head_number %d", + dim_a.width_, head_number)); + PADDLE_ENFORCE_GE(head_number, 1, + platform::errors::InvalidArgument( + "The head number should be greater equal 1," + "but received head number %d", + head_number)); + PADDLE_ENFORCE_LE( + head_number, dim_a.width_, + platform::errors::InvalidArgument( + "The head number should be less equal first input width," + "but received first input width %d" + ", head_number %d", + dim_a.width_, head_number)); CBLAS_TRANSPOSE transA = !dim_a.trans_ ? CblasNoTrans : CblasTrans; CBLAS_TRANSPOSE transB = !dim_b.trans_ ? 
CblasNoTrans : CblasTrans; if (mat_b_split_vertical) { - PADDLE_ENFORCE_EQ(dim_b.height_, dim_a.width_ / head_number); - PADDLE_ENFORCE_EQ(dim_b.width_ % head_number, 0); + PADDLE_ENFORCE_EQ( + dim_b.height_, dim_a.width_ / head_number, + platform::errors::InvalidArgument( + "The second input height should be equal than first input width," + "but received second input height %d, first input width %d", + dim_b.height_, dim_a.width_ / head_number)); + PADDLE_ENFORCE_EQ( + dim_a.width_ % head_number, 0, + platform::errors::InvalidArgument( + "The second input width should be some times the head number" + "but received second input width %d" + ", head_number %d", + dim_b.width_, head_number)); } if (dim_a.batch_size_ == 0 && dim_b.batch_size_ == 0) { @@ -888,9 +963,16 @@ void Blas::MatMulWithHead(const framework::Tensor &mat_a, mat_out->data() + sub_matC_offset, ldc); } } else { - PADDLE_ENFORCE_EQ((dim_a.batch_size_ == dim_b.batch_size_ || - dim_a.batch_size_ == 0 || dim_b.batch_size_ == 0), - true); + PADDLE_ENFORCE_EQ( + (dim_a.batch_size_ == dim_b.batch_size_ || dim_a.batch_size_ == 0 || + dim_b.batch_size_ == 0), + true, + platform::errors::InvalidArgument( + "The first input batch size should be equal than second input," + "either two input batch size is 0, but received first input batch " + "size" + " %d, second input batch size %d", + dim_a.batch_size_, dim_b.batch_size_)); this->template BatchedGEMMWithHead( transA, transB, dim_a.width_, dim_a.height_, dim_b.width_, diff --git a/paddle/fluid/operators/math/concat_and_split.h b/paddle/fluid/operators/math/concat_and_split.h index 3a5eddcbf4af699a89ae1a21571337155699a1f3..18d9a6310dd6c09905ca7fa84d98f391a84dfa2d 100644 --- a/paddle/fluid/operators/math/concat_and_split.h +++ b/paddle/fluid/operators/math/concat_and_split.h @@ -65,13 +65,14 @@ class SplitFunctor { } // namespace operators } // namespace paddle -#define FOR_ALL_TYPES(macro) \ - macro(int); \ - macro(float); \ - macro(double); \ - macro(bool); \ - 
macro(int64_t); \ - macro(int16_t); \ - macro(uint8_t); \ - macro(int8_t); \ - macro(::paddle::platform::float16) +#define FOR_ALL_TYPES(macro) \ + macro(int); \ + macro(float); \ + macro(double); \ + macro(bool); \ + macro(int64_t); \ + macro(int16_t); \ + macro(uint8_t); \ + macro(int8_t); \ + macro(::paddle::platform::float16); \ + macro(::paddle::platform::bfloat16) diff --git a/paddle/fluid/operators/math/concat_test.cc b/paddle/fluid/operators/math/concat_test.cc index 411dbca25bb48c99dfd16779f54e46a3e80d0d4e..270a9d3f80a80d5ea2c8b97d4a69125355ddef61 100644 --- a/paddle/fluid/operators/math/concat_test.cc +++ b/paddle/fluid/operators/math/concat_test.cc @@ -79,8 +79,16 @@ void ConcatCase1(DeviceContext* context) { concat_functor(*context, input, 0, &out); // check the dim of input_a, input_b - PADDLE_ENFORCE_EQ(input_a.dims(), dim_a); - PADDLE_ENFORCE_EQ(input_b.dims(), dim_b); + PADDLE_ENFORCE_EQ(input_a.dims(), dim_a, + paddle::platform::errors::InvalidArgument( + "The dims of Input tensor should be the same as the " + "declared dims. Tensor dims: [%s], declared dims: [%s]", + input_a.dims(), dim_a)); + PADDLE_ENFORCE_EQ(input_b.dims(), dim_b, + paddle::platform::errors::InvalidArgument( + "The dims of Input tensor should be the same as the " + "declared dims. 
Tensor dims: [%s], declared dims: [%s]", + input_b.dims(), dim_b)); int* out_ptr = nullptr; if (paddle::platform::is_gpu_place(Place())) { @@ -95,10 +103,14 @@ void ConcatCase1(DeviceContext* context) { int idx_a = 0, idx_b = 0; for (int j = 0; j < 5 * 3 * 4; ++j) { if (j >= cols) { - PADDLE_ENFORCE_EQ(out_ptr[j], b_ptr[idx_b]); + PADDLE_ENFORCE_EQ(out_ptr[j], b_ptr[idx_b], + paddle::platform::errors::InvalidArgument( + "Concat test failed, the result should be equal.")); ++idx_b; } else { - PADDLE_ENFORCE_EQ(out_ptr[j], a_ptr[idx_a]); + PADDLE_ENFORCE_EQ(out_ptr[j], a_ptr[idx_a], + paddle::platform::errors::InvalidArgument( + "Concat test failed, the result should be equal.")); ++idx_a; } } @@ -166,8 +178,16 @@ void ConcatCase2(DeviceContext* context) { concat_functor(*context, input, 1, &out); // check the dim of input_a, input_b - PADDLE_ENFORCE_EQ(input_a.dims(), dim_a); - PADDLE_ENFORCE_EQ(input_b.dims(), dim_b); + PADDLE_ENFORCE_EQ(input_a.dims(), dim_a, + paddle::platform::errors::InvalidArgument( + "The dims of Input tensor should be the same as the " + "declared dims. Tensor dims: [%s], declared dims: [%s]", + input_a.dims(), dim_a)); + PADDLE_ENFORCE_EQ(input_b.dims(), dim_b, + paddle::platform::errors::InvalidArgument( + "The dims of Input tensor should be the same as the " + "declared dims. 
Tensor dims: [%s], declared dims: [%s]", + input_b.dims(), dim_b)); int* out_ptr = nullptr; if (paddle::platform::is_gpu_place(Place())) { @@ -183,10 +203,16 @@ void ConcatCase2(DeviceContext* context) { for (int i = 0; i < 2; ++i) { for (int j = 0; j < 28; ++j) { if (j >= cols) { - PADDLE_ENFORCE_EQ(out_ptr[i * 28 + j], b_ptr[idx_b]); + PADDLE_ENFORCE_EQ( + out_ptr[i * 28 + j], b_ptr[idx_b], + paddle::platform::errors::InvalidArgument( + "Concat test failed, the result should be equal.")); ++idx_b; } else { - PADDLE_ENFORCE_EQ(out_ptr[i * 28 + j], a_ptr[idx_a]); + PADDLE_ENFORCE_EQ( + out_ptr[i * 28 + j], a_ptr[idx_a], + paddle::platform::errors::InvalidArgument( + "Concat test failed, the result should be equal.")); ++idx_a; } } @@ -255,8 +281,16 @@ void ConcatCase3(DeviceContext* context) { concat_functor(*context, input, 2, &out); // check the dim of input_a, input_b - PADDLE_ENFORCE_EQ(input_a.dims(), dim_a); - PADDLE_ENFORCE_EQ(input_b.dims(), dim_b); + PADDLE_ENFORCE_EQ(input_a.dims(), dim_a, + paddle::platform::errors::InvalidArgument( + "The dims of Input tensor should be the same as the " + "declared dims. Tensor dims: [%s], declared dims: [%s]", + input_a.dims(), dim_a)); + PADDLE_ENFORCE_EQ(input_b.dims(), dim_b, + paddle::platform::errors::InvalidArgument( + "The dims of Input tensor should be the same as the " + "declared dims. 
Tensor dims: [%s], declared dims: [%s]", + input_b.dims(), dim_b)); int* out_ptr = nullptr; if (paddle::platform::is_gpu_place(Place())) { @@ -273,10 +307,16 @@ void ConcatCase3(DeviceContext* context) { for (int i = 0; i < 6; ++i) { for (int j = 0; j < 9; ++j) { if (j >= cols) { - PADDLE_ENFORCE_EQ(out_ptr[i * 9 + j], b_ptr[idx_b]); + PADDLE_ENFORCE_EQ( + out_ptr[i * 9 + j], b_ptr[idx_b], + paddle::platform::errors::InvalidArgument( + "Concat test failed, the result should be equal.")); ++idx_b; } else { - PADDLE_ENFORCE_EQ(out_ptr[i * 9 + j], a_ptr[idx_a]); + PADDLE_ENFORCE_EQ( + out_ptr[i * 9 + j], a_ptr[idx_a], + paddle::platform::errors::InvalidArgument( + "Concat test failed, the result should be equal.")); ++idx_a; } } @@ -347,8 +387,16 @@ void ConcatCase4(DeviceContext* context) { context->Wait(); // check the dim of input_a, input_b - PADDLE_ENFORCE_EQ(input_a.dims(), dim_a); - PADDLE_ENFORCE_EQ(input_b.dims(), dim_b); + PADDLE_ENFORCE_EQ(input_a.dims(), dim_a, + paddle::platform::errors::InvalidArgument( + "The dims of Input tensor should be the same as the " + "declared dims. Tensor dims: [%s], declared dims: [%s]", + input_a.dims(), dim_a)); + PADDLE_ENFORCE_EQ(input_b.dims(), dim_b, + paddle::platform::errors::InvalidArgument( + "The dims of Input tensor should be the same as the " + "declared dims. 
Tensor dims: [%s], declared dims: [%s]", + input_b.dims(), dim_b)); int* out_ptr = nullptr; if (paddle::platform::is_gpu_place(Place())) { @@ -365,10 +413,16 @@ void ConcatCase4(DeviceContext* context) { for (int i = 0; i < 2; ++i) { for (int j = 0; j < 24; ++j) { if (j >= cols) { - PADDLE_ENFORCE_EQ(out_ptr[i * 24 + j], b_ptr[idx_b]); + PADDLE_ENFORCE_EQ( + out_ptr[i * 24 + j], b_ptr[idx_b], + paddle::platform::errors::InvalidArgument( + "Concat test failed, the result should be equal.")); ++idx_b; } else { - PADDLE_ENFORCE_EQ(out_ptr[i * 24 + j], a_ptr[idx_a]); + PADDLE_ENFORCE_EQ( + out_ptr[i * 24 + j], a_ptr[idx_a], + paddle::platform::errors::InvalidArgument( + "Concat test failed, the result should be equal.")); ++idx_a; } } diff --git a/paddle/fluid/operators/math/context_project.h b/paddle/fluid/operators/math/context_project.h index e9019c6d2fe6890ee92cb5a3b047666e3c2a7e04..051c6019d74f7d2820dc0ba668da3cafe8864346 100644 --- a/paddle/fluid/operators/math/context_project.h +++ b/paddle/fluid/operators/math/context_project.h @@ -134,7 +134,10 @@ class ContextProjectFunctor { } } if (padding_trainable) { - PADDLE_ENFORCE_NOT_NULL(padding_data); + PADDLE_ENFORCE_NOT_NULL( + padding_data, + platform::errors::InvalidArgument( + "The input tensor 'padding_data' should not be NULL.")); for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { if (lod_level_0[i] == lod_level_0[i + 1]) continue; diff --git a/paddle/fluid/operators/math/cpu_vec.h b/paddle/fluid/operators/math/cpu_vec.h index 8940a41424b01c975f1264ca309cc09fc3c7ae85..925f3b6161ae8506107f917196e77ecb2d9c5593 100644 --- a/paddle/fluid/operators/math/cpu_vec.h +++ b/paddle/fluid/operators/math/cpu_vec.h @@ -621,7 +621,10 @@ class VecActivations { } else if (type == "identity" || type == "") { return vec_identity; } - PADDLE_THROW("Not support type: %s", type); + PADDLE_THROW(platform::errors::InvalidArgument( + "Expected type should be one of sigmod, relu, tanh, identity. 
But got " + "not support type: %s.", + type)); } }; diff --git a/paddle/fluid/operators/math/cross_entropy.cu b/paddle/fluid/operators/math/cross_entropy.cu index c7fac60dd3e663088813f795352e4d751059de39..84fa0d6af990e22083ec1a0e3993893cefad1ab5 100644 --- a/paddle/fluid/operators/math/cross_entropy.cu +++ b/paddle/fluid/operators/math/cross_entropy.cu @@ -27,8 +27,8 @@ __global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label, const int ignore_index) { CUDA_KERNEL_LOOP(i, N) { PADDLE_ENFORCE(label[i] >= 0 && label[i] < D || label[i] == ignore_index, - "label[%d] expected >= 0 and < %ld, or == %ld, but got " - "%ld. Please check input value.", + "The value of label[%d] expected >= 0 and < %ld, or == %ld, " + "but got %ld. Please check input value.", i, D, ignore_index, label[i]); Y[i] = ignore_index == label[i] ? static_cast(0) diff --git a/paddle/fluid/operators/math/im2col.cc b/paddle/fluid/operators/math/im2col.cc index 094a7237826610af574061263e5b0df5eafdf239..6fb393d791cc2a077dbcd0a912bcf31b5d59ad65 100644 --- a/paddle/fluid/operators/math/im2col.cc +++ b/paddle/fluid/operators/math/im2col.cc @@ -34,9 +34,16 @@ class Im2ColFunctor& stride, const std::vector& padding, framework::Tensor* col, const DataLayout data_layout) { - PADDLE_ENFORCE_EQ(im.dims().size(), 3, "The dimension of im should be 3."); + PADDLE_ENFORCE_EQ(im.dims().size(), 3, + platform::errors::InvalidArgument( + "The dimension of tensor 'im' should be 3. But got " + "the dims of tensor 'im' is [%s].", + im.dims())); PADDLE_ENFORCE_EQ(col->dims().size(), 5, - "The dimension of col should be 5."); + platform::errors::InvalidArgument( + "The dimension of tensor 'col' should be 5. 
But got " + "the dims of tensor 'col' is [%s].", + col->dims())); if (stride[0] == 1 && stride[1] == 1 && dilation[0] == 1 && dilation[1] == 1) { @@ -70,9 +77,16 @@ class Col2ImFunctor& stride, const std::vector& padding, framework::Tensor* im, const DataLayout data_layout) { - PADDLE_ENFORCE_EQ(im->dims().size(), 3, "The dimension of im should be 3."); + PADDLE_ENFORCE_EQ(im->dims().size(), 3, + platform::errors::InvalidArgument( + "The dimension of tensor 'im' should be 3. But got " + "the dims of tensor 'im' is [%s].", + im->dims())); PADDLE_ENFORCE_EQ(col.dims().size(), 5, - "The dimension of col should be 5."); + platform::errors::InvalidArgument( + "The dimension of tensor 'col' should be 5. But got " + "the dims of tensor 'col' is [%s].", + col.dims())); int im_channels = (data_layout != DataLayout::kNHWC ? im->dims()[0] : im->dims()[2]); int im_height = @@ -88,16 +102,16 @@ class Col2ImFunctor& stride, const std::vector& padding, framework::Tensor* col, const DataLayout data_layout) { - PADDLE_ENFORCE_EQ(im.dims().size(), 3, "The dimension of im should be 3."); + PADDLE_ENFORCE_EQ(im.dims().size(), 3, + platform::errors::InvalidArgument( + "The dimension of tensor 'im' should be 3. But got " + "the dims of tensor 'im' is [%s].", + im.dims())); PADDLE_ENFORCE_EQ(col->dims().size(), 5, - "The dimension of col should be 5."); + platform::errors::InvalidArgument( + "The dimension of tensor 'col' should be 5. But got " + "the dims of tensor 'col' is [%s].", + col->dims())); int im_channels = im.dims()[0]; int im_height = im.dims()[1]; int im_width = im.dims()[2]; @@ -218,9 +239,16 @@ class Col2ImFunctor& stride, const std::vector& padding, framework::Tensor* im, const DataLayout data_layout) { - PADDLE_ENFORCE_EQ(im->dims().size(), 3, "The dimension of im should be 3."); + PADDLE_ENFORCE_EQ(im->dims().size(), 3, + platform::errors::InvalidArgument( + "The dimension of tensor 'im' should be 3. 
But got " + "the dims of tensor 'im' is [%s].", + im->dims())); PADDLE_ENFORCE_EQ(col.dims().size(), 5, - "The dimension of col should be 5."); + platform::errors::InvalidArgument( + "The dimension of tensor 'col' should be 5. But got " + "the dims of tensor 'col' is [%s].", + col.dims())); int im_channels = im->dims()[0]; int im_height = im->dims()[1]; int im_width = im->dims()[2]; @@ -231,14 +259,14 @@ class Col2ImFunctordata(); const T* col_data = col.data(); diff --git a/paddle/fluid/operators/math/im2col.cu b/paddle/fluid/operators/math/im2col.cu index 97719300daed9c02a716f31d853e3a381312961c..f2a2148ba6954f50cf59ae30f4f4be6aa070739f 100644 --- a/paddle/fluid/operators/math/im2col.cu +++ b/paddle/fluid/operators/math/im2col.cu @@ -81,9 +81,16 @@ class Im2ColFunctor& stride, const std::vector& padding, framework::Tensor* col, const DataLayout data_layout) { - PADDLE_ENFORCE_EQ(im.dims().size(), 3, "The dimension of im should be 3."); + PADDLE_ENFORCE_EQ(im.dims().size(), 3, + platform::errors::InvalidArgument( + "The dimension of tensor 'im' should be 3. But got " + "the dims of tensor 'im' is [%s].", + im.dims())); PADDLE_ENFORCE_EQ(col->dims().size(), 5, - "The dimension of col should be 5."); + platform::errors::InvalidArgument( + "The dimension of tensor 'col' should be 5. But got " + "the dims of tensor 'col' is [%s].", + col->dims())); int im_channels = (data_layout != DataLayout::kNHWC ? im.dims()[0] : im.dims()[2]); @@ -182,9 +189,16 @@ class Col2ImFunctor& stride, const std::vector& padding, framework::Tensor* im, const DataLayout data_layout) { - PADDLE_ENFORCE_EQ(im->dims().size(), 3, "The dimension of im should be 3."); + PADDLE_ENFORCE_EQ(im->dims().size(), 3, + platform::errors::InvalidArgument( + "The dimension of tensor 'im' should be 3. 
But got " + "the dims of tensor 'im' is [%s].", + im->dims())); PADDLE_ENFORCE_EQ(col.dims().size(), 5, - "The dimension of col should be 5."); + platform::errors::InvalidArgument( + "The dimension of tensor 'col' should be 5. But got " + "the dims of tensor 'col' is [%s].", + col.dims())); int im_channels = (data_layout != DataLayout::kNHWC ? im->dims()[0] : im->dims()[2]); @@ -201,16 +215,16 @@ class Col2ImFunctor& stride, const std::vector& padding, framework::Tensor* col, const DataLayout data_layout) { - PADDLE_ENFORCE_EQ(im.dims().size(), 3, "The dimension of im should be 3."); + PADDLE_ENFORCE_EQ(im.dims().size(), 3, + platform::errors::InvalidArgument( + "The dimension of tensor 'im' should be 3. But got " + "the dims of tensor 'im' is [%s].", + im.dims())); PADDLE_ENFORCE_EQ(col->dims().size(), 5, - "The dimension of col should be 5."); + platform::errors::InvalidArgument( + "The dimension of tensor 'col' should be 5. But got " + "the dims of tensor 'col' is [%s].", + col->dims())); int im_channels = im.dims()[0]; int im_height = im.dims()[1]; @@ -370,9 +391,16 @@ class Col2ImFunctor& stride, const std::vector& padding, framework::Tensor* im, const DataLayout data_layout) { - PADDLE_ENFORCE_EQ(im->dims().size(), 3, "The dimension of im should be 3."); + PADDLE_ENFORCE_EQ(im->dims().size(), 3, + platform::errors::InvalidArgument( + "The dimension of tensor 'im' should be 3. But got " + "the dims of tensor 'im' is [%s].", + im->dims())); PADDLE_ENFORCE_EQ(col.dims().size(), 5, - "The dimension of col should be 5."); + platform::errors::InvalidArgument( + "The dimension of tensor 'col' should be 5. 
But got " + "the dims of tensor 'col' is [%s].", + col.dims())); int im_channels = im->dims()[0]; int im_height = im->dims()[1]; @@ -386,16 +414,16 @@ class Col2ImFunctor #endif +#include #include #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/operators/math/math_function_impl.h" #include "paddle/fluid/platform/float16.h" +#include "unsupported/Eigen/CXX11/Tensor" namespace paddle { namespace operators { @@ -34,6 +36,7 @@ namespace math { using float16 = paddle::platform::float16; template struct SetConstant; +template struct SetConstant; template struct SetConstant; template struct SetConstant; template struct SetConstant; @@ -41,16 +44,18 @@ template struct SetConstant; template struct SetConstant; template struct SetConstant; -#define DEFINE_CPU_TRANS(RANK) \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ +#define DEFINE_CPU_TRANS(RANK) \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ template struct Transpose; DEFINE_CPU_TRANS(1); @@ -60,6 +65,55 @@ DEFINE_CPU_TRANS(4); DEFINE_CPU_TRANS(5); DEFINE_CPU_TRANS(6); +template +struct TransposeNormal { + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& in, framework::Tensor* out, + const std::vector& axis) { + const int rank = axis.size(); + auto in_stride = framework::stride(in.dims()); + auto out_stride = framework::stride(out->dims()); + const T* in_ptr = in.data(); + T* out_ptr = out->data(); + + auto transpose_helper = [&](int64_t beg, int64_t end) { + for (int64_t out_idx = beg; out_idx < end; ++out_idx) { + int64_t in_idx = 0; 
+ int64_t tmp_idx = out_idx; + // calculate the input index + for (int i = 0; i < rank; ++i) { + const int64_t coordinate = tmp_idx / out_stride[i]; + tmp_idx -= coordinate * out_stride[i]; + in_idx += coordinate * in_stride[axis[i]]; + } + out_ptr[out_idx] = in_ptr[in_idx]; + } + }; + double cost_per_iteration = + rank * (Eigen::TensorOpCost::DivCost() + + 2 * Eigen::TensorOpCost::MulCost() + + 2 * Eigen::TensorOpCost::AddCost()); + Eigen::TensorOpCost cost(sizeof(T), sizeof(T), cost_per_iteration); + auto* cpu_device = context.eigen_pool_device(); + cpu_device->parallelFor(out->numel(), cost, std::move(transpose_helper)); + } +}; + +// define transpose normal +#define DEFINE_CPU_TRANS_NORMAL(TYPE) \ + template struct TransposeNormal + +DEFINE_CPU_TRANS_NORMAL(platform::float16); +DEFINE_CPU_TRANS_NORMAL(platform::bfloat16); +DEFINE_CPU_TRANS_NORMAL(float); +DEFINE_CPU_TRANS_NORMAL(double); +DEFINE_CPU_TRANS_NORMAL(int); +DEFINE_CPU_TRANS_NORMAL(int64_t); +DEFINE_CPU_TRANS_NORMAL(bool); +DEFINE_CPU_TRANS_NORMAL(int16_t); +DEFINE_CPU_TRANS_NORMAL(uint8_t); +DEFINE_CPU_TRANS_NORMAL(int8_t); + struct TensorSetConstantCPU { TensorSetConstantCPU(framework::Tensor* tensor, float value) : tensor_(tensor), value_(value) {} @@ -125,9 +179,23 @@ struct RowwiseAdd { const framework::Tensor& input, const framework::Tensor& vector, framework::Tensor* output) { auto in_dims = input.dims(); + auto out_dims = output->dims(); auto size = input.numel() / in_dims[0]; - PADDLE_ENFORCE_EQ(vector.numel(), size); - PADDLE_ENFORCE_EQ(output->dims(), in_dims); + PADDLE_ENFORCE_EQ( + vector.numel(), size, + platform::errors::InvalidArgument( + "The input vector size" + " should be equal to the size of each row of input tensor." 
+ " Expected vector size=%d, but received %d", + size, vector.numel())); + const char* in_dims_cstr = in_dims.to_str().c_str(); + const char* out_dims_cstr = out_dims.to_str().c_str(); + PADDLE_ENFORCE_EQ(out_dims, in_dims, + platform::errors::InvalidArgument( + "The output tensor shape should be same as the input" + " tensor shape. Expected output tensor shape: %s," + " but received %s", + in_dims_cstr, out_dims_cstr)); auto in = framework::EigenMatrix::From(input); auto vec = framework::EigenVector::Flatten(vector); diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu index fba143d017deb4b4814ad8b10e614357a7ebee23..4d7c1a49286dd67a314f81fd706bdcc5e340e450 100644 --- a/paddle/fluid/operators/math/math_function.cu +++ b/paddle/fluid/operators/math/math_function.cu @@ -11,8 +11,11 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include #include #include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/memory/malloc.h" +#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function_impl.h" @@ -23,6 +26,7 @@ namespace operators { namespace math { using float16 = paddle::platform::float16; +using bfloat16 = paddle::platform::bfloat16; template struct SetConstant; template struct SetConstant; @@ -31,12 +35,13 @@ template struct SetConstant; template struct SetConstant; template struct SetConstant; -#define DEFINE_GPU_TRANS(RANK) \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ +#define DEFINE_GPU_TRANS(RANK) \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ template struct Transpose; DEFINE_GPU_TRANS(1); @@ -46,6 +51,88 @@ DEFINE_GPU_TRANS(4); DEFINE_GPU_TRANS(5); DEFINE_GPU_TRANS(6); +#define REINTERPRET(T, DST_PTR, SRC_PTR) \ + T* DST_PTR = reinterpret_cast(SRC_PTR) + +template +__global__ void TransposeNormalKernel(const T* in_ptr, T* out_ptr, + int64_t element, + const int64_t* in_stride_ptr, + const int64_t* out_stride_ptr, + const int64_t* axis_ptr, int rank) { + CUDA_KERNEL_LOOP(out_idx, element) { + int64_t in_idx = 0; + int64_t tmp_idx = out_idx; + for (int i = 0; i < rank; ++i) { + const int64_t coordinate = tmp_idx / out_stride_ptr[i]; + tmp_idx -= coordinate * out_stride_ptr[i]; + in_idx += coordinate * in_stride_ptr[axis_ptr[i]]; + } + out_ptr[out_idx] = in_ptr[in_idx]; + } +} + +template +struct TransposeNormal { + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& in, framework::Tensor* out, + const std::vector& axis) { + const int rank = axis.size(); + 
auto in_stride = framework::stride(in.dims()); + auto out_stride = framework::stride(out->dims()); + auto* in_ptr = in.data(); + auto* out_ptr = out->data(); + + // copy in_stride, out_stride, axis to gpu device + const platform::CUDAPlace& cuda_place = + BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()); + platform::CPUPlace cpu_place = platform::CPUPlace(); + size_t size = 3 * rank * sizeof(int64_t); + auto cpu_buf_holder = memory::AllocShared(cpu_place, size); + auto cuda_buf_holder = memory::AllocShared(cuda_place, size); + REINTERPRET(int64_t, cpu_buf, cpu_buf_holder->ptr()); + REINTERPRET(int64_t, cuda_buf, cuda_buf_holder->ptr()); + for (int i = 0; i < rank; ++i) { + cpu_buf[i] = in_stride[i]; + cpu_buf[rank + i] = out_stride[i]; + cpu_buf[2 * rank + i] = axis[i]; + } + memory::Copy(cuda_place, cuda_buf, cpu_place, cpu_buf, size, + context.stream()); + REINTERPRET(const int64_t, in_stride_ptr, cuda_buf); + REINTERPRET(const int64_t, out_stride_ptr, cuda_buf + rank); + REINTERPRET(const int64_t, axis_ptr, cuda_buf + 2 * rank); + + const int MAX_BLOCK_DIM = context.GetMaxThreadsPerBlock(); + const int MAX_GRID_DIM = + context.GetMaxPhysicalThreadCount() / MAX_BLOCK_DIM; + int64_t elements = in.numel(); + int block_size = (elements >= MAX_BLOCK_DIM) + ? MAX_BLOCK_DIM + : (1 << static_cast(std::log2(elements))); + int grid_size = elements / block_size; + grid_size = (grid_size >= MAX_GRID_DIM) ? 
MAX_GRID_DIM : grid_size; + TransposeNormalKernel<<>>( + in_ptr, out_ptr, elements, in_stride_ptr, out_stride_ptr, axis_ptr, + rank); + } +}; + +// define transpose normal +#define DEFINE_GPU_TRANS_NORMAL(TYPE) \ + template struct TransposeNormal + +DEFINE_GPU_TRANS_NORMAL(float16); +DEFINE_GPU_TRANS_NORMAL(bfloat16); +DEFINE_GPU_TRANS_NORMAL(float); +DEFINE_GPU_TRANS_NORMAL(double); +DEFINE_GPU_TRANS_NORMAL(int); +DEFINE_GPU_TRANS_NORMAL(int64_t); +DEFINE_GPU_TRANS_NORMAL(bool); +DEFINE_GPU_TRANS_NORMAL(int16_t); +DEFINE_GPU_TRANS_NORMAL(uint8_t); +DEFINE_GPU_TRANS_NORMAL(int8_t); + struct TensorSetConstantGPU { TensorSetConstantGPU(const platform::DeviceContext& context, framework::Tensor* tensor, float value) @@ -88,9 +175,24 @@ struct RowwiseAdd { const framework::Tensor& input, const framework::Tensor& vector, framework::Tensor* output) { auto in_dims = input.dims(); + auto out_dims = output->dims(); auto size = input.numel() / in_dims[0]; - PADDLE_ENFORCE_EQ(vector.numel(), size); - PADDLE_ENFORCE_EQ(output->dims(), in_dims); + PADDLE_ENFORCE_EQ( + vector.numel(), size, + platform::errors::InvalidArgument( + "The input vector size" + " should be equal to the size of each row of input tensor." + " Expected vector size=%d, but received %d", + size, vector.numel())); + const char* in_dims_cstr = in_dims.to_str().c_str(); + const char* out_dims_cstr = out_dims.to_str().c_str(); + PADDLE_ENFORCE_EQ( + out_dims, in_dims, + platform::errors::InvalidArgument( + "The output tensor shape should be same as the input tensor" + " shape. 
Expected output tensor shape: %s," + " but received %s", + in_dims_cstr, out_dims_cstr)); int blocks = 512; int grids = (input.numel() + blocks - 1) / blocks; RowwiseAddKernel<<>>( @@ -113,7 +215,12 @@ void ColwiseSum::operator()( framework::Tensor* vector) { auto in_dims = input.dims(); auto size = input.numel() / in_dims[0]; - PADDLE_ENFORCE_EQ(vector->numel(), size); + PADDLE_ENFORCE_EQ(vector->numel(), size, + platform::errors::InvalidArgument( + "The size of input vector" + " should be equal to the size of input tensor column" + " dimension. Expected vector size=%d, but received %d", + size, vector->numel())); framework::Tensor one; one.mutable_data({in_dims[0]}, context.GetPlace()); SetConstant set; @@ -134,7 +241,12 @@ void RowwiseSum::operator()( framework::Tensor* vector) { auto in_dims = input.dims(); auto size = input.numel() / in_dims[0]; - PADDLE_ENFORCE_EQ(vector->numel(), in_dims[0]); + PADDLE_ENFORCE_EQ(vector->numel(), in_dims[0], + platform::errors::InvalidArgument( + "The size of input vector" + " should be equal to the size of input tensor row" + " dimension. Expected vector size=%d, but received %d", + in_dims[0], vector->numel())); framework::Tensor one; one.mutable_data({size}, context.GetPlace()); SetConstant set; diff --git a/paddle/fluid/operators/math/math_function.h b/paddle/fluid/operators/math/math_function.h index 333552a0c1a79ed1682ea1f6da36f30e2ad5b4bf..6af0278d82503a27a14de6aef96468d69d6c17ad 100644 --- a/paddle/fluid/operators/math/math_function.h +++ b/paddle/fluid/operators/math/math_function.h @@ -26,6 +26,14 @@ limitations under the License. 
*/ namespace paddle { namespace operators { namespace math { + +template +struct TransposeNormal { + // for dims >= 7 situation + void operator()(const DeviceContext& context, const framework::Tensor& in, + framework::Tensor* out, const std::vector& axis); +}; + template struct Transpose { void operator()(const DeviceContext& context, const framework::Tensor& in, diff --git a/paddle/fluid/operators/math/math_function_impl.h b/paddle/fluid/operators/math/math_function_impl.h index 693d5620460e1fe6f6d82bd0749b0780b64841f5..869a3054598da9cd2223ca0e705c0f910ba043ec 100644 --- a/paddle/fluid/operators/math/math_function_impl.h +++ b/paddle/fluid/operators/math/math_function_impl.h @@ -59,7 +59,12 @@ void ColwiseSum::operator()(const DeviceContext& context, framework::Tensor* out) { auto in_dims = input.dims(); auto size = input.numel() / in_dims[0]; - PADDLE_ENFORCE_EQ(out->numel(), size); + PADDLE_ENFORCE_EQ(out->numel(), size, + platform::errors::InvalidArgument( + "The size of output tensor " + "should be equal to the size of input tensor column" + " dimension. Expected output size=%d, but received %d", + size, out->numel())); auto in = framework::EigenMatrix::From(input); auto vec = framework::EigenVector::Flatten(*out); @@ -78,7 +83,13 @@ class ColwiseSum { auto& in_dims = input.dims(); auto height = in_dims[0]; auto size = in_dims[1]; - PADDLE_ENFORCE_EQ(out->numel(), size); + PADDLE_ENFORCE_EQ( + out->numel(), size, + platform::errors::InvalidArgument( + "The size of output tensor " + "should be equal to the size of input tensor column" + " dimension. 
Expected output size=%d, but received %d", + size, out->numel())); T* out_buf = out->mutable_data(out->place()); const T* in_buf = input.data(); @@ -100,8 +111,16 @@ void RowwiseMean::operator()(const DeviceContext& context, const framework::Tensor& input, framework::Tensor* out) { auto in_dims = input.dims(); - PADDLE_ENFORCE_EQ(in_dims.size(), 2U); - PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]); + PADDLE_ENFORCE_EQ(in_dims.size(), 2U, platform::errors::InvalidArgument( + "The rank of input tensor " + "should be 2, but received %d", + in_dims.size())); + PADDLE_ENFORCE_EQ(out->numel(), in_dims[0], + platform::errors::InvalidArgument( + "The size of output tensor " + "should be equal to the size of input tensor row" + " dimension. Expected output size=%d, but received %d", + in_dims[0], out->numel())); auto in = framework::EigenMatrix::From(input); auto vec = framework::EigenVector::Flatten(*out); @@ -118,10 +137,19 @@ class RowwiseMean { void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, framework::Tensor* out) { auto& in_dims = input.dims(); - PADDLE_ENFORCE_EQ(in_dims.size(), 2U); + PADDLE_ENFORCE_EQ(in_dims.size(), 2U, platform::errors::InvalidArgument( + "The rank of input tensor " + "should be 2, but received %d", + in_dims.size())); auto height = in_dims[0]; auto size = in_dims[1]; - PADDLE_ENFORCE_EQ(out->numel(), height); + PADDLE_ENFORCE_EQ( + out->numel(), height, + platform::errors::InvalidArgument( + "The size of output tensor " + "should be equal to the size of input tensor row" + " dimension. 
Expected output size=%d, but received %d", + height, out->numel())); auto inv_size = 1.0 / size; T* out_buf = out->mutable_data(out->place()); const T* in_buf = input.data(); @@ -141,8 +169,16 @@ void RowwiseSum::operator()(const DeviceContext& context, const framework::Tensor& input, framework::Tensor* out) { auto in_dims = input.dims(); - PADDLE_ENFORCE_EQ(in_dims.size(), 2U); - PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]); + PADDLE_ENFORCE_EQ(in_dims.size(), 2U, platform::errors::InvalidArgument( + "The rank of input tensor " + "should be 2, but received %d", + in_dims.size())); + PADDLE_ENFORCE_EQ(out->numel(), in_dims[0], + platform::errors::InvalidArgument( + "The size of output tensor " + "should be equal to the size of input tensor row" + " dimension. Expected output size=%d, but received %d", + in_dims[0], out->numel())); auto in = framework::EigenMatrix::From(input); auto vec = framework::EigenVector::Flatten(*out); @@ -159,10 +195,19 @@ class RowwiseSum { void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, framework::Tensor* out) { auto& in_dims = input.dims(); - PADDLE_ENFORCE_EQ(in_dims.size(), 2U); + PADDLE_ENFORCE_EQ(in_dims.size(), 2U, platform::errors::InvalidArgument( + "The rank of input tensor " + "should be 2, but received %d", + in_dims.size())); auto height = in_dims[0]; auto size = in_dims[1]; - PADDLE_ENFORCE_EQ(out->numel(), height); + PADDLE_ENFORCE_EQ( + out->numel(), height, + platform::errors::InvalidArgument( + "The size of output tensor " + "should be equal to the size of input tensor row" + " dimension. 
Expected output size=%d, but received %d", + height, out->numel())); T* out_buf = out->mutable_data(out->place()); const T* in_buf = input.data(); diff --git a/paddle/fluid/operators/math/math_function_test.cc b/paddle/fluid/operators/math/math_function_test.cc index 2343e0ee965303c9fdb2ad3faf9ddf6e5bb7782f..3388d7edafecc4c0dd3a041316dc6f171d035319 100644 --- a/paddle/fluid/operators/math/math_function_test.cc +++ b/paddle/fluid/operators/math/math_function_test.cc @@ -224,7 +224,11 @@ TEST(math_funciton, set_constant) { auto* ctx = new paddle::platform::CPUDeviceContext(); paddle::operators::math::set_constant(*ctx, &t, 10); for (int64_t i = 0; i < t.numel(); ++i) { - PADDLE_ENFORCE_EQ(10, t.data()[i]); + PADDLE_ENFORCE_EQ(10, t.data()[i], + paddle::platform::errors::InvalidArgument( + "Each value of input tensor should be 10, " + "but received %d.", + t.data()[i])); } delete ctx; } diff --git a/paddle/fluid/operators/math/math_function_test.cu b/paddle/fluid/operators/math/math_function_test.cu index bcbb4a8274f149240b9f0990f38d9f38bdd0e5b1..44b1ee45a4fe9b6f2ea7ba5e09c7cbc60c1aff28 100644 --- a/paddle/fluid/operators/math/math_function_test.cu +++ b/paddle/fluid/operators/math/math_function_test.cu @@ -18,7 +18,12 @@ void fill_fp16_data(paddle::platform::float16* in_ptr, size_t size, const std::vector& data) { - PADDLE_ENFORCE_EQ(size, data.size()); + PADDLE_ENFORCE_EQ( + size, data.size(), + paddle::platform::errors::InvalidArgument( + "The size of argument data should" + " be equal to the argument size. 
Expected %d, but received %d.", + size, data.size())); for (size_t i = 0; i < data.size(); ++i) { in_ptr[i] = paddle::platform::float16(data[i]); } diff --git a/paddle/fluid/operators/math/padding.h b/paddle/fluid/operators/math/padding.h index 63f793433de07ea2e43ad03ea3ccae1a259f7ae2..379b21c3c18888989663221052e6e99df80e7e9d 100644 --- a/paddle/fluid/operators/math/padding.h +++ b/paddle/fluid/operators/math/padding.h @@ -85,8 +85,9 @@ void PaddingFunctor(int rank, const framework::ExecutionContext& context, PadFunction(context, pads, src, pad_value, out); break; default: - PADDLE_THROW( - "PadOp only support tensors with no more than 6 dimensions."); + PADDLE_THROW(platform::errors::Unimplemented( + "PadOp only support tensors with no more" + " than 6 dimensions currently.")); } } @@ -114,8 +115,9 @@ void PaddingGradFunctor(int rank, const framework::ExecutionContext& context, PadGradFunction(context, pads, src, out); break; default: - PADDLE_THROW( - "PadOp only support tensors with no more than 6 dimensions."); + PADDLE_THROW(platform::errors::Unimplemented( + "PadOp only support tensors with no more" + " than 6 dimensions currently.")); } } diff --git a/paddle/fluid/operators/math/sampler.h b/paddle/fluid/operators/math/sampler.h index 480576ef9dc8c21811a1a867d553ccc6d97fa22a..b90e7e1980335f1facd38738671e7986187d1ceb 100644 --- a/paddle/fluid/operators/math/sampler.h +++ b/paddle/fluid/operators/math/sampler.h @@ -19,6 +19,8 @@ limitations under the License. 
*/ #include #include +#include "paddle/fluid/platform/enforce.h" + namespace paddle { namespace operators { namespace math { @@ -31,7 +33,10 @@ namespace math { class Sampler { public: explicit Sampler(int64_t range, unsigned int seed = 0UL) : range_(range) { - // PADDLE_ENFORCE_GT(range, 0, "Range should be greater than 0."); + PADDLE_ENFORCE_GT( + range, 0, + platform::errors::InvalidArgument( + "Range should be greater than 0, but recevied %d.", range)); if (seed == 0) { std::random_device r; seed_ = r(); diff --git a/paddle/fluid/operators/math/segment_pooling.cc b/paddle/fluid/operators/math/segment_pooling.cc new file mode 100644 index 0000000000000000000000000000000000000000..3c77d3d4cf88324caded3d7863b25b90b1232db6 --- /dev/null +++ b/paddle/fluid/operators/math/segment_pooling.cc @@ -0,0 +1,148 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/segment_pooling.h" +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class SegmentPoolFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& segments, framework::Tensor* output, + framework::Tensor* index, + const std::string pooltype = "SUM") { + const IndexT* segment_ids = segments.data(); + auto curent_id = segment_ids[0]; + int64_t last_idx = 0; + int64_t w = input.numel() / input.dims()[0]; + auto& place = *context.eigen_device(); + for (int64_t idx = 1; idx <= segments.numel(); ++idx) { + if (idx < segments.numel()) { + if (segment_ids[idx] == curent_id) continue; + PADDLE_ENFORCE_GE(segment_ids[idx], curent_id, + platform::errors::InvalidArgument( + "The segment ids should be sorted, but got " + "segment_ids[%d]:%d > segment_ids[%d]:%d.", + idx - 1, curent_id, idx, segment_ids[idx])); + } + + Tensor out_t = output->Slice(curent_id, curent_id + 1); + Tensor in_t = input.Slice(last_idx, idx); + + int64_t h = idx - last_idx; + auto in_e = + framework::EigenMatrix::From(in_t, framework::make_ddim({h, w})); + auto out_e = framework::EigenVector::Flatten(out_t); + + auto reduce_dim = Eigen::array({{0}}); + if (pooltype == "MEAN") { + out_e.device(place) = in_e.mean(reduce_dim); + } else if (pooltype == "SUM") { + out_e.device(place) = in_e.sum(reduce_dim); + } else if (pooltype == "MAX") { + out_e.device(place) = in_e.maximum(reduce_dim); + } else if (pooltype == "MIN") { + out_e.device(place) = in_e.minimum(reduce_dim); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Unsupported segment pooling type, only MEAN, SUM, MAX, MIN " + "available, but got %s.", + pooltype)); + } + + last_idx = idx; + if (idx < 
segments.numel()) curent_id = segment_ids[idx]; + } + } +}; + +template +class SegmentPoolGradFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& output, + const framework::Tensor& out_grad, + const framework::Tensor& segments, framework::Tensor* in_grad, + const framework::Tensor* index = nullptr, + const std::string pooltype = "SUM") { + const IndexT* segment_ids = segments.data(); + auto& place = *context.eigen_device(); + auto curent_id = segment_ids[0]; + int64_t last_idx = 0; + int64_t w = in_grad->numel() / in_grad->dims()[0]; + for (int64_t idx = 1; idx <= segments.numel(); ++idx) { + if (idx < segments.numel()) { + if (segment_ids[idx] == curent_id) continue; + PADDLE_ENFORCE_GE(segment_ids[idx], curent_id, + platform::errors::InvalidArgument( + "The segment ids should be sorted, but got " + "segment_ids[%d]:%d > segment_ids[%d]:%d.", + idx - 1, curent_id, idx, segment_ids[idx])); + } + + Tensor out_g_t = out_grad.Slice(curent_id, curent_id + 1); + Tensor in_g_t = in_grad->Slice(last_idx, idx); + + int64_t h = idx - last_idx; + auto in_g_e = framework::EigenMatrix::From(in_g_t, {h, w}); + auto out_g_e = framework::EigenMatrix::From(out_g_t, {1, w}); + Eigen::DSizes bcast(h, 1); + + if (pooltype == "MEAN") { + in_g_e.device(place) = (out_g_e / static_cast(h)).broadcast(bcast); + } else if (pooltype == "SUM") { + in_g_e.device(place) = out_g_e.broadcast(bcast); + } else if (pooltype == "MAX" || pooltype == "MIN") { + Tensor out_t = output.Slice(curent_id, curent_id + 1); + Tensor in_t = input.Slice(last_idx, idx); + auto in_e = framework::EigenMatrix::From(in_t, {h, w}); + auto out_e = framework::EigenMatrix::From(out_t, {1, w}); + in_g_e.device(place) = + (in_e == out_e.broadcast(bcast)).template cast() * + out_g_e.broadcast(bcast); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Unsupported segment pooling type, only MEAN, SUM, MAX, MIN " + "available, 
but got %s.", + pooltype)); + } + + last_idx = idx; + if (idx < segments.numel()) curent_id = segment_ids[idx]; + } + } +}; + +using CPU = platform::CPUDeviceContext; +template class SegmentPoolFunctor; +template class SegmentPoolFunctor; +template class SegmentPoolFunctor; +template class SegmentPoolFunctor; +template class SegmentPoolGradFunctor; +template class SegmentPoolGradFunctor; +template class SegmentPoolGradFunctor; +template class SegmentPoolGradFunctor; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/segment_pooling.h b/paddle/fluid/operators/math/segment_pooling.h new file mode 100644 index 0000000000000000000000000000000000000000..561fad6921fe7b9e61f6ea4bc33d820a6af25262 --- /dev/null +++ b/paddle/fluid/operators/math/segment_pooling.h @@ -0,0 +1,46 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +template +class SegmentPoolFunctor { + public: + /* mean pool has summed_ids output */ + void operator()(const DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& segments, framework::Tensor* output, + framework::Tensor* summed_ids = nullptr, + const std::string pooltype = "SUM"); +}; + +template +class SegmentPoolGradFunctor { + public: + /* mean pool has summed_ids output */ + void operator()(const DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& output, + const framework::Tensor& out_grad, + const framework::Tensor& segments, framework::Tensor* in_grad, + const framework::Tensor* summed_ids = nullptr, + const std::string pooltype = "SUM"); +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc index 3bb9efc5315fcacf0b50682b65c89ac3ad0d2d4e..c2595beb0cb4dc37104a91ac8a2647c7d787c5c5 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cc +++ b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -29,7 +29,12 @@ struct SelectedRowsAdd { const framework::SelectedRows& input2, framework::SelectedRows* output) { auto in1_height = input1.height(); - PADDLE_ENFORCE_EQ(in1_height, input2.height()); + PADDLE_ENFORCE_EQ( + in1_height, input2.height(), + platform::errors::InvalidArgument("The two inputs height must be equal." 
+ "But recieved first input height = " + "[%d], second input height = [%d]", + in1_height, input2.height())); output->set_height(in1_height); auto& in1_rows = input1.rows(); @@ -47,15 +52,31 @@ struct SelectedRowsAdd { auto& in2_value = input2.value(); auto in1_row_numel = in1_value.numel() / in1_rows.size(); - PADDLE_ENFORCE_EQ(in1_row_numel, in2_value.numel() / in2_rows.size()); - PADDLE_ENFORCE_EQ(in1_row_numel, out_value->numel() / out_rows.size()); + PADDLE_ENFORCE_EQ( + in1_row_numel, in2_value.numel() / in2_rows.size(), + platform::errors::InvalidArgument( + "The two inputs width must be equal." + "But recieved first input width = [%d], second input width = [%d]", + in1_row_numel, in2_value.numel() / in2_rows.size())); + PADDLE_ENFORCE_EQ( + in1_row_numel, out_value->numel() / out_rows.size(), + platform::errors::InvalidArgument( + "The input and oupput width must be equal." + "But recieved input width = [%d], output width = [%d]", + in1_row_numel, out_value->numel() / out_rows.size())); auto in1_place = input1.place(); - PADDLE_ENFORCE(platform::is_cpu_place(in1_place)); + PADDLE_ENFORCE_EQ(platform::is_cpu_place(in1_place), true, + platform::errors::InvalidArgument( + "The running enviroment is not on the CPU place.")); auto in2_place = input2.place(); - PADDLE_ENFORCE(platform::is_cpu_place(in2_place)); + PADDLE_ENFORCE_EQ(platform::is_cpu_place(in2_place), true, + platform::errors::InvalidArgument( + "The running enviroment is not on the CPU place.")); auto out_place = context.GetPlace(); - PADDLE_ENFORCE(platform::is_cpu_place(out_place)); + PADDLE_ENFORCE_EQ(platform::is_cpu_place(out_place), true, + platform::errors::InvalidArgument( + "The running enviroment is not on the CPU place.")); auto* out_data = out_value->data(); auto* in1_data = in1_value.data(); @@ -82,15 +103,35 @@ struct SelectedRowsAddTensor { auto in1_height = input1.height(); auto in2_dims = input2.dims(); auto out_dims = output->dims(); - PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); - 
PADDLE_ENFORCE_EQ(in1_height, out_dims[0]); + PADDLE_ENFORCE_EQ( + in1_height, in2_dims[0], + platform::errors::InvalidArgument("The two inputs height must be equal." + "But recieved first input height = " + "[%d], second input height = [%d]", + in1_height, in2_dims[0])); + PADDLE_ENFORCE_EQ( + in1_height, out_dims[0], + platform::errors::InvalidArgument( + "The input and output height must be equal." + "But recieved input height = [%d], output height = [%d]", + in1_height, out_dims[0])); auto& in1_value = input1.value(); auto& in1_rows = input1.rows(); int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); - PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height); - PADDLE_ENFORCE_EQ(in1_row_numel, output->numel() / in1_height); + PADDLE_ENFORCE_EQ( + in1_row_numel, input2.numel() / in1_height, + platform::errors::InvalidArgument( + "The two inputs width must be equal." + "But recieved first input width = [%d], second input width = [%d]", + in1_row_numel, input2.numel() / in1_height)); + PADDLE_ENFORCE_EQ( + in1_row_numel, output->numel() / in1_height, + platform::errors::InvalidArgument( + "The input and output width must be equal." + "But recieved input width = [%d], output width = [%d]", + in1_row_numel, output->numel() / in1_height)); SetConstant functor; functor(context, output, 0.0); @@ -121,7 +162,12 @@ struct SelectedRowsAddTo { const int64_t input2_offset, framework::SelectedRows* input2) { auto in1_height = input1.height(); - PADDLE_ENFORCE_EQ(in1_height, input2->height()); + PADDLE_ENFORCE_EQ( + in1_height, input2->height(), + platform::errors::InvalidArgument("The two inputs height must be equal." 
+ "But recieved first input height = " + "[%d], second input height = [%d]", + in1_height, input2->height())); auto& in1_rows = input1.rows(); auto& in2_rows = *(input2->mutable_rows()); @@ -133,9 +179,13 @@ struct SelectedRowsAddTo { in2_rows.Extend(in1_rows.begin(), in1_rows.end()); auto in1_place = input1.place(); - PADDLE_ENFORCE(platform::is_cpu_place(in1_place)); + PADDLE_ENFORCE_EQ(platform::is_cpu_place(in1_place), true, + platform::errors::InvalidArgument( + "The running enviroment is not on the CPU place.")); auto in2_place = input2->place(); - PADDLE_ENFORCE(platform::is_cpu_place(in2_place)); + PADDLE_ENFORCE_EQ(platform::is_cpu_place(in2_place), true, + platform::errors::InvalidArgument( + "The running enviroment is not on the CPU place.")); auto* in1_data = in1_value.data(); auto* in2_data = in2_value->data(); @@ -163,7 +213,12 @@ struct SelectedRowsSumTo { auto& in_rows = (*iter)->rows(); size += in_rows.end() - in_rows.begin(); auto in1_height = (*iter)->height(); - PADDLE_ENFORCE_EQ(in1_height, input2->height()); + PADDLE_ENFORCE_EQ(in1_height, input2->height(), + platform::errors::InvalidArgument( + "The two inputs height must be equal." + "But recieved first input height = [%d], second " + "input height = [%d]", + in1_height, input2->height())); } // concat rows std::vector in2_rows; @@ -201,13 +256,23 @@ struct SelectedRowsAddToTensor { } auto in1_height = input1.height(); auto in2_dims = input2->dims(); - PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); + PADDLE_ENFORCE_EQ( + in1_height, in2_dims[0], + platform::errors::InvalidArgument("The two inputs height must be equal." 
+ "But recieved first input height = " + "[%d], second input height = [%d]", + in1_height, in2_dims[0])); auto& in1_value = input1.value(); auto& in1_rows = input1.rows(); int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); - PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); + PADDLE_ENFORCE_EQ( + in1_row_numel, input2->numel() / in1_height, + platform::errors::InvalidArgument( + "The two inputs width must be equal." + "But recieved first input width = [%d], second input width = [%d]", + in1_row_numel, input2->numel() / in1_height)); auto* in1_data = in1_value.data(); auto* input2_data = input2->data(); @@ -302,10 +367,12 @@ struct MergeAdd { continue; } PADDLE_ENFORCE_EQ(input_width, input->value().dims()[1], - "all input should have same " - "dimension except for the first one"); + platform::errors::InvalidArgument( + "All inputs should have same " + "dimension except for the first one.")); PADDLE_ENFORCE_EQ(input_height, input->height(), - "all input should have same height"); + platform::errors::InvalidArgument( + "All inputs should have same height.")); row_num += input->rows().size(); merged_row_set.insert(input->rows().begin(), input->rows().end()); } @@ -421,10 +488,12 @@ struct MergeAverage { continue; } PADDLE_ENFORCE_EQ(input_width, input->value().dims()[1], - "all input should have same " - "dimension except for the first one"); + platform::errors::InvalidArgument( + "All inputs should have same " + "dimension except for the first one.")); PADDLE_ENFORCE_EQ(input_height, input->height(), - "all input should have same height"); + platform::errors::InvalidArgument( + "All input should have same height.")); row_num += input->rows().size(); merged_row_set.insert(input->rows().begin(), input->rows().end()); } @@ -492,13 +561,23 @@ struct UpdateToTensor { framework::Tensor* input2) { auto in1_height = input1.height(); auto in2_dims = input2->dims(); - PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); + PADDLE_ENFORCE_EQ( + in1_height, 
in2_dims[0], + platform::errors::InvalidArgument("The two inputs height must be equal." + "But recieved first input height = " + "[%d], second input height = [%d]", + in1_height, in2_dims[0])); auto& in1_value = input1.value(); auto& in1_rows = input1.rows(); int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); - PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); + PADDLE_ENFORCE_EQ( + in1_row_numel, input2->numel() / in1_height, + platform::errors::InvalidArgument( + "The two inputs width must be equal." + "But recieved first input width = [%d], second input width = [%d]", + in1_row_numel, input2->numel() / in1_height)); auto* in1_data = in1_value.data(); auto* input2_data = input2->data(); diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu index 9cce52c6d4587baf01ba22eebc9c57da04c26590..35bd02ad35b71eb7deb3299490fa545ef8b23dc6 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cu +++ b/paddle/fluid/operators/math/selected_rows_functor.cu @@ -30,7 +30,12 @@ struct SelectedRowsAdd { const framework::SelectedRows& input2, framework::SelectedRows* output) { auto in1_height = input1.height(); - PADDLE_ENFORCE_EQ(in1_height, input2.height()); + PADDLE_ENFORCE_EQ( + in1_height, input2.height(), + platform::errors::InvalidArgument("The two inputs height must be equal." 
+ "But recieved first input height = " + "[%d], second input height = [%d]", + in1_height, input2.height())); output->set_height(in1_height); framework::Vector in1_rows(input1.rows()); @@ -48,18 +53,34 @@ struct SelectedRowsAdd { auto& in2_value = input2.value(); auto in1_row_numel = in1_value.numel() / in1_rows.size(); - PADDLE_ENFORCE_EQ(in1_row_numel, in2_value.numel() / in2_rows.size()); - PADDLE_ENFORCE_EQ(in1_row_numel, out_value->numel() / out_rows.size()); + PADDLE_ENFORCE_EQ( + in1_row_numel, in2_value.numel() / in2_rows.size(), + platform::errors::InvalidArgument( + "The two inputs width must be equal." + "But recieved first input width = [%d], second input width = [%d]", + in1_row_numel, in2_value.numel() / in2_rows.size())); + PADDLE_ENFORCE_EQ( + in1_row_numel, out_value->numel() / out_rows.size(), + platform::errors::InvalidArgument( + "The input and oupput width must be equal." + "But recieved input width = [%d], output width = [%d]", + in1_row_numel, out_value->numel() / out_rows.size())); auto* out_data = out_value->data(); auto* in1_data = in1_value.data(); auto in1_place = input1.place(); - PADDLE_ENFORCE_EQ(platform::is_gpu_place(in1_place), true); + PADDLE_ENFORCE_EQ(platform::is_gpu_place(in1_place), true, + platform::errors::InvalidArgument( + "The running enviroment is not on the GPU place.")); auto in2_place = input2.place(); - PADDLE_ENFORCE_EQ(platform::is_gpu_place(in2_place), true); + PADDLE_ENFORCE_EQ(platform::is_gpu_place(in2_place), true, + platform::errors::InvalidArgument( + "The running enviroment is not on the GPU place.")); auto out_place = context.GetPlace(); - PADDLE_ENFORCE_EQ(platform::is_gpu_place(out_place), true); + PADDLE_ENFORCE_EQ(platform::is_gpu_place(out_place), true, + platform::errors::InvalidArgument( + "The running enviroment is not on the GPU place.")); memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, out_place), out_data, BOOST_GET_CONST(platform::CUDAPlace, in1_place), in1_data, @@ -104,15 +125,35 @@ struct 
SelectedRowsAddTensor { auto in1_height = input1.height(); auto in2_dims = input2.dims(); auto out_dims = output->dims(); - PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); - PADDLE_ENFORCE_EQ(in1_height, out_dims[0]); + PADDLE_ENFORCE_EQ( + in1_height, in2_dims[0], + platform::errors::InvalidArgument( + "The two inputs height must be equal." + "But recieved first input height = [%d], first input height = [%d]", + in1_height, in2_dims[0])); + PADDLE_ENFORCE_EQ( + in1_height, out_dims[0], + platform::errors::InvalidArgument( + "The input and output height must be equal." + "But recieved input height = [%d], output height = [%d]", + in1_height, out_dims[0])); auto& in1_value = input1.value(); auto& in1_rows = input1.rows(); int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); - PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height); - PADDLE_ENFORCE_EQ(in1_row_numel, output->numel() / in1_height); + PADDLE_ENFORCE_EQ( + in1_row_numel, input2.numel() / in1_height, + platform::errors::InvalidArgument( + "The two inputs width must be equal." + "But recieved first input width = [%d], second input width = [%d]", + in1_row_numel, input2.numel() / in1_height)); + PADDLE_ENFORCE_EQ( + in1_row_numel, output->numel() / in1_height, + platform::errors::InvalidArgument( + "The input and output width must be equal." + "But recieved input width = [%d], output width = [%d]", + in1_row_numel, output->numel() / in1_height)); auto* in1_data = in1_value.data(); auto* in2_data = input2.data(); @@ -148,7 +189,12 @@ struct SelectedRowsAddTo { const int64_t input2_offset, framework::SelectedRows* input2) { auto in1_height = input1.height(); - PADDLE_ENFORCE_EQ(in1_height, input2->height()); + PADDLE_ENFORCE_EQ( + in1_height, input2->height(), + platform::errors::InvalidArgument("The two inputs height must be equal." 
+ "But recieved first input height = " + "[%d], second input height = [%d]", + in1_height, input2->height())); auto& in1_rows = input1.rows(); auto& in2_rows = *(input2->mutable_rows()); @@ -162,9 +208,13 @@ struct SelectedRowsAddTo { } auto in1_place = input1.place(); - PADDLE_ENFORCE_EQ(platform::is_gpu_place(in1_place), true); + PADDLE_ENFORCE_EQ(platform::is_gpu_place(in1_place), true, + platform::errors::InvalidArgument( + "The running enviroment is not on the GPU place.")); auto in2_place = input2->place(); - PADDLE_ENFORCE_EQ(platform::is_gpu_place(in2_place), true); + PADDLE_ENFORCE_EQ(platform::is_gpu_place(in1_place), true, + platform::errors::InvalidArgument( + "The running enviroment is not on the GPU place.")); auto* in1_data = in1_value.data(); auto* in2_data = in2_value->data(); @@ -209,13 +259,23 @@ struct SelectedRowsAddToTensor { framework::Tensor* input2) { auto in1_height = input1.height(); auto in2_dims = input2->dims(); - PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); + PADDLE_ENFORCE_EQ( + in1_height, in2_dims[0], + platform::errors::InvalidArgument("The two inputs height must be equal." + "But recieved first input height = " + "[%d], second input height = [%d]", + in1_height, in2_dims[0])); auto& in1_value = input1.value(); auto& in1_rows = input1.rows(); int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); - PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); + PADDLE_ENFORCE_EQ( + in1_row_numel, input2->numel() / in1_height, + platform::errors::InvalidArgument( + "The two inputs width must be equal." 
+ "But recieved first input width = [%d], second input width = [%d]", + in1_row_numel, input2->numel() / in1_height)); auto* in1_data = in1_value.data(); auto* in2_data = input2->data(); @@ -340,10 +400,12 @@ struct MergeAdd { continue; } PADDLE_ENFORCE_EQ(input_width, input->value().dims()[1], - "all input should have same " - "dimension except for the first one"); + platform::errors::InvalidArgument( + "All input should have same " + "dimension except for the first one.")); PADDLE_ENFORCE_EQ(input_height, input->height(), - "all input should have same height"); + platform::errors::InvalidArgument( + "All input should have same height.")); merged_row_set.insert(input->rows().begin(), input->rows().end()); } std::vector merge_rows_cpu(merged_row_set.begin(), @@ -448,13 +510,23 @@ struct UpdateToTensor { auto in1_height = merged_in1.height(); auto in2_dims = input2->dims(); - PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); + PADDLE_ENFORCE_EQ( + in1_height, in2_dims[0], + platform::errors::InvalidArgument("The two inputs height must be equal." + "But recieved first input height = " + "[%d], second input height = [%d]", + in1_height, in2_dims[0])); auto& in1_value = merged_in1.value(); auto& in1_rows = merged_in1.rows(); int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); - PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); + PADDLE_ENFORCE_EQ( + in1_row_numel, input2->numel() / in1_height, + platform::errors::InvalidArgument( + "The two inputs width must be equal." 
+ "But recieved first input width = [%d], second input width = [%d]", + in1_row_numel, input2->numel() / in1_height)); auto* in1_data = in1_value.template data(); auto* in2_data = input2->data(); diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc index 74892316e6decdeab3a08396fa2f4bdeb8eb7b73..81ad620466ee3d9fcd9d3e057cfd0dd9053089f0 100644 --- a/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc +++ b/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc @@ -38,7 +38,9 @@ TEST(selected_rows_functor, gpu_add) { {static_cast(rows1.size()), row_numel}), gpu_place); functor(ctx, in1_value, 1.0); - PADDLE_ENFORCE(cudaDeviceSynchronize()); + PADDLE_ENFORCE_EQ(cudaDeviceSynchronize(), 0, + paddle::platform::errors::PreconditionNotMet( + "The all synchronization on the cuda is error!")); std::vector rows2{0, 5, 7, 9}; std::unique_ptr selected_rows2{ diff --git a/paddle/fluid/operators/math/sequence2batch.cc b/paddle/fluid/operators/math/sequence2batch.cc index e4ffeedb5a0061dd60ca3a30aa9928ef8b05887c..300a3692012ab9631d7049d2042e91fb99ad3c21 100644 --- a/paddle/fluid/operators/math/sequence2batch.cc +++ b/paddle/fluid/operators/math/sequence2batch.cc @@ -29,11 +29,24 @@ class CopyMatrixRowsFunctor { auto src_dims = src.dims(); auto dst_dims = dst->dims(); PADDLE_ENFORCE_EQ(src_dims.size(), 2UL, - "The src must be matrix with rank 2."); + platform::errors::InvalidArgument( + "The source tensor must be a matrix with rank 2, but " + "got the source tensor rank is %lu. " + "Please check the rank of the source tensor", + src_dims.size())); PADDLE_ENFORCE_EQ(dst_dims.size(), 2UL, - "The dst must be matrix with rank 2."); - PADDLE_ENFORCE_EQ(src_dims[1], dst_dims[1], - "The width of src and dst must be same."); + platform::errors::InvalidArgument( + "The destination tensor must be a matrix with rank, " + "but got the destination tensor rank is %lu. 
" + "Please check the rank of the destination tensor", + dst_dims.size())); + PADDLE_ENFORCE_EQ( + src_dims[1], dst_dims[1], + platform::errors::InvalidArgument( + "The width of the source tensor and the destination tensor must be " + "same. But got %lu != %lu.Please check the rank of the source " + "tensor", + src_dims.size(), dst_dims.size())); auto height = dst_dims[0]; auto width = dst_dims[1]; auto* src_data = src.data(); diff --git a/paddle/fluid/operators/math/sequence2batch.cu b/paddle/fluid/operators/math/sequence2batch.cu index 9ab13659c1cc5b59d28395bcebcfb43fac5b4544..cd1ca572689bc701da801384e5ed08fe6dc10749 100644 --- a/paddle/fluid/operators/math/sequence2batch.cu +++ b/paddle/fluid/operators/math/sequence2batch.cu @@ -46,11 +46,24 @@ class CopyMatrixRowsFunctor { auto src_dims = src.dims(); auto dst_dims = dst->dims(); PADDLE_ENFORCE_EQ(src_dims.size(), 2, - "The src must be matrix with rank 2."); + platform::errors::InvalidArgument( + "The source tensor must be a matrix with rank 2, but " + "got the source tensor rank is %lu. " + "Please check the rank of the source tensor", + src_dims.size())); PADDLE_ENFORCE_EQ(dst_dims.size(), 2, - "The dst must be matrix with rank 2."); - PADDLE_ENFORCE_EQ(src_dims[1], dst_dims[1], - "The width of src and dst must be same."); + platform::errors::InvalidArgument( + "The destination tensor must be a matrix with rank, " + "but got the destination tensor rank is %lu. " + "Please check the rank of the destination tensor", + dst_dims.size())); + PADDLE_ENFORCE_EQ( + src_dims[1], dst_dims[1], + platform::errors::InvalidArgument( + "The width of the source tensor and the destination tensor must be " + "same. 
But got %lu != %lu.Please check the rank of the source " + "tensor", + src_dims.size(), dst_dims.size())); auto height = dst_dims[0]; auto width = dst_dims[1]; auto* src_data = src.data(); diff --git a/paddle/fluid/operators/math/sequence2batch.h b/paddle/fluid/operators/math/sequence2batch.h index 9d9f7ef00b8a12088225fd3620cb30b43ef9dce9..6aa513e4d10eef49c02417e98b31cddd57088d7c 100644 --- a/paddle/fluid/operators/math/sequence2batch.h +++ b/paddle/fluid/operators/math/sequence2batch.h @@ -64,19 +64,30 @@ class LoDTensor2BatchFunctor { bool is_reverse = false) const { if (!is_cal_batch_lod) { auto lods = batch->lod(); - PADDLE_ENFORCE_GT(lods.size(), 2UL, - "The LoD of LoDTensor should inlcude at least 2-level " - "sequence information."); + PADDLE_ENFORCE_GT( + lods.size(), 2UL, + platform::errors::InvalidArgument( + "The LoD of LoDTensor should inlcude at least 2-level " + "sequence information, but got the LoD level is %lu. Please " + "check the input value.", + lods.size())); PADDLE_ENFORCE_EQ( lods[1].size(), static_cast(lod_tensor.dims()[0]), - "The LoD information should be consistent with the dims."); + platform::errors::InvalidArgument( + "The LoD information should be consistent with the dims, but got " + "%lu != %lu. Please check the input value.", + lods[1].size(), static_cast(lod_tensor.dims()[0]))); CopyMatrixRowsFunctor to_batch; to_batch(context, lod_tensor, lods[1], batch, true); return; } auto lods = lod_tensor.lod(); - PADDLE_ENFORCE_EQ(lods.size(), 1UL, "Only support one level sequence now."); + PADDLE_ENFORCE_EQ(lods.size(), 1UL, + platform::errors::InvalidArgument( + "Only support one level sequence now, but got the " + "LoD level is %lu. 
Please check the input value.", + lods.size())); const auto& lod = lods[0]; @@ -161,12 +172,19 @@ class Batch2LoDTensorFunctor { const framework::LoDTensor& batch, framework::LoDTensor* lod_tensor) const { auto in_lod = batch.lod(); - PADDLE_ENFORCE_GT(in_lod.size(), 2UL, - "The LoD of LoDTensor should inlcude at least 2-level " - "sequence information."); + PADDLE_ENFORCE_GT( + in_lod.size(), 2UL, + platform::errors::InvalidArgument( + "The LoD of LoDTensor should inlcude at least 2-level " + "sequence information, but got the LoD level is %lu. Please check " + "the input value.", + in_lod.size())); PADDLE_ENFORCE_EQ( in_lod[1].size(), static_cast(lod_tensor->dims()[0]), - "The LoD information should be consistent with the dims."); + platform::errors::InvalidArgument( + "The LoD information should be consistent with the dims, but got " + "%lu != %lu. Please check the input value.", + in_lod[1].size(), static_cast(lod_tensor->dims()[0]))); CopyMatrixRowsFunctor to_seq; to_seq(context, batch, in_lod[1], lod_tensor, false); } diff --git a/paddle/fluid/operators/math/sequence_padding.cc b/paddle/fluid/operators/math/sequence_padding.cc index 4630689dec160da145e607f662a802444ac98b55..076df0176429c7bbd350698af0137fbcca18f806 100644 --- a/paddle/fluid/operators/math/sequence_padding.cc +++ b/paddle/fluid/operators/math/sequence_padding.cc @@ -35,7 +35,11 @@ void CopyValidData(framework::Tensor* dst_tensor, int valid_seq_len = seq_offsets[seq_idx + 1] - seq_offsets[seq_idx]; PADDLE_ENFORCE_GE( pad_seq_len, valid_seq_len, - "The padded sequence length can not be less than its original length."); + platform::errors::InvalidArgument( + "The padded sequence length can not " + "be less than its original length. Expected %ld >= %ld, but got " + "%ld < %ld. Please check input value.", + pad_seq_len, valid_seq_len, pad_seq_len, valid_seq_len)); int seq_data_offset = seq_offsets[seq_idx] * step_width; int pad_data_offset = layout == kBatchLengthWidth ? 
seq_idx * pad_seq_len * step_width @@ -95,9 +99,14 @@ class PaddingLoDTensorFunctor { CheckDims(seq_tensor_dims, pad_tensor_dims, seq_offsets, pad_seq_len, step_width, layout); - PADDLE_ENFORCE(pad_value.numel() == 1 || pad_value.numel() == step_width, - "The numel of 'pad_value' can only be 1 or be equal to the " - "'step_width'."); + + PADDLE_ENFORCE_EQ( + pad_value.numel() == 1 || pad_value.numel() == step_width, true, + platform::errors::InvalidArgument( + "The numel of 'pad_value' can only be 1 or be equal to the " + "'step_width', but got %ld != 1 and %ld. Please check the input " + "value.", + pad_value.numel(), step_width)); // fill padding value T* pad_data = pad_tensor->data(); diff --git a/paddle/fluid/operators/math/sequence_padding.cu b/paddle/fluid/operators/math/sequence_padding.cu index 1b433067900af71bb8a6833cef019d41f9c76858..19c3af03411b8ce95d274532707c7ee3e93f1d55 100644 --- a/paddle/fluid/operators/math/sequence_padding.cu +++ b/paddle/fluid/operators/math/sequence_padding.cu @@ -66,17 +66,25 @@ class PaddingLoDTensorFunctor { if (pad_seq_len == -1) { pad_seq_len = max_seq_len; } - PADDLE_ENFORCE_GE(pad_seq_len, max_seq_len, - "The pad_seq_len must be equal to or greater than the " - "original max sequence length."); + PADDLE_ENFORCE_GE( + pad_seq_len, max_seq_len, + platform::errors::InvalidArgument( + "The pad_seq_len must be equal to or greater than the " + "original max sequence length. Expected %ld >= %ld, but got %ld < " + "%ld. 
Please check the input value.", + pad_seq_len, max_seq_len, pad_seq_len, max_seq_len)); int step_width = seq_tensor.numel() / seq_tensor_dims[0]; int seq_num = seq_offsets.size() - 1; CheckDims(seq_tensor_dims, pad_tensor_dims, seq_offsets, pad_seq_len, step_width, layout); - PADDLE_ENFORCE(pad_value.numel() == 1 || pad_value.numel() == step_width, - "The numel of 'pad_value' can only be 1 or be equal to the " - "'step_width'."); + PADDLE_ENFORCE_EQ( + pad_value.numel() == 1 || pad_value.numel() == step_width, true, + platform::errors::InvalidArgument( + "The numel of 'pad_value' can only be 1 or be equal to " + "the 'step_width', but got %ld != 1 and %ld. Please check the " + "input value.", + pad_value.numel(), step_width)); const int kBlockSize = 512; diff --git a/paddle/fluid/operators/math/sequence_padding.h b/paddle/fluid/operators/math/sequence_padding.h index 5580ee5374658c3b7b8e31962cd50f1d72113ba0..956a4ff6a2d45cb619183f9beba1b7e35b7f229c 100644 --- a/paddle/fluid/operators/math/sequence_padding.h +++ b/paddle/fluid/operators/math/sequence_padding.h @@ -52,14 +52,25 @@ inline static void CheckDims(const framework::DDim& seq_tensor_dims, const framework::Vector& seq_offset, int64_t padded_seq_len, int64_t step_width, const PadLayout& layout) { - PADDLE_ENFORCE_EQ(static_cast(seq_tensor_dims[0]), seq_offset.back(), - "Value of 1st dimension of the sequence tensor should be " - "equal to sum of lengths of all sequences."); + PADDLE_ENFORCE_EQ( + static_cast(seq_tensor_dims[0]), seq_offset.back(), + platform::errors::InvalidArgument( + "Value of 1st dimension of the sequence tensor should be " + "equal to sum of lengths of all sequences. Expected %ld == %ld, but " + "got %ld != %ld. 
Please check the input value.", + static_cast(seq_tensor_dims[0]), seq_offset.back(), + static_cast(seq_tensor_dims[0]), seq_offset.back())); - PADDLE_ENFORCE(seq_tensor_dims.size() + 1 == pad_tensor_dims.size() || - seq_tensor_dims.size() == pad_tensor_dims.size(), - "pad_tensor's rank should be 1 greater than seq_tensor's " - "rank, or be equal with it."); + PADDLE_ENFORCE_EQ( + seq_tensor_dims.size() + 1 == pad_tensor_dims.size() || + seq_tensor_dims.size() == pad_tensor_dims.size(), + true, platform::errors::InvalidArgument( + "pad_tensor's rank should be 1 greater than seq_tensor's " + "rank, or be equal with it. The pad_tensor's rank is %ld, " + "expected the seq_tensor's rank is %ld or %ld, but got %ld. " + "Please check the input value.", + pad_tensor_dims.size(), pad_tensor_dims.size(), + pad_tensor_dims.size() - 1, seq_tensor_dims.size())); } /* diff --git a/paddle/fluid/operators/math/sequence_pooling.cc b/paddle/fluid/operators/math/sequence_pooling.cc index cc3fbd587668b17b7edde50b157adca83e81eddc..2eee4d0a6c14e8b6134b71294745c71302450347 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cc +++ b/paddle/fluid/operators/math/sequence_pooling.cc @@ -42,15 +42,29 @@ class MaxSeqPoolFunctor { auto out_dims = output->dims(); auto idx_dims = index->dims(); PADDLE_ENFORCE_GT(in_dims.size(), 1, - "The rank of input shall be greater than 1."); + platform::errors::InvalidArgument( + "The rank of input shall be greater than 1, but got " + "the rank is %ld. Please check the input value", + in_dims.size())); PADDLE_ENFORCE_GT(out_dims.size(), 1, - "The rank of output shall be greater than 1."); + platform::errors::InvalidArgument( + "The rank of output shall be greater than 1, but got " + "the rank is %ld. 
Please check the input value", + out_dims.size())); for (int64_t i = 1; i < in_dims.size(); ++i) { - PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i], - "The dimension of input and output shall be same."); + PADDLE_ENFORCE_EQ( + in_dims[i], out_dims[i], + platform::errors::InvalidArgument( + "The dimension of input and output shall be same. Expected %ld " + "== %ld, but got %ld != %ld. Please check the input value.", + in_dims[i], out_dims[i], in_dims[i], out_dims[i])); } - PADDLE_ENFORCE_EQ(idx_dims, out_dims, - "The dimension of index and output shall be same."); + PADDLE_ENFORCE_EQ( + idx_dims, out_dims, + platform::errors::InvalidArgument( + "The dimension of index and output shall be same. Expected %ld == " + "%ld, but got %ld != %ld. Please check the input value.", + idx_dims, out_dims, idx_dims, out_dims)); auto lod_level = input.lod().size(); auto starts = input.lod()[lod_level - 1]; @@ -94,12 +108,22 @@ class MaxSeqPoolFunctor { auto in_dims = input.dims(); auto out_dims = output->dims(); PADDLE_ENFORCE_GT(in_dims.size(), 1, - "The rank of input shall be greater than 1."); + platform::errors::InvalidArgument( + "The rank of input shall be greater than 1, but got " + "%ld <= 1. Please check the input value.", + in_dims.size())); PADDLE_ENFORCE_GT(out_dims.size(), 1, - "The rank of output shall be greater than 1."); + platform::errors::InvalidArgument( + "The rank of output shall be greater than 1, but got " + "%ld <= 1. Please check the input value.", + out_dims.size())); for (int64_t i = 1; i < in_dims.size(); ++i) { - PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i], - "The dimension of input and output shall be same."); + PADDLE_ENFORCE_EQ( + in_dims[i], out_dims[i], + platform::errors::InvalidArgument( + "The dimension of input and output shall be same. Expected %ld " + "== %ld, but got %ld != %ld. 
Please check the input value.", + in_dims[i], out_dims[i], in_dims[i], out_dims[i])); } auto lod_level = input.lod().size(); @@ -139,16 +163,29 @@ class MaxSeqPoolGradFunctor { auto ig_dims = in_grad->dims(); auto idx_dims = index.dims(); PADDLE_ENFORCE_GT(og_dims.size(), 1, - "The rank of output@Grad shall be greater than 1."); + platform::errors::InvalidArgument( + "The rank of output@Grad shall be greater than 1, " + "but got %ld <= 1. Please check the input value.", + og_dims.size())); PADDLE_ENFORCE_GT(ig_dims.size(), 1, - "The rank of input@Grad shall be greater than 1."); + platform::errors::InvalidArgument( + "The rank of input@Grad shall be greater than 1, but " + "got %ld <= 1. Please check the input value.", + ig_dims.size())); for (int64_t i = 1; i < og_dims.size(); ++i) { - PADDLE_ENFORCE_EQ( - og_dims[i], ig_dims[i], - "The dimension of input@Grad and output@Grad shall be same."); + PADDLE_ENFORCE_EQ(og_dims[i], ig_dims[i], + platform::errors::InvalidArgument( + "The dimension of input@Grad and output@Grad shall " + "be same. Expected %ld == %ld, but got %ld != %ld. " + "Please check the input value.", + og_dims[i], ig_dims[i], og_dims[i], ig_dims[i])); } - PADDLE_ENFORCE_EQ(idx_dims, og_dims, - "The dimension of index and output@Grad shall be same."); + PADDLE_ENFORCE_EQ( + idx_dims, og_dims, + platform::errors::InvalidArgument( + "The dimension of index and output@Grad shall be same. Expected " + "%ld == %ld, but got %ld != %ld. 
Please check the input value.", + idx_dims, og_dims, idx_dims, og_dims)); const T* og_data = out_grad.data(); const int* max_index = index.data(); @@ -244,9 +281,12 @@ class SumSeqPoolGradFunctor { auto lod = in_grad->lod()[lod_level - 1]; int64_t out_w = out_grad.numel() / out_grad.dims()[0]; int64_t in_w = in_grad->numel() / in_grad->dims()[0]; - PADDLE_ENFORCE_EQ( - in_w, out_w, - "The feature size of input@Grad and output@Grad shall be same."); + PADDLE_ENFORCE_EQ(in_w, out_w, + platform::errors::InvalidArgument( + "The feature size of input@Grad and output@Grad " + "shall be same. Expected %ld == %ld, but got %ld != " + "%ld. Please check the input value.", + in_w, out_w, in_w, out_w)); const T* out_g_data = out_grad.data(); T* in_g_data = in_grad->mutable_data(context.GetPlace()); auto blas = math::GetBlas(context); @@ -298,7 +338,8 @@ class SequencePoolFunctor { auto place = context.GetPlace(); PADDLE_ENFORCE_EQ( platform::is_cpu_place(place), true, - "Sequence_pool should run on CPU Device when pooltype is SUM"); + platform::errors::InvalidArgument( + "Sequence_pool should run on CPU Device when pooltype is SUM")); const T* src = input.data(); T* dst = output->mutable_data(place); jit::seq_pool_attr_t attr( @@ -342,7 +383,10 @@ class SequencePoolFunctor { out_e.device(place) = in_e.sum(Eigen::array({{0}})) / std::sqrt(static_cast(h)); } else { - PADDLE_THROW("unsupported pooling pooltype"); + PADDLE_THROW(platform::errors::InvalidArgument( + "unsupported pooling pooltype: %s. Only support \"AVERAGE\" and " + "\"SQRT\"", + pooltype)); } } } @@ -400,7 +444,10 @@ class SequencePoolGradFunctor { } else if (pooltype == "FIRST") { in_g_e.chip(0, 0).device(place) = out_g_e_v; } else { - PADDLE_THROW("unsupported pooling pooltype"); + PADDLE_THROW(platform::errors::InvalidArgument( + "unsupported pooling pooltype: %s. 
Only support \"AVERAGE\", " + "\"SQRT\", \"LAST\" and \"FIRST\"", + pooltype)); } } } diff --git a/paddle/fluid/operators/math/sequence_pooling.cu b/paddle/fluid/operators/math/sequence_pooling.cu index 422b06c70eb2107659666edf58223ae8e4666b1d..cba8dd935ef1b3625f5f68578e411aec81eaa4f4 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cu +++ b/paddle/fluid/operators/math/sequence_pooling.cu @@ -205,7 +205,10 @@ class SequencePoolFunctor { lod.CUDAData(context.GetPlace()), lod.size(), item_dim, output->mutable_data(context.GetPlace()), nullptr); } else { - PADDLE_THROW("unsupported pooling pooltype"); + PADDLE_THROW(platform::errors::InvalidArgument( + "unsupported pooling pooltype: %s. Only support \"MAX\", " + "\"AVERAGE\", \"SUM\", \"SQRT\", \"LAST\" and \"FIRST\"", + pooltype)); } } }; @@ -370,7 +373,10 @@ class SequencePoolGradFunctor { in_grad->mutable_data(context.GetPlace()), nullptr); } else { - PADDLE_THROW("unsupported pooling pooltype"); + PADDLE_THROW(platform::errors::InvalidArgument( + "unsupported pooling pooltype: %s. Only support \"MAX\", " + "\"AVERAGE\", \"SUM\", \"SQRT\", \"LAST\" and \"FIRST\"", + pooltype)); } } }; diff --git a/paddle/fluid/operators/math/sequence_pooling_test.cc b/paddle/fluid/operators/math/sequence_pooling_test.cc index efab1a375b56bea3caec2c8169dc390298a37cbe..4b5f484e52c6acb2d7fb2cea6265c8dd7826571b 100644 --- a/paddle/fluid/operators/math/sequence_pooling_test.cc +++ b/paddle/fluid/operators/math/sequence_pooling_test.cc @@ -50,9 +50,21 @@ void TestSequencePoolingSum(const DeviceContext &context, in_grad.mutable_data(in_dims, place); // check tensor contruction result - PADDLE_ENFORCE_EQ(in_grad.dims().size(), out_grad.dims().size()); + PADDLE_ENFORCE_EQ( + in_grad.dims().size(), out_grad.dims().size(), + paddle::platform::errors::InvalidArgument( + "The dimension of input and output shall be same. Expected %ld == " + "%ld, but got %ld != %ld. 
Please check the input value.", + in_grad.dims().size(), out_grad.dims().size(), in_grad.dims().size(), + out_grad.dims().size())); for (int64_t i = 1; i < out_grad.dims().size(); ++i) { - PADDLE_ENFORCE_EQ(in_grad.dims()[i], out_grad.dims()[i]); + PADDLE_ENFORCE_EQ( + in_grad.dims()[i], out_grad.dims()[i], + paddle::platform::errors::InvalidArgument( + "The dimension of input and output shall be same. Expected %ld == " + "%ld, but got %ld != %ld. Please check the input value.", + in_grad.dims()[i], out_grad.dims()[i], in_grad.dims()[i], + out_grad.dims()[i])); } // call functor diff --git a/paddle/fluid/operators/math/tree2col.cc b/paddle/fluid/operators/math/tree2col.cc index cafcf6319326cc9a496de8ee6aa1033e1320f4b0..0344226ea66e2a64fce0574cb45a8c2ce918359c 100644 --- a/paddle/fluid/operators/math/tree2col.cc +++ b/paddle/fluid/operators/math/tree2col.cc @@ -55,7 +55,11 @@ void Tree2ColUtil::construct_tree(const paddle::Tensor &EdgeSet, std::vector> *tr, size_t *node_count) { auto edge_set_dims = EdgeSet.dims(); - PADDLE_ENFORCE_EQ(edge_set_dims[1], 2); + PADDLE_ENFORCE_EQ(edge_set_dims[1], 2, + platform::errors::InvalidArgument( + "The second dimension of the EdgeSet shall be 2, but " + "got %ld != 2. 
Please check the input value.", + edge_set_dims[1])); int64_t edge_count = EdgeSet.numel(); const int *edge_data = EdgeSet.data(); diff --git a/paddle/fluid/operators/math/unpooling.cc b/paddle/fluid/operators/math/unpooling.cc index 13f0845bb8579615381c06072eb1e32507ffd3cf..9ad2ec5005203d02032aaa6f7e48b27f52631059 100644 --- a/paddle/fluid/operators/math/unpooling.cc +++ b/paddle/fluid/operators/math/unpooling.cc @@ -37,7 +37,13 @@ class Unpool2dMaxFunctor { for (int c = 0; c < output_channels; ++c) { for (int i = 0; i < input_feasize; ++i) { int index = indices_data[i]; - PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!"); + PADDLE_ENFORCE_LT( + index, output_feasize, + platform::errors::InvalidArgument( + "index should less than output tensor height * output tensor " + "width. Expected %ld < %ld, but got " + "%ld >= %ld. Please check input value.", + index, output_feasize, index, output_feasize)); output_data[index] = input_data[i]; } input_data += input_feasize; @@ -72,7 +78,13 @@ class Unpool2dMaxGradFunctor { for (int c = 0; c < output_channels; ++c) { for (int i = 0; i < input_feasize; ++i) { int index = indices_data[i]; - PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!"); + PADDLE_ENFORCE_LT( + index, output_feasize, + platform::errors::InvalidArgument( + "index should less than output tensor height * output tensor " + "width. Expected %ld < %ld, but got " + "%ld >= %ld. 
Please check input value.", + index, output_feasize, index, output_feasize)); input_grad_data[i] = output_grad_data[index]; } input_grad_data += input_feasize; diff --git a/paddle/fluid/operators/math/vol2col.cc b/paddle/fluid/operators/math/vol2col.cc index 01f50727b442579fa62059560d0c75d329d6e288..794fc647172b040d4e926144a87b84eb4e5216b0 100644 --- a/paddle/fluid/operators/math/vol2col.cc +++ b/paddle/fluid/operators/math/vol2col.cc @@ -35,9 +35,14 @@ class Vol2ColFunctor { const std::vector& paddings, framework::Tensor* col, const DataLayout data_layout) const { PADDLE_ENFORCE_EQ(vol.dims().size(), 4, - "The dimension of vol should be 4."); + platform::errors::InvalidArgument( + "The dimension of vol should be 4, but received %d.", + vol.dims().size())); + PADDLE_ENFORCE_EQ(col->dims().size(), 7, - "The dimension of col should be 7."); + platform::errors::InvalidArgument( + "The dimension of col should be 7, but received %d.", + col->dims().size())); int input_channels = (data_layout != DataLayout::kNHWC ? vol.dims()[0] : vol.dims()[3]); @@ -65,27 +70,33 @@ class Vol2ColFunctor { int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2]; int pad_w_right = paddings_size_is_6 ? 
paddings[5] : paddings[2]; - PADDLE_ENFORCE_EQ((input_depth + pad_d_forth + pad_d_back - - ((dilations[0] * (filter_depth - 1) + 1))) / - strides[0] + - 1, - output_depth, - "input_depth and output_depth are " - "mismatching."); - PADDLE_ENFORCE_EQ((input_height + pad_h_up + pad_h_down - - ((dilations[1] * (filter_height - 1) + 1))) / - strides[1] + - 1, - output_height, - "input_height and output_height are " - "mismatching."); - PADDLE_ENFORCE_EQ((input_width + pad_w_left + pad_w_right - - ((dilations[2] * (filter_width - 1) + 1))) / - strides[2] + - 1, - output_width, - "input_width and output_width are " - "mismatching."); + auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back - + ((dilations[0] * (filter_depth - 1) + 1))) / + strides[0] + + 1; + PADDLE_ENFORCE_EQ( + input_depth_tmp, output_depth, + platform::errors::InvalidArgument( + "input_depth(%d) and output_depth(%d) are mismatching.", + input_depth_tmp, output_depth)); + auto input_height_tmp = (input_height + pad_h_up + pad_h_down - + ((dilations[1] * (filter_height - 1) + 1))) / + strides[1] + + 1; + PADDLE_ENFORCE_EQ( + input_height_tmp, output_height, + platform::errors::InvalidArgument( + "input_height(%d) and output_height(%d) are mismatching.", + input_height_tmp, output_height)); + auto input_width_tmp = (input_width + pad_w_left + pad_w_right - + ((dilations[2] * (filter_width - 1) + 1))) / + strides[2] + + 1; + PADDLE_ENFORCE_EQ( + input_width_tmp, output_width, + platform::errors::InvalidArgument( + "input_width(%d) and output_width(%d) are mismatching.", + input_width_tmp, output_width)); const T* vol_data = vol.data(); T* col_data = col->data(); @@ -141,9 +152,14 @@ class Col2VolFunctor { const std::vector& paddings, framework::Tensor* vol, const DataLayout data_layout) const { PADDLE_ENFORCE_EQ(vol->dims().size(), 4, - "The dimension of vol should be 4."); + platform::errors::InvalidArgument( + "The dimension of vol should be 4, but received %d.", + vol->dims().size())); + 
PADDLE_ENFORCE_EQ(col.dims().size(), 7, - "The dimension of col should be 7."); + platform::errors::InvalidArgument( + "The dimension of col should be 7, but received %d.", + col.dims().size())); int input_channels = (data_layout != DataLayout::kNHWC ? vol->dims()[0] : vol->dims()[3]); @@ -170,27 +186,33 @@ class Col2VolFunctor { int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2]; int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2]; - PADDLE_ENFORCE_EQ((input_depth + pad_d_forth + pad_d_back - - ((dilations[0] * (filter_depth - 1) + 1))) / - strides[0] + - 1, - output_depth, - "input_depth and output_depth are " - "mismatching."); - PADDLE_ENFORCE_EQ((input_height + pad_h_up + pad_h_down - - ((dilations[1] * (filter_height - 1) + 1))) / - strides[1] + - 1, - output_height, - "input_height and output_height are " - "mismatching."); - PADDLE_ENFORCE_EQ((input_width + pad_w_left + pad_w_right - - ((dilations[2] * (filter_width - 1) + 1))) / - strides[2] + - 1, - output_width, - "input_width and output_width are " - "mismatching."); + auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back - + ((dilations[0] * (filter_depth - 1) + 1))) / + strides[0] + + 1; + PADDLE_ENFORCE_EQ( + input_depth_tmp, output_depth, + platform::errors::InvalidArgument( + "input_depth(%d) and output_depth(%d) are mismatching.", + input_depth_tmp, output_depth)); + auto input_height_tmp = (input_height + pad_h_up + pad_h_down - + ((dilations[1] * (filter_height - 1) + 1))) / + strides[1] + + 1; + PADDLE_ENFORCE_EQ( + input_height_tmp, output_height, + platform::errors::InvalidArgument( + "input_height(%d) and output_height(%d) are mismatching.", + input_height_tmp, output_height)); + auto input_width_tmp = (input_width + pad_w_left + pad_w_right - + ((dilations[2] * (filter_width - 1) + 1))) / + strides[2] + + 1; + PADDLE_ENFORCE_EQ( + input_width_tmp, output_width, + platform::errors::InvalidArgument( + "input_width(%d) and output_width(%d) are mismatching.", 
+ input_width_tmp, output_width)); T* vol_data = vol->data(); const T* col_data = col.data(); diff --git a/paddle/fluid/operators/math/vol2col.cu b/paddle/fluid/operators/math/vol2col.cu index 9de9051f512348f2567bfc35ae775b1852ed25fc..eca39e919737210267d7af1856903d3e1fc697d1 100644 --- a/paddle/fluid/operators/math/vol2col.cu +++ b/paddle/fluid/operators/math/vol2col.cu @@ -91,9 +91,13 @@ class Vol2ColFunctor { const std::vector& paddings, framework::Tensor* col, const DataLayout data_layout) const { PADDLE_ENFORCE_EQ(vol.dims().size(), 4, - "The dimension of vol should be 4."); + platform::errors::InvalidArgument( + "The dimension of vol should be 4, but received %d.", + vol.dims().size())); PADDLE_ENFORCE_EQ(col->dims().size(), 7, - "The dimension of col should be 7."); + platform::errors::InvalidArgument( + "The dimension of col should be 7, but received %d.", + col->dims().size())); int input_channels = (data_layout != DataLayout::kNHWC ? vol.dims()[0] : vol.dims()[3]); @@ -117,27 +121,33 @@ class Vol2ColFunctor { int pad_h_down = paddings_size_is_6 ? paddings[3] : paddings[1]; int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2]; int pad_w_right = paddings_size_is_6 ? 
paddings[5] : paddings[2]; - PADDLE_ENFORCE_EQ((input_depth + pad_d_forth + pad_d_back - - ((dilations[0] * (filter_depth - 1) + 1))) / - strides[0] + - 1, - output_depth, - "input_depth and output_depth are " - "mismatching."); - PADDLE_ENFORCE_EQ((input_height + pad_h_up + pad_h_down - - ((dilations[1] * (filter_height - 1) + 1))) / - strides[1] + - 1, - output_height, - "input_height and output_height are " - "mismatching."); - PADDLE_ENFORCE_EQ((input_width + pad_w_left + pad_w_right - - ((dilations[2] * (filter_width - 1) + 1))) / - strides[2] + - 1, - output_width, - "input_width and output_width are " - "mismatching."); + auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back - + ((dilations[0] * (filter_depth - 1) + 1))) / + strides[0] + + 1; + PADDLE_ENFORCE_EQ( + input_depth_tmp, output_depth, + platform::errors::InvalidArgument( + "input_depth(%d) and output_depth(%d) are mismatching.", + input_depth_tmp, output_depth)); + auto input_height_tmp = (input_height + pad_h_up + pad_h_down - + ((dilations[1] * (filter_height - 1) + 1))) / + strides[1] + + 1; + PADDLE_ENFORCE_EQ( + input_height_tmp, output_height, + platform::errors::InvalidArgument( + "input_height(%d) and output_height(%d) are mismatching.", + input_height_tmp, output_height)); + auto input_width_tmp = (input_width + pad_w_left + pad_w_right - + ((dilations[2] * (filter_width - 1) + 1))) / + strides[2] + + 1; + PADDLE_ENFORCE_EQ( + input_width_tmp, output_width, + platform::errors::InvalidArgument( + "input_width(%d) and output_width(%d) are mismatching.", + input_width_tmp, output_width)); int num_outputs = input_channels * output_depth * output_height * output_width; @@ -242,9 +252,13 @@ class Col2VolFunctor { const std::vector& paddings, framework::Tensor* vol, const DataLayout data_layout) const { PADDLE_ENFORCE_EQ(vol->dims().size(), 4, - "The dimension of vol should be 4."); + platform::errors::InvalidArgument( + "The dimension of vol should be 4, but received %d.", + 
vol->dims().size())); PADDLE_ENFORCE_EQ(col.dims().size(), 7, - "The dimension of col should be 7."); + platform::errors::InvalidArgument( + "The dimension of col should be 7, but received %d.", + col.dims().size())); int input_channels = (data_layout != DataLayout::kNHWC ? vol->dims()[0] : vol->dims()[3]); @@ -269,27 +283,33 @@ class Col2VolFunctor { int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2]; int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2]; - PADDLE_ENFORCE_EQ((input_depth + pad_d_forth + pad_d_back - - ((dilations[0] * (filter_depth - 1) + 1))) / - strides[0] + - 1, - output_depth, - "input_depth and output_depth are " - "mismatching."); - PADDLE_ENFORCE_EQ((input_height + pad_h_up + pad_h_down - - ((dilations[1] * (filter_height - 1) + 1))) / - strides[1] + - 1, - output_height, - "input_height and output_height are " - "mismatching."); - PADDLE_ENFORCE_EQ((input_width + pad_w_left + pad_w_right - - ((dilations[2] * (filter_width - 1) + 1))) / - strides[2] + - 1, - output_width, - "input_width and output_width are " - "mismatching."); + auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back - + ((dilations[0] * (filter_depth - 1) + 1))) / + strides[0] + + 1; + PADDLE_ENFORCE_EQ( + input_depth_tmp, output_depth, + platform::errors::InvalidArgument( + "input_depth(%d) and output_depth(%d) are mismatching.", + input_depth_tmp, output_depth)); + auto input_height_tmp = (input_height + pad_h_up + pad_h_down - + ((dilations[1] * (filter_height - 1) + 1))) / + strides[1] + + 1; + PADDLE_ENFORCE_EQ( + input_height_tmp, output_height, + platform::errors::InvalidArgument( + "input_height(%d) and output_height(%d) are mismatching.", + input_height_tmp, output_height)); + auto input_width_tmp = (input_width + pad_w_left + pad_w_right - + ((dilations[2] * (filter_width - 1) + 1))) / + strides[2] + + 1; + PADDLE_ENFORCE_EQ( + input_width_tmp, output_width, + platform::errors::InvalidArgument( + "input_width(%d) and 
output_width(%d) are mismatching.", + input_width_tmp, output_width)); int num_kernels = input_channels * input_depth * input_height * input_width; diff --git a/paddle/fluid/operators/merge_lod_tensor_op.cc b/paddle/fluid/operators/merge_lod_tensor_op.cc index c9b852cfc05127a4bbf00ea23a751c59dc2d109d..87d914aa79753fbdc9d859c43bbf749b3ddf95cf 100644 --- a/paddle/fluid/operators/merge_lod_tensor_op.cc +++ b/paddle/fluid/operators/merge_lod_tensor_op.cc @@ -44,8 +44,10 @@ class MergeLoDTensorOp : public framework::OperatorBase { scope.FindVar(Output("Out"))->GetMutable(); auto level = static_cast(Attr("level")); - PADDLE_ENFORCE(in_true.numel() || in_false.numel(), - "Input(InTrue) or Input(InFalse) should be initialized."); + PADDLE_ENFORCE_EQ( + in_true.numel() || in_false.numel(), true, + platform::errors::InvalidArgument( + "Input(InTrue) or Input(InFalse) should be initialized.")); auto &mask_dim = mask.dims(); std::unique_ptr cpu_mask{new framework::LoDTensor()}; @@ -56,7 +58,9 @@ class MergeLoDTensorOp : public framework::OperatorBase { framework::TensorCopy(mask, platform::CPUPlace(), dev_ctx, cpu_mask.get()); #else - PADDLE_THROW("Not supported GPU, Please compile WITH_GPU option"); + PADDLE_THROW(platform::errors::PreconditionNotMet( + "Not supported GPU, Please recompile or reinstall paddle with CUDA " + "support.")); #endif } auto *mask_data = cpu_mask->data(); @@ -109,7 +113,11 @@ class MergeLoDTensorOp : public framework::OperatorBase { size_t start_offset = lod_and_offset.second.first; size_t end_offset = lod_and_offset.second.second; - PADDLE_ENFORCE_GE(end_offset, start_offset); + PADDLE_ENFORCE_GE(end_offset, start_offset, + platform::errors::InvalidArgument( + "The end offset less than start offset, end offset " + "is %d, start offset is %d.", + end_offset, start_offset)); size_t len = end_offset - start_offset; if (len == 0) { continue; @@ -189,22 +197,24 @@ class MergeLoDTensorInferShape : public framework::InferShapeBase { "merge_lod_tensor"); 
auto mask_dim = context->GetInputDim("Mask"); PADDLE_ENFORCE_EQ(mask_dim.size(), 2, - "If you are using IfElse OP:" - "\n\nie = fluid.layers.IfElse(cond=cond)\nwith " - "ie.true_block():\n out_1 = ie.input(x)\n\n" - "Please ensure that the cond should be a 2-D tensor and " - "the second dim size of cond should be 1. " - "But now the cond's shape is [", - *mask_dim.Get(), "].\n"); + platform::errors::InvalidArgument( + "If you are using IfElse OP:" + "\n\nie = fluid.layers.IfElse(cond=cond)\nwith " + "ie.true_block():\n out_1 = ie.input(x)\n\n" + "Please ensure that the cond is a 2-D tensor and " + "the second dim size of cond is 1. " + "But now the cond's shape is [%s].\n", + mask_dim)); if (context->IsRuntime() || mask_dim[1] > 0) { PADDLE_ENFORCE_EQ(mask_dim[1], 1, - "If you are using IfElse OP:" - "\n\nie = fluid.layers.IfElse(cond=cond)\nwith " - "ie.true_block():\n out_1 = ie.input(x)\n\n" - "Please ensure that the cond should be a 2-D tensor " - "and the second dim size of cond should be 1. " - "But now the cond's shape is [", - *mask_dim.Get(), "].\n"); + platform::errors::InvalidArgument( + "If you are using IfElse OP:" + "\n\nie = fluid.layers.IfElse(cond=cond)\nwith " + "ie.true_block():\n out_1 = ie.input(x)\n\n" + "Please ensure that the cond is a 2-D tensor " + "and the second dim size of cond is 1. 
" + "But now the cond's shape is [%s].\n", + mask_dim)); } context->SetOutputDim("Out", context->GetInputDim("InTrue")); diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc index 5ca9216d0c8d6b3f773a1eb1a0cec216ca6ed4f3..487deb11b48687a91174c8d9baf072a5ca929de8 100644 --- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc @@ -33,10 +33,12 @@ class MKLDNNActivationKernel public: void Compute(const framework::ExecutionContext &ctx) const override { const auto *x = ctx.Input("X"); - PADDLE_ENFORCE_EQ(x->layout(), DataLayout::kMKLDNN, - "Wrong layout set for X tensor"); - PADDLE_ENFORCE_NE(x->format(), MKLDNNMemoryFormat::undef, - "Wrong format set for X tensor"); + PADDLE_ENFORCE_EQ( + x->layout(), DataLayout::kMKLDNN, + platform::errors::InvalidArgument("Wrong layout set for X tensor")); + PADDLE_ENFORCE_NE( + x->format(), MKLDNNMemoryFormat::undef, + platform::errors::InvalidArgument("Wrong format set for X tensor")); Functor functor; functor(ctx); @@ -50,9 +52,11 @@ class MKLDNNActivationGradKernel void Compute(const framework::ExecutionContext &ctx) const override { const auto *diff_y = ctx.Input(framework::GradVarName("Out")); PADDLE_ENFORCE_EQ(diff_y->layout(), DataLayout::kMKLDNN, - "Wrong layout set for Input OutGrad tensor"); + platform::errors::InvalidArgument( + "Wrong layout set for Input OutGrad tensor")); PADDLE_ENFORCE_NE(diff_y->format(), MKLDNNMemoryFormat::undef, - "Wrong format set for Input OutGrad tensor"); + platform::errors::InvalidArgument( + "Wrong format set for Input OutGrad tensor")); Functor functor; functor(ctx); @@ -82,7 +86,7 @@ void eltwise_forward(const framework::ExecutionContext &ctx, PADDLE_ENFORCE( x->dims().size() == 2 || x->dims().size() == 3 || x->dims().size() == 4, - "Input dim must be with 2, 3 or 4"); + platform::errors::Unimplemented("Input dim must be with 2, 3 or 4")); auto src_tz = 
framework::vectorize(x->dims()); diff --git a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc index 33cf00b2c01da8e346e4c7e6be81fce3fd47f54f..8a02a697cbb21b28e14f19c6202ae0777b5102de 100644 --- a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc @@ -262,9 +262,11 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { auto *diff_shift = ctx.Output(framework::GradVarName("Bias")); PADDLE_ENFORCE_EQ(diff_y->layout(), DataLayout::kMKLDNN, - "Wrong layout set for Input diff_y tensor"); + platform::errors::InvalidArgument( + "Wrong layout set for Input diff_y tensor")); PADDLE_ENFORCE_NE(diff_y->format(), MKLDNNMemoryFormat::undef, - "Wrong format set for Input diff_y tensor"); + platform::errors::InvalidArgument( + "Wrong format set for Input diff_y tensor")); auto src_tz = paddle::framework::vectorize(x->dims()); auto scale_tz = paddle::framework::vectorize(scale->dims()); diff --git a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc index 40f64800a0b81a161805857cb3e0a3855f386720..b2815cbdc65b53beba9cdb1864d10875d5db5e62 100644 --- a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc @@ -30,10 +30,12 @@ using platform::to_void_cast; static void EnforceLayouts(const std::vector inputs) { for (auto* input : inputs) { - PADDLE_ENFORCE_EQ(input->layout(), DataLayout::kMKLDNN, - "Wrong layout set for Input tensor"); - PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::undef, - "Wrong format set for Input tensor"); + PADDLE_ENFORCE_EQ( + input->layout(), DataLayout::kMKLDNN, + platform::errors::InvalidArgument("Wrong layout set for Input tensor")); + PADDLE_ENFORCE_NE( + input->format(), MKLDNNMemoryFormat::undef, + platform::errors::InvalidArgument("Wrong format set for Input tensor")); } } @@ -49,7 +51,7 @@ static 
platform::CPUPlace GetCpuPlace( const paddle::framework::ExecutionContext& ctx) { auto place = ctx.GetPlace(); PADDLE_ENFORCE(paddle::platform::is_cpu_place(place), - "It must use CPUPlace."); + platform::errors::InvalidArgument("It must use CPUPlace.")); return BOOST_GET_CONST(platform::CPUPlace, place); } @@ -84,8 +86,10 @@ class ConcatPrimitiveFactory { concat CreateConcatPrimitive(const concat::primitive_desc& concat_pd, Tensor* output, platform::CPUPlace place, const mkldnn::engine& mkldnn_engine) { - dst_mem = mkldnn::memory(concat_pd.dst_desc(), mkldnn_engine, - output->mutable_data(place)); + dst_mem = mkldnn::memory( + concat_pd.dst_desc(), mkldnn_engine, + output->mutable_data(place, concat_pd.dst_desc().get_size())); + return concat(concat_pd); } @@ -191,7 +195,9 @@ class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel { prim_creator.SetSrcDataHandleByIndex( *srcs, i, to_void_cast(multi_input[i]->data())); } - prim_creator.SetDstDataHandle(*dst_mem, output->mutable_data(place)); + prim_creator.SetDstDataHandle( + *dst_mem, + output->mutable_data(place, concat_pd->dst_desc().get_size())); } mkldnn::stream astream(mkldnn_engine); diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index 7d99bb7d2b7a7049c67788df4c507afc14880815..19ee8764e27b235a2fa8e0720c11bce601b030db 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -561,7 +561,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { PADDLE_ENFORCE_EQ( !fuse_residual_conn || !force_fp32_output, true, - "residual fusion does not support force output with fp32"); + platform::errors::Unimplemented( + "residual fusion does not support force output with fp32")); auto* bias = ctx.HasInput("Bias") ? ctx.Input("Bias") : nullptr; @@ -625,7 +626,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { ? 
dilations.size() == 3 && dilations[0] == 1 && dilations[1] == 1 && dilations[2] == 1 : dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1, - true, "dilation in convolution is not implemented yet"); + true, platform::errors::Unimplemented( + "dilation in convolution is not implemented yet")); const K* filter_data = filter->data(); auto scale_in_data = ctx.Attr("Scale_in"); @@ -887,7 +889,8 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { "The output_grad tensor's layout should be %d, but got %d.", DataLayout::kMKLDNN, output_grad->layout())); PADDLE_ENFORCE_NE(output_grad->format(), MKLDNNMemoryFormat::undef, - "Wrong format set for output_grad tensor"); + platform::errors::InvalidArgument( + "Wrong format set for output_grad tensor")); PADDLE_ENFORCE_EQ( ctx.Attr("is_test"), false, @@ -1052,7 +1055,11 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { astream.wait(); filter_grad->set_layout(DataLayout::kMKLDNN); - filter_grad->set_format(GetMKLDNNFormat(*diff_weights_memory_p)); + // in OneDNN groups in convolution are treated as separate dimension + // which is not the case in paddlepaddle + auto filter_fmt = GetMKLDNNFormat(*diff_weights_memory_p); + filter_grad->set_format(platform::MKLDNNFormatForSize( + g > 1 ? 
weights_tz.size() - 1 : weights_tz.size(), filter_fmt)); } if (input_grad) { auto weights_memory_p = handler.AcquireWeightsMemoryFromDataPrimitive( diff --git a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc index 48279658c80e93428f940c40e61d7b9af23f4ee3..56537900216a8a4e4e96791123c7d50da621ab62 100644 --- a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc @@ -117,7 +117,8 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel { PADDLE_ENFORCE( dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1, - "dilation in convolution is not implemented yet"); + platform::errors::Unimplemented( + "dilation in convolution is not implemented yet")); const T* input_data = input->data(); const T* filter_data = filter->data(); diff --git a/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc index 98200caca8cf66960632b88966f23e99fcd4c299..51fa5ad021a2b284cd75f297d83326b2102c1e41 100644 --- a/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc @@ -30,8 +30,7 @@ class GaussianMKLDNNKernel : public paddle::framework::OpKernel { float std = context.Attr("std"); auto* tensor = context.Output("Out"); - const std::string op_type = "gaussian_random"; - auto shape = GetShape(context, op_type); + auto shape = GetShape(context); tensor->Resize(shape); T* data = tensor->mutable_data(context.GetPlace()); int64_t size = tensor->numel(); diff --git a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc index 2a8b332521804ccebdbd4e6914b2763abfb5dbdc..9df30b3295c00e69a956ee84770dfeb19a83487c 100644 --- a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc @@ -83,19 +83,24 @@ class 
PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { const Tensor* out_grad = ctx.Input(framework::GradVarName("Out")); Tensor* in_x_grad = ctx.Output(framework::GradVarName("X")); - PADDLE_ENFORCE_EQ(in_x->layout(), DataLayout::kMKLDNN, - "Wrong layout set for Input tensor"); - PADDLE_ENFORCE_NE(in_x->format(), MKLDNNMemoryFormat::undef, - "Wrong format set for Input tensor"); + PADDLE_ENFORCE_EQ( + in_x->layout(), DataLayout::kMKLDNN, + platform::errors::InvalidArgument("Wrong layout set for Input tensor")); + PADDLE_ENFORCE_NE( + in_x->format(), MKLDNNMemoryFormat::undef, + platform::errors::InvalidArgument("Wrong format set for Input tensor")); PADDLE_ENFORCE_EQ(out_grad->layout(), DataLayout::kMKLDNN, - "Wrong layout set for Input output_grad tensor"); + platform::errors::InvalidArgument( + "Wrong layout set for Input output_grad tensor")); PADDLE_ENFORCE_NE(out_grad->format(), MKLDNNMemoryFormat::undef, - "Wrong format set for Input output_grad tensor"); + platform::errors::InvalidArgument( + "Wrong format set for Input output_grad tensor")); PADDLE_ENFORCE_EQ( ctx.Attr("is_test"), false, - "is_test attribute should be set to False in training phase."); + platform::errors::InvalidArgument( + "is_test attribute should be set to False in training phase.")); std::string pooling_type = ctx.Attr("pooling_type"); diff --git a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc index 29a86a35d7b26f41745907fb6bacf30506c027a0..a6c8f8656a4e252f1a1eedb6d67ca322f0747a66 100644 --- a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc @@ -48,6 +48,7 @@ class QuantOpKernel : public framework::OpKernel { const T* input_data = input->data(); bool is_negative = ctx.Attr("is_negative_input"); + bool bfloat16 = ctx.Attr("bfloat16"); std::string key = platform::CreateKey(platform::ThreadIDasStr(), src_tz, scale_data, is_negative, ctx.OutputName("Output")); @@ 
-74,7 +75,10 @@ class QuantOpKernel : public framework::OpKernel { src_md, engine, to_void_cast(input_data)); std::shared_ptr dst_md; - if (is_negative) { + if (bfloat16) { + platform::SetDstMemoryQuantized( + ctx, output, dst_tz, engine, dst_md, dst_memory, out_format); + } else if (is_negative) { platform::SetDstMemoryQuantized(ctx, output, dst_tz, engine, dst_md, dst_memory, out_format); } else { @@ -96,7 +100,11 @@ class QuantOpKernel : public framework::OpKernel { dst_memory = std::static_pointer_cast( dev_ctx.GetBlob(key_dst_mem)); auto place = ctx.GetPlace(); - if (is_negative) { + + if (bfloat16) { + dst_memory->set_data_handle( + output->mutable_data(place)); + } else if (is_negative) { dst_memory->set_data_handle(output->mutable_data(place)); } else { dst_memory->set_data_handle(output->mutable_data(place)); diff --git a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc index 4d825e4ee279bc2c505cfabff1917d1a5319d1dd..5014381a4e215917883f45288de4482db5cbf79c 100644 --- a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc @@ -140,7 +140,8 @@ class SoftmaxMKLDNNGradKernel : public paddle::framework::OpKernel { PADDLE_ENFORCE_EQ( dout->dims(), dx->dims(), - "The shape of softmax_grad's input and output must be identical."); + platform::errors::InvalidArgument( + "The shape of softmax_grad's input and output must be identical.")); auto dims = dout->dims(); // input and output share the same shape const int axis = CanonicalAxis(ctx.Attr("axis"), dims.size()); diff --git a/paddle/fluid/operators/mv_op.cc b/paddle/fluid/operators/mv_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..1339982adaab162056bdefd3eecb405e95188a0d --- /dev/null +++ b/paddle/fluid/operators/mv_op.cc @@ -0,0 +1,125 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/mv_op.h" +namespace paddle { +namespace operators { + +class MVOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "The matrix input of mv op"); + AddInput("Vec", "The vector input of mv op"); + AddOutput("Out", "The output of mv op"); + AddComment(R"DOC( +MV Operator. + +This operator is used to perform matrix vector multiplication +of the input tensors `X` and `Vec`. +)DOC"); + } +}; + +class MVOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *context) const override { + OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", "mv"); + OP_INOUT_CHECK(context->HasInput("Vec"), "Input", "Vec", "mv"); + OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", "mv"); + + auto dim_x = context->GetInputDim("X"); + auto dim_y = context->GetInputDim("Vec"); + PADDLE_ENFORCE_EQ( + dim_x.size(), 2, + platform::errors::InvalidArgument( + "The rank of input X should be 2, but is %d", dim_x.size())); + PADDLE_ENFORCE_EQ( + dim_y.size(), 1, + platform::errors::InvalidArgument( + "The rank of input Vec should be 1, but is %d", dim_y.size())); + PADDLE_ENFORCE_EQ(dim_x[1] == dim_y[0], true, + platform::errors::InvalidArgument( + "The length of input X' second dim should equal the " + "length of input Vec," + " but X[%d, %d], Vec[%d]", + 
dim_x[0], dim_x[1], dim_y[0])); + + framework::DDim dim_out = framework::make_ddim({dim_x[0]}); + + context->SetOutputDim("Out", dim_out); + context->ShareLoD("X", /*->*/ "Out"); + } +}; + +template +class MVOpGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr retv) const override { + retv->SetType("mv_grad"); + retv->SetInput("X", this->Input("X")); + retv->SetInput("Vec", this->Input("Vec")); + retv->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + retv->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + retv->SetOutput(framework::GradVarName("Vec"), this->InputGrad("Vec")); + } +}; + +class MVOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *context) const override { + OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", "mv"); + OP_INOUT_CHECK(context->HasInput("Vec"), "Input", "Vec", "mv"); + OP_INOUT_CHECK(context->HasInput(framework::GradVarName("Out")), "Input", + "Out@GRAD", "mv"); + auto x_dims = context->GetInputDim("X"); + auto vec_dims = context->GetInputDim("Vec"); + + auto x_grad_name = framework::GradVarName("X"); + auto vec_grad_name = framework::GradVarName("Vec"); + + if (context->HasOutput(x_grad_name)) { + context->SetOutputDim(x_grad_name, x_dims); + } + if (context->HasOutput(vec_grad_name)) { + context->SetOutputDim(vec_grad_name, vec_dims); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OPERATOR(mv, ops::MVOp, ops::MVOpMaker, + ops::MVOpGradMaker, + ops::MVOpGradMaker); +REGISTER_OPERATOR(mv_grad, ops::MVOpGrad); + +REGISTER_OP_CPU_KERNEL( + mv, ops::MVKernel, + ops::MVKernel); +REGISTER_OP_CPU_KERNEL( + mv_grad, ops::MVGradKernel, + ops::MVGradKernel); diff --git 
a/paddle/fluid/operators/mv_op.cu b/paddle/fluid/operators/mv_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..9a16fe025cd71457faade38f92f56e56c26b3b32 --- /dev/null +++ b/paddle/fluid/operators/mv_op.cu @@ -0,0 +1,95 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/mv_op.h" +#include "paddle/fluid/platform/gpu_launch_param_config.h" + +namespace paddle { +namespace operators { + +template +__global__ void MVGradCUDAKernel(const int m, const int n, const T *dout, + const T *vec, T *dx) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + for (; idx < m * n; idx += blockDim.x * gridDim.x) { + int i = idx / n; + int j = idx % n; + dx[idx] = dout[i] * vec[j]; + } +} + +// Using dimensional constraints on matrix multiplication, it is +// straight-forward to check the following table for when X and Y +// are both matrices. 
+// +// dX = | dOut Vec^T +// dVec = | X^T dOut +template +class MVGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *x = context.Input("X"); + auto *vec = context.Input("Vec"); + auto *dout = + context.Input(framework::GradVarName("Out")); + auto *dx = context.Output(framework::GradVarName("X")); + auto *dvec = + context.Output(framework::GradVarName("Vec")); + + auto dim_x = x->dims(); + int m = dim_x[0]; + int n = dim_x[1]; + + dx->Resize(framework::make_ddim({m * n})); + + // get data ptr + const T *x_data = x->data(); + const T *vec_data = vec->data(); + const T *dout_data = dout->data(); + + T *dx_data = dx->mutable_data(context.GetPlace()); + T *dvec_data = dvec->mutable_data(context.GetPlace()); + + auto &dev_ctx = + context.template device_context(); + auto blas = math::GetBlas(dev_ctx); + + // calculate dx + auto stream = context.cuda_device_context().stream(); + auto config = GetGpuLaunchConfig1D(dev_ctx, m * n); + MVGradCUDAKernel< + T><<>>( + m, n, dout_data, vec_data, dx_data); + + dx->Resize(framework::make_ddim({m, n})); + + // calculate dvec + blas.GEMV(true, dim_x[0], dim_x[1], static_cast(1), x_data, dout_data, + static_cast(0), dvec_data); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL( + mv, ops::MVKernel, + ops::MVKernel); +REGISTER_OP_CUDA_KERNEL( + mv_grad, ops::MVGradKernel, + ops::MVGradKernel); diff --git a/paddle/fluid/operators/mv_op.h b/paddle/fluid/operators/mv_op.h new file mode 100644 index 0000000000000000000000000000000000000000..3c63f3640ff46f5592a244a930a191a23959baf7 --- /dev/null +++ b/paddle/fluid/operators/mv_op.h @@ -0,0 +1,105 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/blas.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class MVKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *x = context.Input("X"); + auto *vec = context.Input("Vec"); + + auto *out = context.Output("Out"); + + auto dim_x = x->dims(); + + // get data ptr + const T *x_data = x->data(); + const T *vec_data = vec->data(); + T *out_data = out->mutable_data(context.GetPlace()); + + auto &dev_ctx = context.template device_context(); + auto blas = math::GetBlas(dev_ctx); + + blas.GEMV(false, dim_x[0], dim_x[1], static_cast(1), x_data, vec_data, + static_cast(0), out_data); + } +}; + +// Using dimensional constraints on matrix multiplication, it is +// straight-forward to check the following table for when X and Y +// are both matrices. 
+// +// dX = | dOut vec^T +// dVec = | X^T dOut +template +class MVGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *x = context.Input("X"); + auto *vec = context.Input("Vec"); + auto *dout = + context.Input(framework::GradVarName("Out")); + auto *dx = context.Output(framework::GradVarName("X")); + auto *dvec = + context.Output(framework::GradVarName("Vec")); + + auto dim_x = x->dims(); + int m = dim_x[0]; + int n = dim_x[1]; + + dx->Resize(framework::make_ddim({m * n})); + + // get data ptr + const T *x_data = x->data(); + const T *vec_data = vec->data(); + const T *dout_data = dout->data(); + + T *dx_data = dx->mutable_data(context.GetPlace()); + T *dvec_data = dvec->mutable_data(context.GetPlace()); + + auto &dev_ctx = context.template device_context(); + auto blas = math::GetBlas(dev_ctx); + + // calculate dx + for (int i = 0; i < m; ++i) { + for (int j = 0; j < n; ++j) + dx_data[i * n + j] = dout_data[i] * vec_data[j]; + } + + dx->Resize(framework::make_ddim({m, n})); + + // calculate dvec + blas.GEMV(true, dim_x[0], dim_x[1], static_cast(1), x_data, dout_data, + static_cast(0), dvec_data); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/norm_utils.cu.h b/paddle/fluid/operators/norm_utils.cu.h new file mode 100644 index 0000000000000000000000000000000000000000..07333f1ae11c3889b543ca6d327e480607a4bcea --- /dev/null +++ b/paddle/fluid/operators/norm_utils.cu.h @@ -0,0 +1,486 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include +#include +#include "cub/cub.cuh" +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/cudnn_helper.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using DataLayout = framework::DataLayout; + +// math: dx = scale * ((x - mean) * inv_var / NxHxW * (np.mean(ddx, +// axis=(n,h,w)) * +// np.sum(dy, axis=(n,h,w)) - +// np.sum(dy * ddx, axis=(n,h,w)) + 3 * np.mean(dy * (x - +// mean), +// axis=(n,h,w)) * inv_var.pow(2) * +// np.sum(ddx * (x - mean), axis=(n,h,w))) + inv_var.pow(3) / +// NxHxW * +// np.sum(ddx * (x - mean)) * +// (np.mean(dy, axis=(n,h,w)) - dy) + inv_var.pow(3) / NxHxW * +// np.sum(dy, +// axis=(n,h,w)) * (x - mean) * +// (np.mean(ddx, axis=(n,h,w)) - ddx) + ddr * (dy * inv_var - +// inv_var +// * +// np.mean(dy, axis=(n,h,w)) - +// inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean), +// axis=(n,h,w)))) + +template +__global__ void DoubleGradComputeDX(const T *x, const T *mean, + const T *variance, const T *ddx, + const T *dy, const T *scale, + const T *ddscale, const int N, const int C, + const int sample_size, const double epsilon, + T *dx) { + const int outer_size = C; + const int inner_size = N * sample_size; + + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage dy_storage; + __shared__ typename BlockReduce::TempStorage ddx_storage; + __shared__ typename BlockReduce::TempStorage dy_mul_ddx_storage; + __shared__ typename BlockReduce::TempStorage 
dy_mul_x_sub_mean_storage; + __shared__ typename BlockReduce::TempStorage ddx_mul_x_sub_mean_storage; + __shared__ T dy_sum_val; + __shared__ T ddx_sum_val; + __shared__ T dy_mul_ddx_sum_val; + __shared__ T dy_mul_x_sub_mean_sum_val; + __shared__ T ddx_mul_x_sub_mean_sum_val; + + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + T mean_val = mean[i]; + T var_val = variance[i]; + T dy_sum = 0; + T ddx_sum = 0; + T dy_mul_ddx_sum = 0; + T dy_mul_x_sub_mean_sum = 0; + T ddx_mul_x_sub_mean_sum = 0; + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = + layout == framework::DataLayout::kNCHW + ? (j / sample_size * C + i) * sample_size + j % sample_size + : j * outer_size + i; + T ddx_i = ddx[index]; + T dy_i = dy[index]; + T tmp = x[index] - mean_val; + + dy_sum += dy_i; + ddx_sum += ddx_i; + dy_mul_ddx_sum += (ddx_i * dy_i); + + dy_mul_x_sub_mean_sum += (dy_i * tmp); + ddx_mul_x_sub_mean_sum += (ddx_i * tmp); + } + + dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum()); + ddx_sum = BlockReduce(ddx_storage).Reduce(ddx_sum, cub::Sum()); + dy_mul_ddx_sum = + BlockReduce(dy_mul_ddx_storage).Reduce(dy_mul_ddx_sum, cub::Sum()); + dy_mul_x_sub_mean_sum = BlockReduce(dy_mul_x_sub_mean_storage) + .Reduce(dy_mul_x_sub_mean_sum, cub::Sum()); + ddx_mul_x_sub_mean_sum = BlockReduce(ddx_mul_x_sub_mean_storage) + .Reduce(ddx_mul_x_sub_mean_sum, cub::Sum()); + + if (threadIdx.x == 0) { + dy_sum_val = dy_sum; + ddx_sum_val = ddx_sum; + dy_mul_ddx_sum_val = dy_mul_ddx_sum; + dy_mul_x_sub_mean_sum_val = dy_mul_x_sub_mean_sum; + ddx_mul_x_sub_mean_sum_val = ddx_mul_x_sub_mean_sum; + } + __syncthreads(); + + if (ddx != nullptr) { + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = + layout == framework::DataLayout::kNCHW + ? 
(j / sample_size * C + i) * sample_size + j % sample_size + : j * outer_size + i; + dx[index] += + ((x[index] - mean_val) * var_val * var_val * var_val / inner_size * + (ddx_sum_val * dy_sum_val / inner_size - dy_mul_ddx_sum_val + + 3. * dy_mul_x_sub_mean_sum_val * var_val * + ddx_mul_x_sub_mean_sum_val * var_val / inner_size) + + ddx_mul_x_sub_mean_sum_val * var_val / inner_size * var_val * + var_val * (dy_sum_val / inner_size - dy[index]) + + dy_mul_x_sub_mean_sum_val * var_val / inner_size * var_val * + var_val * (ddx_sum_val / inner_size - ddx[index])) * + scale[i]; + } + } + __syncthreads(); + if (ddscale != nullptr) { + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = + layout == framework::DataLayout::kNCHW + ? (j / sample_size * C + i) * sample_size + j % sample_size + : j * outer_size + i; + dx[index] += (dy[index] * var_val - dy_sum_val / inner_size * var_val - + (x[index] - mean_val) * var_val * + dy_mul_x_sub_mean_sum_val * var_val / inner_size) * + ddscale[i]; + } + } + } +} + +// math: ddy = (x - mean) * inv_var * ddscale + ddbias + +// scale * inv_var * (ddx - (x - mean) * inv_var.pow(2) * +// np.mean(ddx * (x - mean), axis=(n,h,w))) +template +__global__ void DoubleGradComputeDDY(const T *x, const T *mean, + const T *variance, const T *ddscale, + const T *ddbias, const T *ddx, + const T *scale, const int N, const int C, + const int sample_size, + const double epsilon, T *ddy) { + const int outer_size = C; + const int inner_size = N * sample_size; + + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage ddx_storage; + __shared__ typename BlockReduce::TempStorage ddx_mul_x_sub_mean_storage; + __shared__ T ddx_sum_val; + __shared__ T ddx_mul_x_sub_mean_sum_val; + + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + T mean_val = mean[i]; + T var_val = variance[i]; + T ddx_sum = 0; + T ddx_mul_x_sub_mean_sum = 0; + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + 
const int index = + layout == framework::DataLayout::kNCHW + ? (j / sample_size * C + i) * sample_size + j % sample_size + : j * outer_size + i; + T ddx_i = ddx[index]; + ddx_sum += ddx_i; + ddx_mul_x_sub_mean_sum += (ddx_i * (x[index] - mean_val)); + } + ddx_sum = BlockReduce(ddx_storage).Reduce(ddx_sum, cub::Sum()); + ddx_mul_x_sub_mean_sum = BlockReduce(ddx_mul_x_sub_mean_storage) + .Reduce(ddx_mul_x_sub_mean_sum, cub::Sum()); + + if (threadIdx.x == 0) { + ddx_sum_val = ddx_sum; + ddx_mul_x_sub_mean_sum_val = ddx_mul_x_sub_mean_sum; + } + __syncthreads(); + + if (ddx != nullptr) { + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = + layout == framework::DataLayout::kNCHW + ? (j / sample_size * C + i) * sample_size + j % sample_size + : j * outer_size + i; + ddy[index] += scale[i] * var_val * + (ddx[index] - ddx_sum_val / inner_size - + (x[index] - mean_val) * var_val * + ddx_mul_x_sub_mean_sum_val * var_val / inner_size); + } + } + __syncthreads(); + if (ddscale != nullptr) { + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = + layout == framework::DataLayout::kNCHW + ? (j / sample_size * C + i) * sample_size + j % sample_size + : j * outer_size + i; + ddy[index] += (x[index] - mean_val) * var_val * ddscale[i]; + } + } + __syncthreads(); + if (ddbias != nullptr) { + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = + layout == framework::DataLayout::kNCHW + ? 
(j / sample_size * C + i) * sample_size + j % sample_size + : j * outer_size + i; + ddy[index] += ddbias[i]; + } + } + } +} + +// math: dscale = inv_var * (dy - np.mean(dy, axis=(n,h,w) - (x-mean) * +// inv_var.pow(2) * np.mean(dy * (x-mean), axis=(n,h,w)))) * +// ddx +template +__global__ void DoubleGradComputeDScale(const T *x, const T *mean, + const T *variance, const T *ddx, + const T *dy, const int N, const int C, + const int sample_size, + const double epsilon, T *dscale) { + const int outer_size = C; + const int inner_size = N * sample_size; + + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage dy_storage; + __shared__ typename BlockReduce::TempStorage dy_mul_x_sub_mean_storage; + __shared__ typename BlockReduce::TempStorage dscale_tmp_storage; + __shared__ T dy_sum_val; + __shared__ T dy_mul_x_sub_mean_sum_val; + + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + T dy_sum = 0; + T dy_mul_x_sub_mean_sum = 0; + T mean_val = mean[i]; + T var_val = variance[i]; + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = + layout == framework::DataLayout::kNCHW + ? (j / sample_size * C + i) * sample_size + j % sample_size + : j * outer_size + i; + T dy_i = dy[index]; + dy_sum += dy_i; + dy_mul_x_sub_mean_sum += (dy_i * (x[index] - mean_val)); + } + dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum()); + dy_mul_x_sub_mean_sum = BlockReduce(dy_mul_x_sub_mean_storage) + .Reduce(dy_mul_x_sub_mean_sum, cub::Sum()); + + if (threadIdx.x == 0) { + dy_sum_val = dy_sum; + dy_mul_x_sub_mean_sum_val = dy_mul_x_sub_mean_sum; + } + __syncthreads(); + + if (ddx != nullptr) { + T dscale_tmp = 0; + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = + layout == framework::DataLayout::kNCHW + ? 
(j / sample_size * C + i) * sample_size + j % sample_size + : j * outer_size + i; + dscale_tmp += ddx[index] * var_val * + (dy[index] - dy_sum_val / inner_size - + dy_mul_x_sub_mean_sum_val * (x[index] - mean_val) * + var_val * var_val / inner_size); + } + dscale_tmp = + BlockReduce(dscale_tmp_storage).Reduce(dscale_tmp, cub::Sum()); + + if (threadIdx.x == 0) { + dscale[i] += dscale_tmp; + } + __syncthreads(); + } + } +} + +// math: dscale = np.sum(ddx * dy, axis=(n,h,w)) * inv_var +template +__global__ void DoubleGradComputeDScaleWithGlobal( + const T *ddx, const T *variance, const T *dy, const double epsilon, + const int N, const int C, const int sample_size, T *dscale) { + int outer_size = C; + int inner_size = N * sample_size; + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage ddx_mul_dy_storage; + __shared__ T ddx_mul_dy_sum_val; + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + T inv_var_i = 1.0 / sqrt(variance[i] + epsilon); + T ddx_mul_dy_sum = 0; + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = + layout == framework::DataLayout::kNCHW + ? 
(j / sample_size * C + i) * sample_size + j % sample_size + : j * outer_size + i; + T ddx_i = ddx[index]; + T dy_i = dy[index]; + ddx_mul_dy_sum += (ddx_i * dy_i); + } + ddx_mul_dy_sum = + BlockReduce(ddx_mul_dy_storage).Reduce(ddx_mul_dy_sum, cub::Sum()); + if (threadIdx.x == 0) { + ddx_mul_dy_sum_val = ddx_mul_dy_sum; + } + __syncthreads(); + + if (ddx != nullptr) { + dscale[i] = inv_var_i * ddx_mul_dy_sum_val; + } + } +} + +// math: dx = ddscale * dy * inv_var +// math: ddy = scale * ddx * inv_var +template +__global__ void DoubleGradComputeDataWithGlobal( + const T *dy, const T *scale, const T *variance, const double epsilon, + const int C, const int sample_size, const int num, T *dx) { + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + if (scale != nullptr) { + for (int i = gid; i < num; i += stride) { + const int c = + layout == framework::DataLayout::kNCHW ? i / sample_size % C : i % C; + T inv_var = 1.0 / sqrt(variance[c] + epsilon); + dx[i] = dy[i] * scale[c] * inv_var; + } + } +} + +template +void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, + const DataLayout data_layout, const Tensor *X, + const Tensor *Scale, const Tensor *dY, + const Tensor *Saved_mean, + const Tensor *Saved_variance, const double epsilon, + const bool use_global_stats, const Tensor *ddX, + const Tensor *ddScale, const Tensor *ddBias, + Tensor *dX, Tensor *dScale, Tensor *ddY) { + const T *x_data = X->data(); + const T *dy_data = dY->data(); + const T *ddx_data = (ddX == nullptr ? nullptr : ddX->data()); + + const T *ddscale_data = (ddScale == nullptr ? nullptr : ddScale->data()); + const T *ddbias_data = (ddBias == nullptr ? nullptr : ddBias->data()); + + auto &dev_ctx = ctx.template device_context(); + math::SetConstant set_constant; + + auto &x_dims = X->dims(); + const int C = (data_layout == DataLayout::kNCHW ? 
x_dims[1] + : x_dims[x_dims.size() - 1]); + const int N = x_dims[0]; + const int num = X->numel(); + const int sample_size = num / N / C; + Tensor scale_tmp; + if (!Scale) { + scale_tmp.mutable_data({C}, ctx.GetPlace()); + set_constant(dev_ctx, &scale_tmp, static_cast(1)); + } + const T *scale_data = Scale ? Scale->data() : scale_tmp.data(); + + const int block = 512; + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_threads / block, 1); + int grid = std::min(C, max_blocks); + int grid1 = (num + block - 1) / block; + + const T *mean_data, *variance_data; + if (use_global_stats) { + const auto *running_var = ctx.Input("Variance"); + const auto *running_var_data = running_var->template data(); + variance_data = running_var_data; + } else { + const T *smean_data = Saved_mean->data(); + const T *svariance_data = Saved_variance->data(); + mean_data = smean_data; + variance_data = svariance_data; + } + + if (dX) { + T *dx_data = dX->mutable_data(ctx.GetPlace()); + set_constant(dev_ctx, dX, static_cast(0)); + if (use_global_stats) { + if (data_layout == DataLayout::kNHWC) { + DoubleGradComputeDataWithGlobal< + T, DataLayout::kNHWC><<>>( + dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num, + dx_data); + } else { + DoubleGradComputeDataWithGlobal< + T, DataLayout::kNCHW><<>>( + dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num, + dx_data); + } + } else { + if (data_layout == DataLayout::kNHWC) { + DoubleGradComputeDX< + T, block, DataLayout::kNHWC><<>>( + x_data, mean_data, variance_data, ddx_data, dy_data, scale_data, + ddscale_data, N, C, sample_size, epsilon, dx_data); + } else { + DoubleGradComputeDX< + T, block, DataLayout::kNCHW><<>>( + x_data, mean_data, variance_data, ddx_data, dy_data, scale_data, + ddscale_data, N, C, sample_size, epsilon, dx_data); + } + } + } + if (dScale) { + T *dscale_data = dScale->mutable_data(ctx.GetPlace()); + set_constant(dev_ctx, dScale, static_cast(0)); + if 
(use_global_stats) { + if (data_layout == DataLayout::kNHWC) { + DoubleGradComputeDScaleWithGlobal< + T, block, DataLayout::kNHWC><<>>( + ddx_data, variance_data, dy_data, epsilon, N, C, sample_size, + dscale_data); + } else { + DoubleGradComputeDScaleWithGlobal< + T, block, DataLayout::kNCHW><<>>( + ddx_data, variance_data, dy_data, epsilon, N, C, sample_size, + dscale_data); + } + } else { + if (data_layout == DataLayout::kNHWC) { + DoubleGradComputeDScale< + T, block, DataLayout::kNHWC><<>>( + x_data, mean_data, variance_data, ddx_data, dy_data, N, C, + sample_size, epsilon, dscale_data); + } else { + DoubleGradComputeDScale< + T, block, DataLayout::kNCHW><<>>( + x_data, mean_data, variance_data, ddx_data, dy_data, N, C, + sample_size, epsilon, dscale_data); + } + } + } + if (ddY) { + T *ddy_data = ddY->mutable_data(ctx.GetPlace()); + set_constant(dev_ctx, ddY, static_cast(0)); + if (use_global_stats) { + if (data_layout == DataLayout::kNHWC) { + DoubleGradComputeDataWithGlobal< + T, DataLayout::kNHWC><<>>( + ddx_data, scale_data, variance_data, epsilon, C, sample_size, num, + ddy_data); + } else { + DoubleGradComputeDataWithGlobal< + T, DataLayout::kNCHW><<>>( + ddx_data, scale_data, variance_data, epsilon, C, sample_size, num, + ddy_data); + } + } else { + if (data_layout == DataLayout::kNHWC) { + DoubleGradComputeDDY< + T, block, DataLayout::kNHWC><<>>( + x_data, mean_data, variance_data, ddscale_data, ddbias_data, + ddx_data, scale_data, N, C, sample_size, epsilon, ddy_data); + } else { + DoubleGradComputeDDY< + T, block, DataLayout::kNCHW><<>>( + x_data, mean_data, variance_data, ddscale_data, ddbias_data, + ddx_data, scale_data, N, C, sample_size, epsilon, ddy_data); + } + } + } +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc b/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc index 5c6c38da92808f05c90e7dad2482e7c7364a1f80..eb41d21e09218b203f887d8fd812d46dc8367c71 100644 --- 
a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc +++ b/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc @@ -23,46 +23,54 @@ class DecayedAdagradOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("Param"), - "Input(Param) of DecayedAdagradOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Grad"), - "Input(Grad) of DecayedAdagradOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Moment"), - "Input(Moment) of DecayedAdagradOp should not be null."); - PADDLE_ENFORCE( - ctx->HasInput("LearningRate"), - "Input(LearningRate) of DecayedAdagradOp should not be null."); - PADDLE_ENFORCE( - ctx->GetInputsVarType("Param").front() == - framework::proto::VarType::LOD_TENSOR, - "The input var's type should be LoDTensor, but the received is %s", - ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front()); - PADDLE_ENFORCE( - ctx->GetInputsVarType("Grad").front() == - framework::proto::VarType::LOD_TENSOR, - "The input var's type should be LoDTensor, but the received is %s", - ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front()); - - PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), - "Output(ParamOut) of DecayedAdagradOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("MomentOut"), - "Output(MomentOut) of DecayedAdagradOp should not be null."); + OP_INOUT_CHECK(ctx->HasInput("Param"), "Input", "Param", + "DecayedAdagradOp"); + OP_INOUT_CHECK(ctx->HasInput("Grad"), "Input", "Grad", "DecayedAdagradOp"); + OP_INOUT_CHECK(ctx->HasInput("Moment"), "Input", "Moment", + "DecayedAdagradOp"); + OP_INOUT_CHECK(ctx->HasInput("LearningRate"), "Input", "LearningRate", + "DecayedAdagradOp"); + PADDLE_ENFORCE_EQ( + ctx->GetInputsVarType("Param").front(), + framework::proto::VarType::LOD_TENSOR, + platform::errors::InvalidArgument( + "The input var's type should be LoDTensor, but the received 
is %s", + ctx->Inputs("Param").front(), + ctx->GetInputsVarType("Param").front())); + PADDLE_ENFORCE_EQ( + ctx->GetInputsVarType("Grad").front(), + framework::proto::VarType::LOD_TENSOR, + platform::errors::InvalidArgument( + "The input var's type should be LoDTensor, but the received is %s", + ctx->Inputs("Grad").front(), + ctx->GetInputsVarType("Grad").front())); + + OP_INOUT_CHECK(ctx->HasOutput("ParamOut"), "Output", "ParamOut", + "DecayedAdagradOp"); + OP_INOUT_CHECK(ctx->HasOutput("MomentOut"), "Output", "MomentOut", + "DecayedAdagradOp"); auto lr_dims = ctx->GetInputDim("LearningRate"); PADDLE_ENFORCE_NE(framework::product(lr_dims), 0, - "Maybe the Input variable LearningRate has not " - "been initialized. You may need to confirm " - "if you put exe.run(startup_program) " - "after optimizer.minimize function."); + platform::errors::InvalidArgument( + "Maybe the Input variable LearningRate has not " + "been initialized. You may need to confirm " + "if you put exe.run(startup_program) " + "after optimizer.minimize function.")); PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, - "LearningRate should have one element"); + platform::errors::InvalidArgument( + "LearningRate should have one element")); auto param_dims = ctx->GetInputDim("Param"); - PADDLE_ENFORCE_EQ(param_dims, ctx->GetInputDim("Grad"), - "Param and Grad input of DecayedAdagradOp should have " - "the same dimension."); - PADDLE_ENFORCE_EQ(param_dims, ctx->GetInputDim("Moment"), - "Param and Moment input of DecayedAdagradOp should have " - "the same dimension."); + PADDLE_ENFORCE_EQ( + param_dims, ctx->GetInputDim("Grad"), + platform::errors::InvalidArgument( + "Param and Grad input of DecayedAdagradOp should have " + "the same dimension.")); + PADDLE_ENFORCE_EQ( + param_dims, ctx->GetInputDim("Moment"), + platform::errors::InvalidArgument( + "Param and Moment input of DecayedAdagradOp should have " + "the same dimension.")); ctx->SetOutputDim("ParamOut", param_dims); ctx->SetOutputDim("MomentOut", 
param_dims); diff --git a/paddle/fluid/operators/optimizers/decayed_adagrad_op.h b/paddle/fluid/operators/optimizers/decayed_adagrad_op.h index 279edfb015c26848d4078975a40bdca650bdc6a0..f264ebf8a32636a1e2076f8721b3c95d65f5382b 100644 --- a/paddle/fluid/operators/optimizers/decayed_adagrad_op.h +++ b/paddle/fluid/operators/optimizers/decayed_adagrad_op.h @@ -24,17 +24,19 @@ class DecayedAdagradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { const auto* param_var = ctx.InputVar("Param"); - PADDLE_ENFORCE(param_var->IsType(), - "The Var(%s)'s type should be LoDTensor, " - "but the received is %s", - ctx.InputNames("Param").front(), - framework::ToTypeName(param_var->Type())); + PADDLE_ENFORCE_EQ(param_var->IsType(), true, + platform::errors::InvalidArgument( + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.InputNames("Param").front(), + framework::ToTypeName(param_var->Type()))); const auto* grad_var = ctx.InputVar("Grad"); - PADDLE_ENFORCE(grad_var->IsType(), - "The Var(%s)'s type should be LoDTensor, " - "but the received is %s", - ctx.InputNames("Grad").front(), - framework::ToTypeName(grad_var->Type())); + PADDLE_ENFORCE_EQ(grad_var->IsType(), true, + platform::errors::InvalidArgument( + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.InputNames("Grad").front(), + framework::ToTypeName(grad_var->Type()))); auto param_out_tensor = ctx.Output("ParamOut"); auto moment_out_tensor = ctx.Output("MomentOut"); diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.cc b/paddle/fluid/operators/optimizers/lars_momentum_op.cc old mode 100644 new mode 100755 index 5f0500d2faa77f7c2e901c0d30ab2c42036d2a86..479f9643749d63c673158ad055409a0925f3d576 --- a/paddle/fluid/operators/optimizers/lars_momentum_op.cc +++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cc @@ -48,6 +48,9 @@ class LarsMomentumOpMaker : public 
framework::OpProtoAndCheckerMaker { AddAttr("lars_weight_decay", "(float, default 0.0005) LARS weight decay") .SetDefault(0.0005); + AddAttr("epsilon", + "(float, default 0.0) epsilon to avoid Division by Zero.") + .SetDefault(0.0); AddComment(R"DOC( Lars Momentum Optimizer. diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.cu b/paddle/fluid/operators/optimizers/lars_momentum_op.cu index 1dace4ed6ab3e17b348035e34f6d9ea6d31edae9..eb0111ae4de2f066359e26406f6c7ec3eb54d5fc 100644 --- a/paddle/fluid/operators/optimizers/lars_momentum_op.cu +++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cu @@ -23,14 +23,16 @@ __global__ void MomentumLarsKernel(const T* p, const T* g, const T* v, const T* learning_rate, const T mu, const int64_t num, const T lars_coeff, const T lars_weight_decay, const T* p_norm, - const T* g_norm, T* p_out, T* v_out) { + const T* g_norm, T* p_out, T* v_out, + const T epsilon) { T lr = learning_rate[0]; T local_lr = learning_rate[0]; CUDA_KERNEL_LOOP(i, num) { - if (p_norm[0] > 0 && g_norm[0] > 0) { + if (lars_weight_decay > 0 && p_norm[0] > 0 && g_norm[0] > 0) { local_lr = lr * lars_coeff * p_norm[0] / - (g_norm[0] + lars_weight_decay * p_norm[0]); + (g_norm[0] + lars_weight_decay * p_norm[0] + epsilon); } + T v_new = v[i] * mu + local_lr * (g[i] + lars_weight_decay * p[i]); v_out[i] = v_new; p_out[i] = p[i] - v_new; @@ -54,6 +56,7 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel { T mu = static_cast(ctx.Attr("mu")); T lars_coeff = ctx.Attr("lars_coeff"); T lars_weight_decay = ctx.Attr("lars_weight_decay"); + T epsilon = ctx.Attr("epsilon"); auto* p = param->data(); auto* v = velocity->data(); @@ -79,7 +82,7 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel { eg_norm.device(*place) = eigen_g.square().sum().sqrt(); MomentumLarsKernel<<>>( p, g, v, lr, mu, param->numel(), lars_coeff, lars_weight_decay, - p_norm_data, g_norm_data, p_out, v_out); + p_norm_data, g_norm_data, p_out, v_out, epsilon); } }; diff 
--git a/paddle/fluid/operators/optimizers/lars_momentum_op.h b/paddle/fluid/operators/optimizers/lars_momentum_op.h old mode 100644 new mode 100755 index e0064c201825b1f074eb53c591dc3abdd7bc1e1b..55775bc08fb5ebc31cd231b8088a9798561fabfc --- a/paddle/fluid/operators/optimizers/lars_momentum_op.h +++ b/paddle/fluid/operators/optimizers/lars_momentum_op.h @@ -30,7 +30,12 @@ class LarsMomentumOpKernel : public framework::OpKernel { auto learning_rate = ctx.Input("LearningRate"); auto* grad_var = ctx.InputVar("Grad"); // only support dense for now. - PADDLE_ENFORCE_EQ(grad_var->IsType(), true); + PADDLE_ENFORCE_EQ(grad_var->IsType(), true, + platform::errors::InvalidArgument( + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.InputNames("Grad").front(), + framework::ToTypeName(grad_var->Type()))); auto grad = ctx.Input("Grad"); param_out->mutable_data(ctx.GetPlace()); @@ -39,6 +44,7 @@ class LarsMomentumOpKernel : public framework::OpKernel { T mu = static_cast(ctx.Attr("mu")); T lars_coeff = ctx.Attr("lars_coeff"); T lars_weight_decay = ctx.Attr("lars_weight_decay"); + T epsilon = ctx.Attr("epsilon"); auto p_out = framework::EigenVector::Flatten(*param_out); auto v_out = framework::EigenVector::Flatten(*velocity_out); @@ -59,9 +65,9 @@ class LarsMomentumOpKernel : public framework::OpKernel { ep_norm = p.square().sum().sqrt(); eg_norm = g.square().sum().sqrt(); T local_lr = lr[0]; - if (ep_norm(0) > 0 && eg_norm(0) > 0) { + if (lars_weight_decay > 0 && ep_norm(0) > 0 && eg_norm(0) > 0) { local_lr = lr[0] * lars_coeff * ep_norm(0) / - (eg_norm(0) + lars_weight_decay * ep_norm(0)); + (eg_norm(0) + lars_weight_decay * ep_norm(0) + epsilon); } v_out = v * mu + local_lr * (g + lars_weight_decay * p); p_out = p - v_out; diff --git a/paddle/fluid/operators/optimizers/rmsprop_op.cc b/paddle/fluid/operators/optimizers/rmsprop_op.cc index 99d1156ee6d5fc88161e25bfa581a265707e6f92..eeee008cdc53c457146074060d526d8d0e8b43aa 100644 --- 
a/paddle/fluid/operators/optimizers/rmsprop_op.cc +++ b/paddle/fluid/operators/optimizers/rmsprop_op.cc @@ -143,4 +143,5 @@ http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf) namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(rmsprop, ops::RmspropOp, ops::RmspropOpMaker); REGISTER_OP_CPU_KERNEL( - rmsprop, ops::RmspropOpKernel); + rmsprop, ops::RmspropOpKernel, + ops::RmspropOpKernel); diff --git a/paddle/fluid/operators/optimizers/rmsprop_op.cu b/paddle/fluid/operators/optimizers/rmsprop_op.cu index 8b17d6a0204045a9b20adb79dbad72dff5ba267e..bf11ee686757c6c5e54e05f055eaa19f6553f915 100644 --- a/paddle/fluid/operators/optimizers/rmsprop_op.cu +++ b/paddle/fluid/operators/optimizers/rmsprop_op.cu @@ -15,4 +15,5 @@ limitations under the License. */ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( - rmsprop, ops::RmspropOpKernel); + rmsprop, ops::RmspropOpKernel, + ops::RmspropOpKernel); diff --git a/paddle/fluid/operators/p_norm_op.cc b/paddle/fluid/operators/p_norm_op.cc index 59035d5a8ca5d4214f1370e1b14b2be9b234fa6a..cd7a8c6d24eaaca096b630b74dd7cd1bb9d35d09 100644 --- a/paddle/fluid/operators/p_norm_op.cc +++ b/paddle/fluid/operators/p_norm_op.cc @@ -105,6 +105,12 @@ class PnormOp : public framework::OperatorWithKernel { bool asvector = ctx->Attrs().Get("asvector"); if (asvector) { reduce_dims.emplace_back(1); + if (keepdim) { + for (int i = 1; i < x_dim.size(); ++i) { + reduce_dims.emplace_back(1); + } + x_dim = framework::make_ddim(reduce_dims); + } } else { if (axis < 0) axis = x_dim.size() + axis; for (int i = 0; i < x_dim.size(); ++i) { diff --git a/paddle/fluid/operators/quantize_op.cc b/paddle/fluid/operators/quantize_op.cc index 8924e21b46f49b0fd0ec72e6acc7463d7d574d6f..602fdc6ff67787ace488379a2730dad4b8ffe1b1 100644 --- a/paddle/fluid/operators/quantize_op.cc +++ b/paddle/fluid/operators/quantize_op.cc @@ -40,6 +40,8 @@ void QuantOpMaker::Make() { AddAttr("output_format", "Convert format to NHWC or NCHW during 
quantization.") .SetDefault("NHWC"); + AddAttr("bfloat16", "(bool, default false) Convert to bfloat16") + .SetDefault(false); AddComment(R"DOC(This op will quantize data from FP32 to INT8)DOC"); } diff --git a/paddle/fluid/operators/randint_op.cu b/paddle/fluid/operators/randint_op.cu index a07a92621e6b3726be518df6abcec58257a91489..40e390b0b87246bbaa8474262df8ba5576297385 100644 --- a/paddle/fluid/operators/randint_op.cu +++ b/paddle/fluid/operators/randint_op.cu @@ -13,6 +13,7 @@ // limitations under the License. #include #include +#include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/uniform_random_op.h" @@ -49,15 +50,23 @@ class GPURandintKernel : public framework::OpKernel { int64_t size = out->numel(); unsigned int seed = static_cast(context.Attr("seed")); + + /* std::minstd_rand engine; if (seed == 0) { std::random_device rd; seed = rd(); } engine.seed(seed); + */ + std::uniform_int_distribution<> dist(context.Attr("low"), context.Attr("high") - 1); - for (int64_t i = 0; i < size; ++i) data[i] = dist(engine); + auto engine = framework::GetCPURandomEngine(seed); + + for (int64_t i = 0; i < size; ++i) { + data[i] = dist(*engine); + } if (platform::is_gpu_place(context.GetPlace())) { // Copy tensor to out diff --git a/paddle/fluid/operators/reduce_ops/CMakeLists.txt b/paddle/fluid/operators/reduce_ops/CMakeLists.txt index 3da481a142aa2282aade661de7679cf4edf597a0..a68666b100cb52c722c4fefc849e94947130010f 100644 --- a/paddle/fluid/operators/reduce_ops/CMakeLists.txt +++ b/paddle/fluid/operators/reduce_ops/CMakeLists.txt @@ -1,6 +1,10 @@ include(operators) if(WITH_GPU) - register_operators(DEPS cub) + if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) + register_operators(DEPS cub) + else() + register_operators() + endif() else() register_operators() endif() @@ -24,5 +28,9 @@ if(WITH_GPU) endif() if(WITH_GPU) - nv_test(check_reduce_rank_test SRCS check_reduce_rank_test.cu DEPS tensor cub) + if 
(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) + nv_test(check_reduce_rank_test SRCS check_reduce_rank_test.cu DEPS tensor cub) + else() + nv_test(check_reduce_rank_test SRCS check_reduce_rank_test.cu DEPS tensor) + endif() endif() diff --git a/paddle/fluid/operators/reduce_ops/logsumexp_op.cc b/paddle/fluid/operators/reduce_ops/logsumexp_op.cc index 322a1637f5deec909db13f1bd0433446cd7606ae..7cd164bfd3a3d77288b59c40f147ae9cdd8215e0 100644 --- a/paddle/fluid/operators/reduce_ops/logsumexp_op.cc +++ b/paddle/fluid/operators/reduce_ops/logsumexp_op.cc @@ -13,18 +13,138 @@ // limitations under the License. #include "paddle/fluid/operators/reduce_ops/logsumexp_op.h" -#include +#include #include -#include #include namespace paddle { namespace operators { -class LogsumexpOpMaker : public ops::ReduceOpMaker { - protected: - virtual std::string GetName() const { return "logsumexp"; } - virtual std::string GetOpType() const { return "Reduce logsumexp"; } +class LogsumexpOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "logsumexp"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "logsumexp"); + auto x_dims = ctx->GetInputDim("X"); + auto x_rank = x_dims.size(); + PADDLE_ENFORCE_LE(x_rank, 4, + platform::errors::InvalidArgument( + "The input tensor X's dimensions of logsumexp " + "should be less equal than 4. But received X's " + "dimensions = %d, X's shape = [%s].", + x_rank, x_dims)); + auto axis = ctx->Attrs().Get>("axis"); + PADDLE_ENFORCE_GT( + axis.size(), 0, + platform::errors::InvalidArgument( + "The size of axis of logsumexp " + "should be greater than 0. 
But received the size of axis " + "of logsumexp is %d.", + axis.size())); + + for (size_t i = 0; i < axis.size(); i++) { + PADDLE_ENFORCE_LT( + axis[i], x_rank, + platform::errors::InvalidArgument( + "axis[%d] should be in the " + "range [-dimension(X), dimension(X)] " + "where dimesion(X) is %d. But received axis[i] = %d.", + i, x_rank, axis[i])); + PADDLE_ENFORCE_GE( + axis[i], -x_rank, + platform::errors::InvalidArgument( + "axis[%d] should be in the " + "range [-dimension(X), dimension(X)] " + "where dimesion(X) is %d. But received axis[i] = %d.", + i, x_rank, axis[i])); + if (axis[i] < 0) { + axis[i] += x_rank; + } + } + + bool keepdim = ctx->Attrs().Get("keepdim"); + bool reduce_all = ctx->Attrs().Get("reduce_all"); + auto dims_vector = vectorize(x_dims); + if (reduce_all) { + if (keepdim) + ctx->SetOutputDim( + "Out", framework::make_ddim(std::vector(x_rank, 1))); + else + ctx->SetOutputDim("Out", {1}); + } else { + auto dims_vector = vectorize(x_dims); + if (keepdim) { + for (size_t i = 0; i < axis.size(); ++i) { + dims_vector[axis[i]] = 1; + } + } else { + const int kDelFlag = -1; + for (size_t i = 0; i < axis.size(); ++i) { + dims_vector[axis[i]] = kDelFlag; + } + dims_vector.erase( + std::remove(dims_vector.begin(), dims_vector.end(), kDelFlag), + dims_vector.end()); + } + if (!keepdim && dims_vector.size() == 0) { + dims_vector.push_back(1); + } + auto out_dims = framework::make_ddim(dims_vector); + ctx->SetOutputDim("Out", out_dims); + if (axis.size() > 0 && axis[0] != 0) { + // Only pass LoD when not reducing on the first dim. + ctx->ShareLoD("X", /*->*/ "Out"); + } + } + } +}; + +class LogsumexpOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(Tensor) The input tensor. Tensors with rank at most 4 are " + "supported."); + AddOutput("Out", "(Tensor) The result tensor."); + AddAttr>( + "axis", + "(list, default {0}) The dimensions to reduce. " + "Must be in the range [-rank(input), rank(input)). 
" + "If `axis[i] < 0`, the axis[i] to reduce is `rank + axis[i]`. " + "Note that reducing on the first dim will make the LoD info lost.") + .SetDefault({0}); + AddAttr("keepdim", + "(bool, default false) " + "If true, retain the reduced dimension with length 1.") + .SetDefault(false); + AddAttr("reduce_all", + "(bool, default false) " + "If true, output a scalar reduced along all dimensions.") + .SetDefault(false); + AddComment(string::Sprintf(R"DOC( +logsumexp Operator. + +This operator computes the logsumexp of input tensor along the given axis. +The result tensor has 1 fewer dimension than the input unless keep_dim is true. +If reduce_all is true, just reduce along all dimensions and output a scalar. + +)DOC")); + } +}; + +class LogsumexpGrapOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "logsumexp"); + OP_INOUT_CHECK(ctx->HasInput("Out"), "Input", "Out", "logsumexp"); + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", + "Out@GRAD", "logsumexp"); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } }; template @@ -32,7 +152,6 @@ class LogsumexpGradOpMaker : public framework::SingleGradOpMaker { public: using framework::SingleGradOpMaker::SingleGradOpMaker; - protected: void Apply(GradOpPtr op) const override { op->SetType("logsumexp_grad"); op->SetInput("X", this->Input("X")); @@ -46,18 +165,17 @@ class LogsumexpGradOpMaker : public framework::SingleGradOpMaker { } // namespace operators } // namespace paddle -REGISTER_OPERATOR(logsumexp, ops::ReduceOp, ops::LogsumexpOpMaker, +namespace ops = paddle::operators; + +REGISTER_OPERATOR(logsumexp, ops::LogsumexpOp, ops::LogsumexpOpMaker, ops::LogsumexpGradOpMaker, ops::LogsumexpGradOpMaker); -REGISTER_OPERATOR(logsumexp_grad, ops::ReduceGradOp); +REGISTER_OPERATOR(logsumexp_grad, 
ops::LogsumexpGrapOp); -REGISTER_OP_CPU_KERNEL(logsumexp, - ops::ReduceKernel, - ops::ReduceKernel); REGISTER_OP_CPU_KERNEL( - logsumexp_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel); + logsumexp, ops::LogsumexpKernel, + ops::LogsumexpKernel); +REGISTER_OP_CPU_KERNEL( + logsumexp_grad, + ops::LogsumexpGradKernel, + ops::LogsumexpGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/logsumexp_op.cu b/paddle/fluid/operators/reduce_ops/logsumexp_op.cu index c25e5d01b2758a96192d6fbf8f4e881770cbbbf0..86a31595ebaabcbc07fab64779c33566d5b020eb 100644 --- a/paddle/fluid/operators/reduce_ops/logsumexp_op.cu +++ b/paddle/fluid/operators/reduce_ops/logsumexp_op.cu @@ -12,16 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/reduce_ops/cub_reduce.h" #include "paddle/fluid/operators/reduce_ops/logsumexp_op.h" -REGISTER_OP_CUDA_KERNEL(logsumexp, - ops::ReduceKernel, - ops::ReduceKernel); +namespace ops = paddle::operators; + REGISTER_OP_CUDA_KERNEL( - logsumexp_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel); + logsumexp, ops::LogsumexpKernel, + ops::LogsumexpKernel); diff --git a/paddle/fluid/operators/reduce_ops/logsumexp_op.h b/paddle/fluid/operators/reduce_ops/logsumexp_op.h index 1d0e00262a37ff7160abd7a865e63377f8b30461..a478690976bd396db921b465d171a422451e0742 100644 --- a/paddle/fluid/operators/reduce_ops/logsumexp_op.h +++ b/paddle/fluid/operators/reduce_ops/logsumexp_op.h @@ -14,11 +14,20 @@ #pragma once -#include "paddle/fluid/operators/reduce_ops/reduce_op.h" +#include +#include +#include "paddle/fluid/operators/reduce_ops/reduce_op_function.h" namespace paddle { namespace operators { +#define HANDLE_DIM(NDIM, RDIM) \ + if (ndim == NDIM && rdim == RDIM) { \ + ReduceFunctor( \ + context.template device_context(), *input, output, \ + axis, keepdim); \ + } + struct LogsumexpFunctor { template void operator()(const DeviceContext& place, X* x, Y* y, const 
Dim& dim) { @@ -54,5 +63,106 @@ struct LogsumexpGradFunctor { } }; +template +class LogsumexpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + output->mutable_data(context.GetPlace()); + + auto axis = context.Attr>("axis"); + auto keepdim = context.Attr("keepdim"); + auto reduce_all = context.Attr("reduce_all"); + + const auto& input_dim_size = input->dims().size(); + // The dims has full dim, set the reduce_all is True + reduce_all |= (static_cast(axis.size()) == input_dim_size); + + if (reduce_all) { + // Flatten and reduce 1-D tensor + auto x = EigenVector::Flatten(*input); + auto out = EigenScalar::From(*output); + auto& place = + *context.template device_context().eigen_device(); + auto reduce_dim = Eigen::array({{0}}); + LogsumexpFunctor()(place, &x, &out, reduce_dim); + } else { + int ndim = input_dim_size; + int rdim = axis.size(); + // comments for accelerating compiling temporarily. 
+ // HANDLE_DIM(6, 5); + // HANDLE_DIM(6, 4); + // HANDLE_DIM(6, 3); + // HANDLE_DIM(6, 2); + // HANDLE_DIM(6, 1); + // HANDLE_DIM(5, 4); + // HANDLE_DIM(5, 3); + // HANDLE_DIM(5, 2); + // HANDLE_DIM(5, 1); + HANDLE_DIM(4, 3); + HANDLE_DIM(4, 2); + HANDLE_DIM(4, 1); + HANDLE_DIM(3, 2); + HANDLE_DIM(3, 1); + HANDLE_DIM(2, 1); + } + } +}; + +template +class LogsumexpGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* input = context.Input("X"); + auto* output = context.Input("Out"); + auto* output_grad = context.Input(framework::GradVarName("Out")); + auto* input_grad = context.Output(framework::GradVarName("X")); + input_grad->mutable_data(context.GetPlace()); + + auto axis = context.Attr>("axis"); + auto reduce_all = context.Attr("reduce_all"); + const auto input_dim_size = context.Input("X")->dims().size(); + reduce_all |= (static_cast(axis.size()) == input_dim_size); + + if (reduce_all) { + auto x = EigenVector::Flatten(*input); + auto y = EigenVector::Flatten(*output); + auto dy = EigenVector::Flatten(*output_grad); + auto dx = EigenVector::Flatten(*input_grad); + auto& place = + *context.template device_context().eigen_device(); + auto broadcast_dim = + Eigen::array({{static_cast(input->numel())}}); + LogsumexpGradFunctor()(place, &x, &y, &dx, &dy, broadcast_dim, + broadcast_dim[0]); + } else { + int rank = input->dims().size(); + switch (rank) { + case 1: + ReduceGradFunctor( + context.template device_context(), *input, *output, + *output_grad, input_grad, axis); + break; + case 2: + ReduceGradFunctor( + context.template device_context(), *input, *output, + *output_grad, input_grad, axis); + break; + case 3: + ReduceGradFunctor( + context.template device_context(), *input, *output, + *output_grad, input_grad, axis); + break; + case 4: + ReduceGradFunctor( + context.template device_context(), *input, *output, + *output_grad, input_grad, axis); + break; + } + } + } +}; + } // 
namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/reduce_ops/logsumexp_op.part.cu b/paddle/fluid/operators/reduce_ops/logsumexp_op.part.cu new file mode 100644 index 0000000000000000000000000000000000000000..81124e4f070a54444f4305dc903280548ac10b60 --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/logsumexp_op.part.cu @@ -0,0 +1,23 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// .part used to speed up nvcc compile +#include "paddle/fluid/operators/reduce_ops/logsumexp_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + logsumexp_grad, + ops::LogsumexpGradKernel, + ops::LogsumexpGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index 67a19cb83c36f9cb6ef0cdd65e9fc04a7bb4d169..25f9453571ac632e70b0755ca1e5566eb5bf6ee6 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -18,9 +18,10 @@ limitations under the License. 
*/ #include #include #include - #include "paddle/fluid/framework/data_type_transform.h" +#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/cast_op.h" +#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/reduce_ops/reduce_op_function.h" namespace paddle { @@ -34,6 +35,110 @@ namespace operators { } using Tensor = framework::Tensor; +using DDim = framework::DDim; + +inline void GetShuffledDim(const DDim& src_dims, DDim* dst_dims, + const std::vector& reduced_dims, + std::vector* perm_axis) { + // check if it's a reduced dim + std::vector src_dims_check(src_dims.size(), false); + size_t src_size = src_dims.size(); + size_t reduce_size = reduced_dims.size(); + for (size_t i = 0; i < reduce_size; ++i) { + dst_dims->at(src_size - reduce_size + i) = src_dims[reduced_dims[i]]; + (*perm_axis)[src_size - reduce_size + i] = reduced_dims[i]; + src_dims_check[reduced_dims[i]] = true; + } + + size_t offset = 0; + for (size_t i = 0; i < src_dims_check.size(); ++i) { + bool is_reduced = src_dims_check[i]; + if (!is_reduced) { + (*perm_axis)[offset] = i; + dst_dims->at(offset++) = src_dims[i]; + } + } +} + +template +void GetShuffledInput(const framework::ExecutionContext& context, + const Tensor* input, Tensor* shuffled_input, + const std::vector& dims) { + DDim shuffled_dims(input->dims()); + std::vector perm_axis(input->dims().size()); + GetShuffledDim(input->dims(), &shuffled_dims, dims, &perm_axis); + + shuffled_input->Resize(shuffled_dims); + shuffled_input->mutable_data(context.GetPlace()); + + math::TransposeNormal trans; + trans(context.template device_context(), *input, + shuffled_input, perm_axis); +} + +inline void GetOriginDimFromShuffled(const DDim& src_dim, + const std::vector& dims, + std::vector* origin_dim) { + DDim shuffled_dims(src_dim); + size_t n = src_dim.size(); + std::vector perm_axis(n); + GetShuffledDim(src_dim, &shuffled_dims, dims, &perm_axis); + for (size_t i = 0; i < n; ++i) { + 
(*origin_dim)[perm_axis[i]] = i; + } +} + +template +void HandleLargeDim(const framework::ExecutionContext& context, + const Tensor* input, Tensor* output, + const std::vector& dims, bool keep_dim) { + // shuffle the reduced dim to the end + Tensor shuffled_input; + GetShuffledInput(context, input, &shuffled_input, dims); + + // transpose to 2D tensor whose shape is {unreduced, reduced}. + const int64_t unreduced = output->numel(); + const int64_t reduced = shuffled_input.numel() / unreduced; + shuffled_input.Resize({unreduced, reduced}); + DDim output_dim = output->dims(); + output->Resize({unreduced}); + ReduceFunctor( + context.template device_context(), shuffled_input, output, + {1}, keep_dim); + output->Resize(output_dim); +} + +template +void HandleLargeDimGrad(const framework::ExecutionContext& context, + const framework::Tensor* x, + const framework::Tensor* out, + const framework::Tensor* dout, framework::Tensor* dx, + const std::vector& dims) { + const int64_t unreduced = out->numel(); + const int64_t reduced = x->numel() / unreduced; + DDim out_dim(out->dims()); + DDim x_dim(x->dims()); + // transpose and reshape X + Tensor shuffled_x; + GetShuffledInput(context, x, &shuffled_x, dims); + DDim shuffled_dim = shuffled_x.dims(); + shuffled_x.Resize({unreduced, reduced}); + // reshape dX {unreduced, reduced} + dx->Resize({unreduced, reduced}); + ReduceGradFunctor( + context.template device_context(), shuffled_x, *out, *dout, + dx, {1}); + // transpose dX + std::vector origin_axis(x_dim.size()); + GetOriginDimFromShuffled(x_dim, dims, &origin_axis); + Tensor dx_tmp; + framework::TensorCopy(*dx, context.GetPlace(), &dx_tmp); + dx_tmp.Resize(shuffled_dim); + dx->Resize(x_dim); + math::TransposeNormal trans; + trans(context.template device_context(), dx_tmp, dx, + origin_axis); +} template struct ReduceKernelFunctor { @@ -69,22 +174,27 @@ struct ReduceKernelFunctor { } else { int ndim = input->dims().size(); int rdim = dims.size(); - HANDLE_DIM(6, 5); - 
HANDLE_DIM(6, 4); - HANDLE_DIM(6, 3); - HANDLE_DIM(6, 2); - HANDLE_DIM(6, 1); - HANDLE_DIM(5, 4); - HANDLE_DIM(5, 3); - HANDLE_DIM(5, 2); - HANDLE_DIM(5, 1); - HANDLE_DIM(4, 3); - HANDLE_DIM(4, 2); - HANDLE_DIM(4, 1); - HANDLE_DIM(3, 2); - HANDLE_DIM(3, 1); - HANDLE_DIM(2, 1); - HANDLE_DIM(1, 1); + if (ndim > 6) { + HandleLargeDim(context, input, output, + dims, keep_dim); + } else { + HANDLE_DIM(6, 5); + HANDLE_DIM(6, 4); + HANDLE_DIM(6, 3); + HANDLE_DIM(6, 2); + HANDLE_DIM(6, 1); + HANDLE_DIM(5, 4); + HANDLE_DIM(5, 3); + HANDLE_DIM(5, 2); + HANDLE_DIM(5, 1); + HANDLE_DIM(4, 3); + HANDLE_DIM(4, 2); + HANDLE_DIM(4, 1); + HANDLE_DIM(3, 2); + HANDLE_DIM(3, 1); + HANDLE_DIM(2, 1); + HANDLE_DIM(1, 1); + } } } }; @@ -137,7 +247,6 @@ class ReduceKernel : public framework::OpKernel { } } }; - template class BoolReduceKernel : public framework::OpKernel { public: @@ -175,22 +284,27 @@ class BoolReduceKernel : public framework::OpKernel { int ndim = input->dims().size(); int rdim = dims.size(); // comments for accelerating compiling temporarily. 
- // HANDLE_DIM(6, 5); - // HANDLE_DIM(6, 4); - // HANDLE_DIM(6, 3); - // HANDLE_DIM(6, 2); - // HANDLE_DIM(6, 1); - // HANDLE_DIM(5, 4); - // HANDLE_DIM(5, 3); - // HANDLE_DIM(5, 2); - // HANDLE_DIM(5, 1); - HANDLE_DIM(4, 3); - HANDLE_DIM(4, 2); - HANDLE_DIM(4, 1); - HANDLE_DIM(3, 2); - HANDLE_DIM(3, 1); - HANDLE_DIM(2, 1); - HANDLE_DIM(1, 1); + if (ndim > 6) { + HandleLargeDim(context, input, output, + dims, keep_dim); + } else { + HANDLE_DIM(6, 5); + HANDLE_DIM(6, 4); + HANDLE_DIM(6, 3); + HANDLE_DIM(6, 2); + HANDLE_DIM(6, 1); + HANDLE_DIM(5, 4); + HANDLE_DIM(5, 3); + HANDLE_DIM(5, 2); + HANDLE_DIM(5, 1); + HANDLE_DIM(4, 3); + HANDLE_DIM(4, 2); + HANDLE_DIM(4, 1); + HANDLE_DIM(3, 2); + HANDLE_DIM(3, 1); + HANDLE_DIM(2, 1); + HANDLE_DIM(1, 1); + } } } }; @@ -279,6 +393,10 @@ class ReduceGradKernel : public framework::OpKernel { context.template device_context(), *input0, *input1, *input2, output, dims); break; + default: + HandleLargeDimGrad(context, input0, input1, + input2, output, dims); + break; } } } @@ -313,12 +431,6 @@ class ReduceOp : public framework::OperatorWithKernel { OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "ReduceOp"); auto x_dims = ctx->GetInputDim("X"); auto x_rank = x_dims.size(); - PADDLE_ENFORCE_LE(x_rank, 6, - platform::errors::InvalidArgument( - "The input tensor X's dimensions of ReduceOp " - "should be less equal than 6. But received X's " - "dimensions = %d, X's shape = [%s].", - x_rank, x_dims)); auto dims = ctx->Attrs().Get>("dim"); PADDLE_ENFORCE_GT(dims.size(), 0, platform::errors::InvalidArgument( @@ -402,11 +514,6 @@ class ReduceGradOp : public framework::OperatorWithKernel { "Out@GRAD", "ReduceOp"); auto x_dims = ctx->GetInputDim("X"); auto x_rank = x_dims.size(); - PADDLE_ENFORCE_LE(x_rank, 6, - platform::errors::InvalidArgument( - "Tensors with rank at most 6 are supported by " - "ReduceOp. 
Received tensor with rank %d.", - x_rank)); auto dims = ctx->Attrs().Get>("dim"); for (size_t i = 0; i < dims.size(); ++i) { PADDLE_ENFORCE_LT(dims[i], x_rank, diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc index 6e470e3af4e5860796d898a5e3138c28958264cb..54818470b277443e411ea5f7d9c7561eddc7046a 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc @@ -51,6 +51,20 @@ class ReduceSumOpGradMaker : public framework::SingleGradOpMaker { } }; +template +class ReduceSumDoubleOpGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetInput("X", this->OutputGrad(framework::GradVarName("X"))); + op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out"))); + op->SetAttrMap(this->Attrs()); + op->SetType("reduce_sum"); + } +}; + DECLARE_NO_NEED_BUFFER_VARS_INFERER(ReduceSumGradNoNeedBufferVarInferer, "X"); class ReduceSumVarTypeInference : public paddle::framework::VarTypeInference { public: @@ -77,6 +91,8 @@ REGISTER_OPERATOR(reduce_sum, ops::ReduceOp, ReduceSumOpMaker, ops::ReduceSumOpGradMaker, ops::ReduceSumOpGradMaker); REGISTER_OPERATOR(reduce_sum_grad, ops::ReduceGradOp, + ops::ReduceSumDoubleOpGradMaker, + ops::ReduceSumDoubleOpGradMaker, ops::ReduceSumGradNoNeedBufferVarInferer); REGISTER_OP_CPU_KERNEL( diff --git a/paddle/fluid/operators/roi_align_op.cc b/paddle/fluid/operators/roi_align_op.cc index 911dfea50e2e2cf8ec8f230bfc1e0bf4836463b6..0eeb7e0bb24f512aa6859e92de9f490e491543aa 100644 --- a/paddle/fluid/operators/roi_align_op.cc +++ b/paddle/fluid/operators/roi_align_op.cc @@ -11,6 +11,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/roi_align_op.h" #include +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace operators { @@ -35,13 +36,13 @@ class ROIAlignOp : public framework::OperatorWithKernel { auto input_dims = ctx->GetInputDim("X"); auto rois_dims = ctx->GetInputDim("ROIs"); - if (ctx->HasInput("RoisLod")) { - auto rois_lod_dims = ctx->GetInputDim("RoisLod"); + if (ctx->HasInput("RoisNum")) { + auto rois_num_dims = ctx->GetInputDim("RoisNum"); PADDLE_ENFORCE_EQ( - rois_lod_dims.size(), 1, - platform::errors::InvalidArgument("The RoisLod dimension should be 1" - ", but got dimension = %d", - rois_lod_dims.size())); + rois_num_dims.size(), 1, + platform::errors::InvalidArgument("The size of RoisNum should be 1" + ", but received size = %d", + rois_num_dims.size())); } PADDLE_ENFORCE_EQ( input_dims.size(), 4, @@ -145,9 +146,9 @@ class ROIAlignOpMaker : public framework::OpProtoAndCheckerMaker { "given as [[x1, y1, x2, y2], ...]. " "(x1, y1) is the top left coordinates, and " "(x2, y2) is the bottom right coordinates."); - AddInput("RoisLod", + AddInput("RoisNum", "(Tensor), " - "The lod info of rois.") + "The number of RoIs in each image.") .AsDispensable(); AddOutput("Out", "(Tensor), " @@ -203,7 +204,7 @@ class ROIAlignGradMaker : public framework::SingleGradOpMaker { op->SetType("roi_align_grad"); op->SetInput("X", this->Input("X")); op->SetInput("ROIs", this->Input("ROIs")); - op->SetInput("RoisLod", this->Input("RoisLod")); + op->SetInput("RoisNum", this->Input("RoisNum")); op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); op->SetAttrMap(this->Attrs()); @@ -231,3 +232,10 @@ REGISTER_OP_CPU_KERNEL( ops::CPUROIAlignGradOpKernel, ops::CPUROIAlignGradOpKernel, ops::CPUROIAlignGradOpKernel); +REGISTER_OP_VERSION(roi_align) + .AddCheckpoint( + R"ROC( + Upgrade roi_align add a new input [RoisNum])ROC", + 
paddle::framework::compatible::OpVersionDesc().NewInput( + "RoisNum", + "The number of RoIs in each image. RoisNum is dispensable.")); diff --git a/paddle/fluid/operators/roi_align_op.cu b/paddle/fluid/operators/roi_align_op.cu index f7ec13e5bccd63d2f6552ed52f8d709a57320ddd..3a4ce55f4fb77160e7fc645539c1868fe2864b19 100644 --- a/paddle/fluid/operators/roi_align_op.cu +++ b/paddle/fluid/operators/roi_align_op.cu @@ -257,24 +257,26 @@ class GPUROIAlignOpKernel : public framework::OpKernel { int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); auto& dev_ctx = ctx.cuda_device_context(); auto gplace = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); - if (ctx.HasInput("RoisLod")) { - auto* rois_lod = ctx.Input("RoisLod"); - int rois_batch_size = rois_lod->numel(); + if (ctx.HasInput("RoisNum")) { + auto* rois_num_t = ctx.Input("RoisNum"); + int rois_batch_size = rois_num_t->numel(); PADDLE_ENFORCE_EQ( - rois_batch_size - 1, batch_size, + rois_batch_size, batch_size, platform::errors::InvalidArgument( "The rois_batch_size and imgs " "batch_size must be the same. 
But received rois_batch_size = %d, " "batch_size = %d", rois_batch_size, batch_size)); - std::vector rois_lod_(rois_batch_size); - memory::Copy(cplace, rois_lod_.data(), gplace, rois_lod->data(), - sizeof(int64_t) * rois_batch_size, 0); - for (int n = 0; n < rois_batch_size - 1; ++n) { - for (size_t i = rois_lod_[n]; i < rois_lod_[n + 1]; ++i) { + std::vector rois_num_list(rois_batch_size); + memory::Copy(cplace, rois_num_list.data(), gplace, + rois_num_t->data(), sizeof(int) * rois_batch_size, 0); + int start = 0; + for (int n = 0; n < rois_batch_size; ++n) { + for (int i = start; i < start + rois_num_list[n]; ++i) { roi_batch_id_data[i] = n; } + start += rois_num_list[n]; } } else { auto lod = rois->lod(); @@ -348,16 +350,18 @@ class GPUROIAlignGradOpKernel : public framework::OpKernel { auto& dev_ctx = ctx.cuda_device_context(); auto gplace = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); - if (ctx.HasInput("RoisLod")) { - auto* rois_lod = ctx.Input("RoisLod"); - int rois_batch_size = rois_lod->numel(); - std::vector rois_lod_(rois_batch_size); - memory::Copy(cplace, rois_lod_.data(), gplace, rois_lod->data(), - sizeof(int64_t) * rois_batch_size, 0); - for (int n = 0; n < rois_batch_size - 1; ++n) { - for (size_t i = rois_lod_[n]; i < rois_lod_[n + 1]; ++i) { + if (ctx.HasInput("RoisNum")) { + auto* rois_num_t = ctx.Input("RoisNum"); + int rois_batch_size = rois_num_t->numel(); + std::vector rois_num_list(rois_batch_size); + memory::Copy(cplace, rois_num_list.data(), gplace, + rois_num_t->data(), sizeof(int) * rois_batch_size, 0); + int start = 0; + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = start; i < start + rois_num_list[n]; ++i) { roi_batch_id_data[i] = n; } + start += rois_num_list[n]; } } else { auto rois_lod = rois->lod().back(); diff --git a/paddle/fluid/operators/roi_align_op.h b/paddle/fluid/operators/roi_align_op.h index 366f865411461c91b4ec88203390b15fdba4414c..066125a92fbd9d1d49f0ba023366865620674e1f 100644 --- 
a/paddle/fluid/operators/roi_align_op.h +++ b/paddle/fluid/operators/roi_align_op.h @@ -165,21 +165,23 @@ class CPUROIAlignOpKernel : public framework::OpKernel { int* roi_batch_id_data = roi_batch_id_list.mutable_data(ctx.GetPlace()); int rois_batch_size; - if (ctx.HasInput("RoisLod")) { - auto* rois_lod_t = ctx.Input("RoisLod"); - rois_batch_size = rois_lod_t->numel(); + if (ctx.HasInput("RoisNum")) { + auto* rois_num_t = ctx.Input("RoisNum"); + rois_batch_size = rois_num_t->numel(); PADDLE_ENFORCE_EQ( - rois_batch_size - 1, batch_size, + rois_batch_size, batch_size, platform::errors::InvalidArgument( "The batch size of rois and the batch size of images " " must be the same. But received the batch size of rois is %d, " "and the batch size of images is %d", rois_batch_size, batch_size)); - auto* rois_lod = rois_lod_t->data(); - for (int n = 0; n < rois_batch_size - 1; ++n) { - for (int i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + auto* rois_num_data = rois_num_t->data(); + int start = 0; + for (int n = 0; n < rois_batch_size; ++n) { + for (int i = start; i < start + rois_num_data[n]; ++i) { roi_batch_id_data[i] = n; } + start += rois_num_data[n]; } } else { auto lod = rois->lod(); @@ -303,14 +305,16 @@ class CPUROIAlignGradOpKernel : public framework::OpKernel { roi_batch_id_list.mutable_data(ctx.GetPlace()); int rois_batch_size; - if (ctx.HasInput("RoisLod")) { - auto* rois_lod_t = ctx.Input("RoisLod"); - rois_batch_size = rois_lod_t->numel(); - auto* rois_lod = rois_lod_t->data(); - for (int n = 0; n < rois_batch_size - 1; ++n) { - for (int i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + if (ctx.HasInput("RoisNum")) { + auto* rois_num_t = ctx.Input("RoisNum"); + rois_batch_size = rois_num_t->numel(); + auto* rois_num_data = rois_num_t->data(); + int start = 0; + for (int n = 0; n < rois_batch_size; ++n) { + for (int i = start; i < start + rois_num_data[n]; ++i) { roi_batch_id_data[i] = n; } + start += rois_num_data[n]; } } else { auto rois_lod = 
rois->lod().back(); diff --git a/paddle/fluid/operators/roi_pool_op.cc b/paddle/fluid/operators/roi_pool_op.cc index 8a34cb35f6bf8dde97d29c02749d8a52fdf5f090..be3187b7513144f583458f3d7902a102e531a981 100644 --- a/paddle/fluid/operators/roi_pool_op.cc +++ b/paddle/fluid/operators/roi_pool_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/roi_pool_op.h" #include +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace operators { @@ -34,12 +35,13 @@ class ROIPoolOp : public framework::OperatorWithKernel { auto input_dims = ctx->GetInputDim("X"); auto rois_dims = ctx->GetInputDim("ROIs"); - if (ctx->HasInput("RoisLod")) { - auto rois_lod_dims = ctx->GetInputDim("RoisLod"); - PADDLE_ENFORCE_EQ(rois_lod_dims.size(), 1, + if (ctx->HasInput("RoisNum")) { + auto rois_num_dims = ctx->GetInputDim("RoisNum"); + PADDLE_ENFORCE_EQ(rois_num_dims.size(), 1, platform::errors::InvalidArgument( - "The lod information tensor of ROIs should " - "be one-dimensional")); + "The second dimension of RoisNum should " + "be 1, but received dimension is %d", + rois_num_dims.size())); } PADDLE_ENFORCE_EQ(input_dims.size(), 4, platform::errors::InvalidArgument( @@ -140,7 +142,8 @@ class ROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { "Where batch_id is the id of the data, " "(x1, y1) is the top left coordinates, and " "(x2, y2) is the bottom right coordinates."); - AddInput("RoisLod", "(Tensor), The lod info of rois.").AsDispensable(); + AddInput("RoisNum", "(Tensor), The number of RoIs in each image.") + .AsDispensable(); AddOutput("Out", "(Tensor), " "The output of ROIPoolOp is a 4-D tensor with shape " @@ -197,7 +200,7 @@ class ROIPoolGradMaker : public framework::SingleGradOpMaker { op->SetType("roi_pool_grad"); op->SetInput("X", this->Input("X")); op->SetInput("ROIs", this->Input("ROIs")); - op->SetInput("RoisLod", this->Input("RoisLod")); + op->SetInput("RoisNum", this->Input("RoisNum")); 
op->SetInput("Argmax", this->Output("Argmax")); op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); @@ -223,3 +226,10 @@ REGISTER_OP_CPU_KERNEL( ops::CPUROIPoolGradOpKernel, ops::CPUROIPoolGradOpKernel, ops::CPUROIPoolGradOpKernel); +REGISTER_OP_VERSION(roi_pool) + .AddCheckpoint( + R"ROC( + Upgrade roi_pool add a new input [RoisNum])ROC", + paddle::framework::compatible::OpVersionDesc().NewInput( + "RoisNum", + "The number of RoIs in each image. RoisNum is dispensable.")); diff --git a/paddle/fluid/operators/roi_pool_op.cu b/paddle/fluid/operators/roi_pool_op.cu index 1e8a8e3037d84f980d963d359ce791b1ddba47d3..98d9ef6b6e11440d38abbedbfd93f6d3544d77bc 100644 --- a/paddle/fluid/operators/roi_pool_op.cu +++ b/paddle/fluid/operators/roi_pool_op.cu @@ -157,19 +157,21 @@ class GPUROIPoolOpKernel : public framework::OpKernel { int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); auto& dev_ctx = ctx.cuda_device_context(); auto gplace = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); - if (ctx.HasInput("RoisLod")) { - auto* rois_lod = ctx.Input("RoisLod"); - int rois_batch_size = rois_lod->numel(); + if (ctx.HasInput("RoisNum")) { + auto* rois_num_t = ctx.Input("RoisNum"); + int rois_batch_size = rois_num_t->numel(); PADDLE_ENFORCE_EQ( - rois_batch_size - 1, batch_size, + rois_batch_size, batch_size, "The rois_batch_size and imgs batch_size must be the same."); - std::vector rois_lod_(rois_batch_size); - memory::Copy(cplace, rois_lod_.data(), gplace, rois_lod->data(), - sizeof(int64_t) * rois_batch_size, 0); - for (int n = 0; n < rois_batch_size - 1; ++n) { - for (size_t i = rois_lod_[n]; i < rois_lod_[n + 1]; ++i) { + std::vector rois_num_list(rois_batch_size); + memory::Copy(cplace, rois_num_list.data(), gplace, + rois_num_t->data(), sizeof(int) * rois_batch_size, 0); + int start = 0; + for (int n = 0; n < rois_batch_size; ++n) { + for (int i = start; i < start + 
rois_num_list[n]; ++i) { roi_batch_id_data[i] = n; } + start += rois_num_list[n]; } } else { auto rois_lod = rois->lod().back(); @@ -206,7 +208,7 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto* in = ctx.Input("X"); auto* rois = ctx.Input("ROIs"); - auto* rois_lod = ctx.Input("RoisLod"); + auto* rois_lod = ctx.Input("RoisNum"); auto* argmax = ctx.Input("Argmax"); auto* out_grad = ctx.Input(framework::GradVarName("Out")); @@ -229,17 +231,18 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel { auto& dev_ctx = ctx.cuda_device_context(); auto gplace = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); - if (ctx.HasInput("RoisLod")) { - auto* rois_lod = ctx.Input("RoisLod"); - int rois_batch_size = rois_lod->numel(); - std::vector rois_lod_(rois_batch_size); - memory::Copy(cplace, rois_lod_.data(), gplace, - rois_lod->data(), - sizeof(int64_t) * rois_batch_size, 0); - for (int n = 0; n < rois_batch_size - 1; ++n) { - for (size_t i = rois_lod_[n]; i < rois_lod_[n + 1]; ++i) { + if (ctx.HasInput("RoisNum")) { + auto* rois_num_t = ctx.Input("RoisNum"); + int rois_batch_size = rois_num_t->numel(); + std::vector rois_num_list(rois_batch_size); + memory::Copy(cplace, rois_num_list.data(), gplace, + rois_num_t->data(), sizeof(int) * rois_batch_size, 0); + int start = 0; + for (int n = 0; n < rois_batch_size; ++n) { + for (int i = start; i < start + rois_num_list[n]; ++i) { roi_batch_id_data[i] = n; } + start += rois_num_list[n]; } } else { auto rois_lod = rois->lod().back(); diff --git a/paddle/fluid/operators/roi_pool_op.h b/paddle/fluid/operators/roi_pool_op.h index 145b170dedf0613328223526b0a40a3c064f3028..40de6d0cf6abbcc4a1505cb6eb121ca70813c780 100644 --- a/paddle/fluid/operators/roi_pool_op.h +++ b/paddle/fluid/operators/roi_pool_op.h @@ -58,18 +58,20 @@ class CPUROIPoolOpKernel : public framework::OpKernel { roi_batch_id_list.mutable_data(ctx.GetPlace()); int 
rois_batch_size; - if (ctx.HasInput("RoisLod")) { - auto* rois_lod_t = ctx.Input("RoisLod"); - rois_batch_size = rois_lod_t->numel(); + if (ctx.HasInput("RoisNum")) { + auto* rois_num_t = ctx.Input("RoisNum"); + rois_batch_size = rois_num_t->numel(); PADDLE_ENFORCE_EQ( - rois_batch_size - 1, batch_size, + rois_batch_size, batch_size, platform::errors::InvalidArgument("The rois_batch_size and imgs " "batch_size must be the same.")); - auto* rois_lod = rois_lod_t->data(); - for (int n = 0; n < rois_batch_size - 1; ++n) { - for (int i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + auto* rois_num_data = rois_num_t->data(); + int start = 0; + for (int n = 0; n < rois_batch_size; ++n) { + for (int i = start; i < start + rois_num_data[n]; ++i) { roi_batch_id_data[i] = n; } + start += rois_num_data[n]; } } else { auto rois_lod = rois->lod().back(); @@ -185,14 +187,16 @@ class CPUROIPoolGradOpKernel : public framework::OpKernel { roi_batch_id_list.mutable_data(ctx.GetPlace()); int rois_batch_size; - if (ctx.HasInput("RoisLod")) { - auto* rois_lod_t = ctx.Input("RoisLod"); - rois_batch_size = rois_lod_t->numel(); - auto* rois_lod = rois_lod_t->data(); - for (int n = 0; n < rois_batch_size - 1; ++n) { - for (int i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + if (ctx.HasInput("RoisNum")) { + auto* rois_num_t = ctx.Input("RoisNum"); + rois_batch_size = rois_num_t->numel(); + auto* rois_num_data = rois_num_t->data(); + int start = 0; + for (int n = 0; n < rois_batch_size; ++n) { + for (int i = start; i < start + rois_num_data[n]; ++i) { roi_batch_id_data[i] = n; } + start += rois_num_data[n]; } } else { auto rois_lod = rois->lod().back(); diff --git a/paddle/fluid/operators/run_program_op.cc b/paddle/fluid/operators/run_program_op.cc index 04559a93c866c72f2d0b309a5005557134355666..2d599716443901053aa3d5dc8e93759320175b24 100644 --- a/paddle/fluid/operators/run_program_op.cc +++ b/paddle/fluid/operators/run_program_op.cc @@ -27,9 +27,6 @@ class RunProgramOp : public 
framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(ctx->HasInputs("X"), true, platform::errors::NotFound( "Input(X) of RunProgramOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInputs("Params"), true, - platform::errors::NotFound( - "Input(Params) of RunProgramOp should not be null.")); PADDLE_ENFORCE_EQ(ctx->HasOutputs("Out"), true, platform::errors::NotFound( "Output(Out) of RunProgramOp should not be null.")); @@ -73,7 +70,8 @@ class RunProgramOpMaker : public framework::OpProtoAndCheckerMaker { "(vector)" "The input parameter of RunProgram operator, also the parameters " "of the loaded program.") - .AsDuplicable(); + .AsDuplicable() + .AsDispensable(); AddOutput("Out", "(vector)" "The output tensors of RunProgram operator, also the fetch " @@ -121,10 +119,6 @@ class RunProgramGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(ctx->HasInputs("X"), true, platform::errors::NotFound( "Input(X) of RunProgramGradOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInputs("Params"), true, - platform::errors::NotFound( - "Input(Params) of RunProgramGradOp should not be null.")); PADDLE_ENFORCE_EQ( ctx->HasInputs(framework::GradVarName("Out")), true, platform::errors::NotFound( diff --git a/paddle/fluid/operators/run_program_op.h b/paddle/fluid/operators/run_program_op.h index 1c493fc6be093a2af8f58c8e78d1be43de34306f..5afe25cf687fc96d1eaac33b2d0516c96c394a46 100644 --- a/paddle/fluid/operators/run_program_op.h +++ b/paddle/fluid/operators/run_program_op.h @@ -209,9 +209,14 @@ class RunProgramOpKernel : public framework::OpKernel { auto output_vars = ctx.MultiOutputVar("Out"); auto input_var_names = ctx.InputNames("X"); - auto param_names = ctx.InputNames("Params"); auto output_var_names = ctx.OutputNames("Out"); + // current program may not hold parameters + std::vector param_names; + if (!param_vars.empty()) { + param_names = ctx.InputNames("Params"); + } + auto *block = ctx.Attr("global_block"); auto *program = block->Program(); auto 
start_op_index = ctx.Attr("start_op_index"); diff --git a/paddle/fluid/operators/scale_op.h b/paddle/fluid/operators/scale_op.h index 64ee868fb6d8b1cf55f6400a28c10038efc7884e..11c81d23b2ed271ce89e6a27b1179e7d06dd0ebd 100644 --- a/paddle/fluid/operators/scale_op.h +++ b/paddle/fluid/operators/scale_op.h @@ -60,7 +60,10 @@ class ScaleKernel : public framework::OpKernel { out->mutable_data(in->place()); PADDLE_ENFORCE_EQ(in->dims(), out->dims(), - "in and out should have the same dim"); + paddle::platform::errors::InvalidArgument( + "the input and output should have the same dim" + "but input dim is %s, output dim is %s", + in->dims(), out->dims())); auto eigen_out = framework::EigenVector::Flatten(*out); auto eigen_in = framework::EigenVector::Flatten(*in); diff --git a/paddle/fluid/operators/segment_pool_op.cc b/paddle/fluid/operators/segment_pool_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..322cd97f01c3ad97ba74f049696fdec592ee524e --- /dev/null +++ b/paddle/fluid/operators/segment_pool_op.cc @@ -0,0 +1,166 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/segment_pool_op.h" +#include +#include + +namespace paddle { +namespace operators { + +class SegmentPoolOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "SegmentPool"); + OP_INOUT_CHECK(ctx->HasInput("SegmentIds"), "Input", "SegmentIds", + "SegmentPool"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "SegmentPool"); + auto dims = ctx->GetInputDim("X"); + dims[0] = -1; + ctx->SetOutputDim("Out", dims); + + if (ctx->Attrs().Get("pooltype") == "MEAN") { + OP_INOUT_CHECK(ctx->HasOutput("SummedIds"), "Output", "SummedIds", + "SegmentPool"); + ctx->SetOutputDim("SummedIds", {-1, 1}); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), + ctx.device_context()); + } +}; + +class SegmentPoolOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor) The input data of SegmentPoolOp"); + AddInput("SegmentIds", + "(Tensor) 1-D tensor which have the same size with the fist " + "dimension of input X."); + AddOutput("Out", "(Tensor) The output of SegmentPoolOp."); + AddOutput("SummedIds", + "(Tensor) This tensor is used to counts of segment ids for the " + "backward of the mean pool.") + .AsIntermediate(); + AddAttr( + "pooltype", + "(string, default 'SUM') the pooling type of SegmentPoolOp.") + .SetDefault("SUM") + .InEnum({"SUM", "MEAN", "MIN", "MAX"}); + AddComment(R"DOC( +Segment Pool Operator. + +This operator will pool the elements of input `X` which with the same index +in `SegmentIds`. + +For SUM operation, it computes a tensor such that $Out_i = \sum_{j} X_{j}$ +where sum is over j such that `SegmentIds[j] == i`. 
+ +For MEAN operation, it computes a tensor such that +$Out_i = \frac{1}{n_i} \sum_{j} X_{j}$ where sum is over j such that +`SegmentIds[j] == i` and $n_i$ is the number of all index `SegmentIds[j] == i`. + +For MIN operation, it computes a tensor such that $Out_i = \min_{j} X_{j}$ +where min is over j such that `SegmentIds[j] == i`. + +For MAX operation, it computes a tensor such that $Out_i = \max_{j} X_{j}$ +where max is over j such that `SegmentIds[j] == i`. + )DOC"); + } +}; + +class SegmentPoolGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", + framework::GradVarName("Out"), "SegmentPoolGrad"); + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "SegmentPoolGrad"); + auto og_dims = ctx->GetInputDim(framework::GradVarName("Out")); + auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ(og_dims.size(), x_dims.size(), + platform::errors::InvalidArgument( + "The rank of output grad must equal to Input(X). But " + "received: input rank %u, input shape [%s].", + og_dims.size(), og_dims)); + for (int64_t i = 1; i < og_dims.size(); ++i) { + PADDLE_ENFORCE_EQ( + og_dims[i], x_dims[i], + platform::errors::InvalidArgument( + "The dimension mismatch between Input(OUT@GRAD) and " + "Input(X). 
Received Input(OUT@GRAD): input rank %u, " + "input shape [%s]; received Input(X): input rank %u, " + "input shape [%s].", + og_dims.size(), og_dims, x_dims.size(), x_dims)); + } + + ctx->ShareDim("X", /*->*/ framework::GradVarName("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")), + ctx.device_context()); + } +}; + +template +class SegmentPoolGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op_desc_ptr) const override { + op_desc_ptr->SetType("segment_pool_grad"); + op_desc_ptr->SetInput("X", this->Input("X")); + op_desc_ptr->SetInput("SegmentIds", this->Input("SegmentIds")); + op_desc_ptr->SetInput("Out", this->Output("Out")); + if (BOOST_GET_CONST(std::string, this->GetAttr("pooltype")) == "MEAN") { + op_desc_ptr->SetInput("SummedIds", this->Output("SummedIds")); + } + op_desc_ptr->SetInput(framework::GradVarName("Out"), + this->OutputGrad("Out")); + op_desc_ptr->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + op_desc_ptr->SetAttrMap(this->Attrs()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(segment_pool, ops::SegmentPoolOp, ops::SegmentPoolOpMaker, + ops::SegmentPoolGradOpMaker, + ops::SegmentPoolGradOpMaker); +REGISTER_OPERATOR(segment_pool_grad, ops::SegmentPoolGradOp); + +REGISTER_OP_CPU_KERNEL( + segment_pool, + ops::SegmentPoolKernel, + ops::SegmentPoolKernel); + +REGISTER_OP_CPU_KERNEL( + segment_pool_grad, + ops::SegmentPoolGradKernel, + ops::SegmentPoolGradKernel); diff --git a/paddle/fluid/operators/segment_pool_op.h b/paddle/fluid/operators/segment_pool_op.h new file mode 100644 index 0000000000000000000000000000000000000000..a505946b9f5229425f724ae5469beb77863e9aaf 
--- /dev/null +++ b/paddle/fluid/operators/segment_pool_op.h @@ -0,0 +1,130 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/segment_pooling.h" +#include "paddle/fluid/platform/macros.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +void SegmentKernelLaunchHelper(const framework::ExecutionContext& context) { + auto* input = context.Input("X"); + auto* segment = context.Input("SegmentIds"); + auto* output = context.Output("Out"); + std::string pooltype = context.Attr("pooltype"); + Tensor* summed_ids = nullptr; + + int64_t num_indices = segment->numel(); + PADDLE_ENFORCE_EQ( + num_indices, input->dims()[0], + platform::errors::InvalidArgument( + "Segment_ids should be the same size as dimension 0 of input X.")); + PADDLE_ENFORCE_EQ(num_indices, segment->dims()[0], + platform::errors::InvalidArgument( + "Segment_ids should be 1-D tensor, or it's other " + "dimension size is 1. 
Segment_ids's shape is: [%s].", + segment->dims())); + + if (input->numel() == 0 || segment->numel() == 0) { + return; + } + + bool cpu_place = context.GetPlace().type() == typeid(platform::CPUPlace); + if (cpu_place) { + auto dims = input->dims(); + auto* segment_ids = segment->data(); + dims[0] = static_cast(segment_ids[segment->numel() - 1] + 1); + PADDLE_ENFORCE_GT( + dims[0], 0, + platform::errors::InvalidArgument( + "Segment ids must be >= 0, but got last id %d", dims[0])); + output->Resize({dims}); + output->mutable_data(context.GetPlace()); + math::SetConstant set_zero; + auto& dev_ctx = context.template device_context(); + set_zero(dev_ctx, output, static_cast(0)); + } + + SegmentPoolFunctor pool; + + pool(context.template device_context(), *input, *segment, + output, summed_ids, pooltype); +} + +template +class SegmentPoolKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* segment = context.Input("SegmentIds"); + auto index_type = segment->type(); + if (index_type == framework::proto::VarType::INT32) { + SegmentKernelLaunchHelper(context); + } else if (index_type == framework::proto::VarType::INT64) { + SegmentKernelLaunchHelper(context); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Unsupported index type, Expected int, int64, but got %s.", + index_type)); + } + } +}; + +template +class SegmentPoolGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* input = context.Input("X"); + auto* output = context.Input("Out"); + auto* segment = context.Input("SegmentIds"); + auto* out_g = context.Input(framework::GradVarName("Out")); + auto* in_g = context.Output(framework::GradVarName("X")); + std::string pooltype = context.Attr("pooltype"); + + const Tensor* summed_ids = nullptr; + if (pooltype == "MEAN") { + summed_ids = context.Input("SummedIds"); + } + + 
in_g->mutable_data(context.GetPlace()); + math::SetConstant set_zero; + auto& dev_ctx = context.template device_context(); + set_zero(dev_ctx, in_g, static_cast(0)); + + auto index_type = segment->type(); + if (index_type == framework::proto::VarType::INT32) { + SegmentPoolGradFunctor pool; + pool(context.template device_context(), *input, *output, + *out_g, *segment, in_g, summed_ids, pooltype); + } else if (index_type == framework::proto::VarType::INT64) { + SegmentPoolGradFunctor pool; + pool(context.template device_context(), *input, *output, + *out_g, *segment, in_g, summed_ids, pooltype); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Unsupported index type, Expected int, int64, but got %s.", + index_type)); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_op.h b/paddle/fluid/operators/sequence_ops/sequence_expand_op.h index 013170199a6bbe0246406b9c35a326bd063875a9..1186ed891e8c080c023aae5076cf1cb086fbc231 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_op.h @@ -92,9 +92,11 @@ class SequenceExpandKernel : public framework::OpKernel { auto& x_lod = x->lod(); auto& y_lod = y->lod(); - PADDLE_ENFORCE_EQ(y_lod.empty(), false, - "Input(Y) Tensor of SequenceExpandOp does not contain " - "LoD information."); + PADDLE_ENFORCE_EQ( + y_lod.empty(), false, + platform::errors::InvalidArgument( + "Input(Y) Tensor of SequenceExpandOp does not contain " + "LoD information.")); if (ref_level == -1) ref_level = y_lod.size() - 1; diff --git a/paddle/fluid/operators/shape_op.cc b/paddle/fluid/operators/shape_op.cc index 62bffe630484e3ab30bedcf2324f6516bca3b27e..0ecf9bfb5d8c0ccadbdd8b7a0b8f6193d4dc5310 100644 --- a/paddle/fluid/operators/shape_op.cc +++ b/paddle/fluid/operators/shape_op.cc @@ -68,6 +68,6 @@ REGISTER_OPERATOR( shape, ops::ShapeOp, ops::ShapeOpMaker, paddle::framework::EmptyGradOpMaker, 
paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL(shape, ops::ShapeKernel, ops::ShapeKernel, +REGISTER_OP_CPU_KERNEL(shape, ops::ShapeKernel, ops::ShapeKernel, ops::ShapeKernel, ops::ShapeKernel, ops::ShapeKernel); diff --git a/paddle/fluid/operators/shape_op.cu b/paddle/fluid/operators/shape_op.cu index 4b9dca0d4028be36ad8ba46ebe35db101e003ee9..5d50b17818cbb8068db2fded1f3f4e76bad44430 100644 --- a/paddle/fluid/operators/shape_op.cu +++ b/paddle/fluid/operators/shape_op.cu @@ -15,8 +15,8 @@ limitations under the License. */ #include "paddle/fluid/operators/shape_op.h" REGISTER_OP_CUDA_KERNEL( - shape, paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel, + shape, paddle::operators::ShapeKernel, + paddle::operators::ShapeKernel, paddle::operators::ShapeKernel, paddle::operators::ShapeKernel, paddle::operators::ShapeKernel, diff --git a/paddle/fluid/operators/size_op.cc b/paddle/fluid/operators/size_op.cc index b45fa7c791ff22be422ce12a8348a071c60ddd0f..70733d643673ad8acde9a45f273a52a9723fb0d3 100644 --- a/paddle/fluid/operators/size_op.cc +++ b/paddle/fluid/operators/size_op.cc @@ -53,7 +53,7 @@ REGISTER_OPERATOR( size, ops::SizeOp, ops::SizeOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL(size, ops::SizeKernel, ops::SizeKernel, +REGISTER_OP_CPU_KERNEL(size, ops::SizeKernel, ops::SizeKernel, ops::SizeKernel, ops::SizeKernel, ops::SizeKernel, ops::SizeKernel); diff --git a/paddle/fluid/operators/size_op.cu b/paddle/fluid/operators/size_op.cu index 3ea3032693236d5618ff6f0c858cbd85e34633ab..de56ecd95270577689f699462b9273b43f34595e 100644 --- a/paddle/fluid/operators/size_op.cu +++ b/paddle/fluid/operators/size_op.cu @@ -16,7 +16,7 @@ limitations under the License. 
*/ REGISTER_OP_CUDA_KERNEL( size, paddle::operators::SizeKernel, - paddle::operators::SizeKernel, + paddle::operators::SizeKernel, paddle::operators::SizeKernel, paddle::operators::SizeKernel, paddle::operators::SizeKernel, paddle::operators::SizeKernel); diff --git a/paddle/fluid/operators/size_op.h b/paddle/fluid/operators/size_op.h index fb44070897156ef88062231322e28a2db1f244a7..e8c53d6e683305bfc1ff7c052a2dc54ecf465936 100644 --- a/paddle/fluid/operators/size_op.h +++ b/paddle/fluid/operators/size_op.h @@ -26,8 +26,18 @@ class SizeKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto* in_t = ctx.Input("Input"); auto* out_t = ctx.Output("Out"); - auto out_data = out_t->mutable_data(platform::CPUPlace()); - out_data[0] = in_t->numel(); + auto place = ctx.GetPlace(); + auto out_data = out_t->mutable_data(place); + auto cpu_place = platform::CPUPlace(); + if (place == cpu_place) { + out_data[0] = in_t->numel(); + } else { + Tensor cpu_tensor; + auto cpu_data = + cpu_tensor.mutable_data(out_t->dims(), cpu_place); + cpu_data[0] = in_t->numel(); + TensorCopy(cpu_tensor, place, out_t); + } } }; } // namespace operators diff --git a/paddle/fluid/operators/strided_memcpy.h b/paddle/fluid/operators/strided_memcpy.h index f20bada8ab288fe74fd8ca82a73522a22b234191..142b00b4de66caaedda5c4f0723d31e3a819b8a4 100644 --- a/paddle/fluid/operators/strided_memcpy.h +++ b/paddle/fluid/operators/strided_memcpy.h @@ -60,20 +60,33 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx, auto place = ctx.GetPlace(); PADDLE_ENFORCE_EQ(src_stride_numel.size(), dst_stride_numel.size(), - "src and dst tensor should have the same dims size."); + platform::errors::InvalidArgument( + "Source and destination tensor should have the same " + "dimension size, but source tensor dimension size is " + "%u, destination tensor size is %u.", + src_stride_numel.size(), dst_stride_numel.size())); for (int64_t i = 0; i < axis; ++i) 
{ if (i < axis) { - PADDLE_ENFORCE_EQ(src_stride_numel[i] / src_stride_numel[axis], - dst_stride_numel[i] / dst_stride_numel[axis], - "src and dst should have the same elements " - "except the specified axis."); + PADDLE_ENFORCE_EQ( + src_stride_numel[i] / src_stride_numel[axis], + dst_stride_numel[i] / dst_stride_numel[axis], + platform::errors::InvalidArgument( + "Source and destination tensor should have the same number of " + "elements except the specified axis, but the source elements " + "number is %d, destination elements number is %d.", + src_stride_numel[i] / src_stride_numel[axis], + dst_stride_numel[i] / dst_stride_numel[axis])); } else if (i == axis) { continue; } else { - PADDLE_ENFORCE_EQ(src_stride_numel[i], dst_stride_numel[i], - "src and dst should have the same elements " - "except the specified axis."); + PADDLE_ENFORCE_EQ( + src_stride_numel[i], dst_stride_numel[i], + platform::errors::InvalidArgument( + "Source and destination tensor should have the same number of " + "elements except the specified axis, but the source elements " + "number is %d, destination elements number is %d.", + src_stride_numel[i], dst_stride_numel[i])); } } @@ -90,7 +103,8 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx, memory::Copy(gpu_place, dst + i * dst_after, gpu_place, src + i * src_after, sizeof(T) * size, cuda_ctx.stream()); #else - PADDLE_THROW("Paddle is not compiled with GPU"); + PADDLE_THROW(platform::errors::PreconditionNotMet( + "Paddle is not compiled with GPU.")); #endif } } diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index b06e8202cc79f017e26e3c8339ad05951a5a2bf7..52c4c63b473c443bb97fb7962179ce27e06fb16c 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -186,10 +186,17 @@ class SumOp : public framework::OperatorWithKernel { } } } - PADDLE_THROW("Cannot find the input data type by all input data"); + PADDLE_THROW(platform::errors::InvalidArgument( + 
"Expected each tensor in Input(x) in sum op has be initialized, but " + "some tensor in Input(x) is not be initialized, please check your " + "code.", + framework::ToTypeName(x_vars[0]->Type()))); } - PADDLE_THROW("Unexpected branch. Input type is %s", - framework::ToTypeName(x_vars[0]->Type())); + PADDLE_THROW(platform::errors::InvalidArgument( + "Expected type of Input(X) must be Tensor, SelectedRows or " + "LodTensorArray. But got " + "unsupport type: %s.", + framework::ToTypeName(x_vars[0]->Type()))); } }; diff --git a/paddle/fluid/operators/sum_op.cu b/paddle/fluid/operators/sum_op.cu index d0bf3a0abf58c47720216bd839eb84260ac207d8..6034cda50c32a857d8a501bf243a91df2f966eea 100644 --- a/paddle/fluid/operators/sum_op.cu +++ b/paddle/fluid/operators/sum_op.cu @@ -169,8 +169,18 @@ void SumToLoDTensor(const framework::ExecutionContext &context) { auto row_numel = sr_value.numel() / sr_rows.size(); auto out_dims = out->dims(); - PADDLE_ENFORCE_EQ(sr.height(), out_dims[0]); - PADDLE_ENFORCE_EQ(row_numel, out->numel() / sr.height()); + PADDLE_ENFORCE_EQ(sr.height(), out_dims[0], + platform::errors::InvalidArgument( + "The table height of input must be same as output, " + "but received input height is %d" + ", output height is %d", + sr.height(), out_dims[0])); + PADDLE_ENFORCE_EQ(row_numel, out->numel() / sr.height(), + platform::errors::InvalidArgument( + "The table width of input must be same as output, " + "but received input width is %d" + ", output width is %d", + row_numel, out->numel() / sr.height())); auto *sr_data = sr_value.data(); auto *sr_out_data = out->data(); @@ -231,8 +241,11 @@ class SumKernel } else if (out_var->IsType()) { LodTensorArrayCompute(context); } else { - PADDLE_THROW("Unexpected branch, output variable type is %s", - framework::ToTypeName(out_var->Type())); + PADDLE_THROW(platform::errors::InvalidArgument( + "Expected type of Ouput(out) must be Tensor, SelectedRows or " + "LodTensorArray. 
But got " + "unsupport type: %s.", + framework::ToTypeName(out_var->Type()))); } } }; diff --git a/paddle/fluid/operators/sum_op.h b/paddle/fluid/operators/sum_op.h index 6847a81377979ab05aec03f43ba08fbec646d974..4c8f7be6ea26394bd3143058260c1fc94ce1e7e1 100644 --- a/paddle/fluid/operators/sum_op.h +++ b/paddle/fluid/operators/sum_op.h @@ -182,7 +182,11 @@ class SumKernel : public framework::OpKernel { auto &in_t = in_vars[i]->Get(); functor(context.template device_context(), in_t, out); } else { - PADDLE_THROW("Variable type must be LoDTensor/SelectedRows."); + PADDLE_THROW(platform::errors::InvalidArgument( + "Expected type of Input(X) of %d-th must be Tensor, " + "SelectedRows. But got " + "unsupport type: %s.", + framework::ToTypeName(in_vars[i]->Type()))); } } } else if (out_var->IsType()) { @@ -190,8 +194,11 @@ class SumKernel : public framework::OpKernel { } else if (out_var->IsType()) { LodTensorArrayCompute(context); } else { - PADDLE_THROW("Unexpected branch, output variable type is %s", - framework::ToTypeName(out_var->Type())); + PADDLE_THROW(platform::errors::InvalidArgument( + "Expected type of Output(out) must be Tensor, SelectedRows, " + "LoDTensorArray. But got " + "unsupport type: %s.", + framework::ToTypeName(out_var->Type()))); } } }; diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index 012cae0ef5b442dce5e5ab0507f845b321eed567..4258ef9ba6fdbfebdc9cef450051d3efa11b9ee8 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -208,8 +208,11 @@ class TensorRTEngineOp : public framework::OperatorBase { auto stream = reinterpret_cast(dev_ctx).stream(); - PADDLE_ENFORCE_EQ(input_names_.empty(), false, - "should pass at least one input"); + PADDLE_ENFORCE_EQ( + input_names_.empty(), false, + platform::errors::PreconditionNotMet( + "TensorRT engine needs at least one input, but no input is found. 
" + "Please check if you set the input correctly.")); std::vector output_maps = Attr>("output_name_mapping"); @@ -298,12 +301,19 @@ class TensorRTEngineOp : public framework::OperatorBase { #endif } auto *fluid_v = scope.FindVar(y); - PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y); + PADDLE_ENFORCE_NOT_NULL( + fluid_v, + platform::errors::NotFound( + "Output variable %s is not found in TensorRT subgraph.", y)); auto *fluid_t = fluid_v->GetMutable(); fluid_t->Resize(framework::make_ddim(ddim)); - PADDLE_ENFORCE(bind_index < num_bindings, - "The bind index should be less than num_bindings"); + PADDLE_ENFORCE_LT(bind_index, num_bindings, + platform::errors::InvalidArgument( + "The binding index in TRT engine should be less " + "than the number of bindings, but got binding " + "index = %d, number of bindings = %d.", + bind_index, num_bindings)); buffers[bind_index] = static_cast(fluid_t->mutable_data( BOOST_GET_CONST(platform::CUDAPlace, dev_place))); diff --git a/paddle/fluid/operators/tile_op.cc b/paddle/fluid/operators/tile_op.cc index da4ca87296d92fc1052f462ae6ee8a3acb05eb49..bc1cb3b4aa1c1bdd0a9be39a4e113301d65ce5b5 100644 --- a/paddle/fluid/operators/tile_op.cc +++ b/paddle/fluid/operators/tile_op.cc @@ -241,6 +241,26 @@ class TileGradOpMaker : public framework::SingleGradOpMaker { } }; +template +class TileDoubleGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("tile"); + op->SetInput("X", this->OutputGrad(framework::GradVarName("X"))); + op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out"))); + if (this->HasInput("repeat_times_tensor")) { + op->SetInput("repeat_times_tensor", this->Input("repeat_times_tensor")); + } + if (this->HasInput("RepeatTimes")) { + op->SetInput("RepeatTimes", this->Input("RepeatTimes")); + } + op->SetAttrMap(this->Attrs()); + } +}; + 
DECLARE_NO_NEED_BUFFER_VARS_INFERER(TileGradNoNeedBufVarsInferer, "X"); } // namespace operators @@ -251,6 +271,8 @@ REGISTER_OPERATOR(tile, ops::TileOp, ops::TileOpMaker, ops::TileGradOpMaker, ops::TileGradOpMaker); REGISTER_OPERATOR(tile_grad, ops::TileGradOp, + ops::TileDoubleGradOpMaker, + ops::TileDoubleGradOpMaker, ops::TileGradNoNeedBufVarsInferer); REGISTER_OP_CPU_KERNEL( tile, ops::TileKernel, diff --git a/paddle/fluid/operators/top_k_v2_op.cc b/paddle/fluid/operators/top_k_v2_op.cc index cc72d83411f5a34561a75e7e75f98077ee5a4e5d..0e3fcced19ea8eb1580ca93fa9d6616685601f75 100644 --- a/paddle/fluid/operators/top_k_v2_op.cc +++ b/paddle/fluid/operators/top_k_v2_op.cc @@ -32,7 +32,6 @@ class TopkV2Op : public framework::OperatorWithKernel { auto input_dims = ctx->GetInputDim("X"); const int& dim_size = input_dims.size(); - const int k = static_cast(ctx->Attrs().Get("k")); int axis = static_cast(ctx->Attrs().Get("axis")); PADDLE_ENFORCE_EQ((axis < dim_size) && (axis >= (-1 * dim_size)), true, "the axis of topk" @@ -41,8 +40,18 @@ class TopkV2Op : public framework::OperatorWithKernel { if (axis < 0) axis += dim_size; - PADDLE_ENFORCE_GE( - k, 1, "the attribute of k in the topk must >= 1, but received %d .", k); + int k; + auto k_is_tensor = ctx->HasInput("K"); + if (k_is_tensor) { + k = -1; + } else { + k = static_cast(ctx->Attrs().Get("k")); + PADDLE_ENFORCE_EQ(k >= 1, true, + "the attribute of k in the topk must >= 1 or be a " + "Tensor, but received %d .", + k); + } + PADDLE_ENFORCE_GE(input_dims.size(), 1, "input of topk must have >= 1d shape"); diff --git a/paddle/fluid/operators/transpose_op.h b/paddle/fluid/operators/transpose_op.h index d7f5c3dd457c90eefc4181cdbc662196a046853e..e4e5dfdba9f6057161051e551d1f6711ba0cd4e9 100644 --- a/paddle/fluid/operators/transpose_op.h +++ b/paddle/fluid/operators/transpose_op.h @@ -53,10 +53,9 @@ inline void TransCompute(const int dim, const DeviceContext& dev_ctx, trans6(dev_ctx, in, out, axis); break; default: - 
PADDLE_THROW(platform::errors::InvalidArgument( - "Tensors with rank at most 6 are supported" - ", but received input tensor's rank is %d,", - dim)); + // for dim >= 7 situation + math::TransposeNormal trans_normal; + trans_normal(dev_ctx, in, out, axis); } } diff --git a/paddle/fluid/operators/truncated_gaussian_random_op.cu b/paddle/fluid/operators/truncated_gaussian_random_op.cu index 5a3510babe4d57b9e80f0e7898df98033834ca15..a838c30771a5c1229061a58b12c6777a3d24c6f3 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op.cu +++ b/paddle/fluid/operators/truncated_gaussian_random_op.cu @@ -15,6 +15,7 @@ limitations under the License. */ #include #include #include +#include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" @@ -46,6 +47,37 @@ struct TruncatedNormal { } }; +template +struct TruncatedNormalOffset { + T mean, std; + T a_normal_cdf; + T b_normal_cdf; + unsigned int seed; + T numeric_min; + int offset_; + + __host__ __device__ TruncatedNormalOffset(T mean, T std, T numeric_min, + int seed, int offset) + : mean(mean), + std(std), + seed(seed), + numeric_min(numeric_min), + offset_(offset) { + a_normal_cdf = (1.0 + erff(-2.0 / sqrtf(2.0))) / 2.0; + b_normal_cdf = (1.0 + erff(2.0 / sqrtf(2.0))) / 2.0; + } + + __host__ __device__ T operator()(const unsigned int n) const { + thrust::minstd_rand rng; + rng.seed(seed); + thrust::uniform_real_distribution dist(numeric_min, 1); + rng.discard(n); + T value = dist(rng); + auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value; + return std::sqrt(2.0) * erfinvf(2 * p - 1) * std + mean; + } +}; + template class GPUTruncatedGaussianRandomKernel : public framework::OpKernel { public: @@ -54,14 +86,31 @@ class GPUTruncatedGaussianRandomKernel : public framework::OpKernel { T* data = tensor->mutable_data(context.GetPlace()); unsigned int seed = static_cast(context.Attr("seed")); + bool seed_flag = false; if (seed == 0) { 
std::random_device rd; seed = rd(); + seed_flag = true; } T mean = static_cast(context.Attr("mean")); T std = static_cast(context.Attr("std")); thrust::counting_iterator index_sequence_begin(0); int64_t size = tensor->numel(); + + int device_id = + BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()).GetDeviceId(); + auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); + + if (gen_cuda->GetIsInitPy() && seed_flag) { + auto seed_offset = gen_cuda->IncrementOffset(1); + int gen_offset = size * seed_offset.second; + thrust::transform( + index_sequence_begin, index_sequence_begin + size, + thrust::device_ptr(data), + TruncatedNormalOffset(mean, std, std::numeric_limits::min(), + seed_offset.first, seed_offset.second)); + } + thrust::transform( index_sequence_begin, index_sequence_begin + size, thrust::device_ptr(data), diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc index 9cffe09a33abf29308072d6b3c8bfb8a636048da..6efada4343ca54c0d56f98cae20963bf0182f47b 100644 --- a/paddle/fluid/operators/uniform_random_op.cc +++ b/paddle/fluid/operators/uniform_random_op.cc @@ -54,9 +54,11 @@ class CPUUniformRandomKernel : public framework::OpKernel { tensor = out_var->GetMutable(); if (!new_shape.empty()) tensor->Resize(framework::make_ddim(new_shape)); } else { - PADDLE_THROW( - "uniform_random_op's output only" - "supports SelectedRows and LoDTensor"); + PADDLE_THROW(platform::errors::InvalidArgument( + "Expected type of Output(out) in uniform_random_op must be Tensor, " + "SelectedRows. 
But got " + "unsupport type: %s.", + framework::ToTypeName(out_var->Type()))); } T *data = tensor->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu index 4df1e0ffeb97564803f452114d52ab03d0464f8a..563a6c165b748543516eabbcdb0e1c8b9be8a44d 100644 --- a/paddle/fluid/operators/uniform_random_op.cu +++ b/paddle/fluid/operators/uniform_random_op.cu @@ -51,6 +51,39 @@ struct UniformGenerator { } }; +template +struct UniformGeneratorOffset { + T min_, max_; + unsigned int seed_; + T diag_val_; + unsigned int diag_num_; + unsigned int diag_step_; + int offset_; + __host__ __device__ UniformGeneratorOffset(T min, T max, int seed, + int diag_num, int diag_step, + T diag_val, int offset) + : min_(min), + max_(max), + seed_(seed), + diag_num_(diag_num), + diag_step_(diag_step), + diag_val_(diag_val), + offset_(offset) {} + + __host__ __device__ T operator()(const unsigned int n) const { + thrust::minstd_rand rng; + rng.seed(seed_); + thrust::uniform_real_distribution dist(min_, max_); + rng.discard(n + offset_); + T out = dist(rng); + unsigned int remainder = n % (diag_step_ + 1); + if (remainder == 0 && diag_num_ > n / (diag_step_ + 1)) { + out = diag_val_; + } + return out; + } +}; + // It seems that Eigen::Tensor::random in GPU will SEGFAULT. // Use std::random and thrust::random(thrust is a std library in CUDA) to // implement uniform random. @@ -83,16 +116,19 @@ class GPUUniformRandomKernel : public framework::OpKernel { tensor = out_var->GetMutable(); if (!new_shape.empty()) tensor->Resize(framework::make_ddim(new_shape)); } else { - PADDLE_THROW( - "uniform_random_op's output only" - "supports SelectedRows and LoDTensor"); + PADDLE_THROW(platform::errors::InvalidArgument( + "Expected type of Output(out) in uniform_random_op must be Tensor, " + "SelectedRows. 
But got " + "unsupport type: %s.", + framework::ToTypeName(out_var->Type()))); } T* data = tensor->mutable_data(context.GetPlace()); unsigned int seed = static_cast(context.Attr("seed")); - + bool seed_flag = false; if (seed == 0) { std::random_device rd; seed = rd(); + seed_flag = true; } T min = static_cast(context.Attr("min")); @@ -104,10 +140,23 @@ class GPUUniformRandomKernel : public framework::OpKernel { T diag_val = static_cast(context.Attr("diag_val")); thrust::counting_iterator index_sequence_begin(0); int64_t size = tensor->numel(); - thrust::transform( - index_sequence_begin, index_sequence_begin + size, - thrust::device_ptr(data), - UniformGenerator(min, max, seed, diag_num, diag_step, diag_val)); + int device_id = + BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()).GetDeviceId(); + auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); + if (gen_cuda->GetIsInitPy() && seed_flag) { + auto seed_offset = gen_cuda->IncrementOffset(1); + int gen_offset = size * seed_offset.second; + thrust::transform( + index_sequence_begin, index_sequence_begin + size, + thrust::device_ptr(data), + UniformGeneratorOffset(min, max, seed_offset.first, diag_num, + diag_step, diag_val, gen_offset)); + } else { + thrust::transform( + index_sequence_begin, index_sequence_begin + size, + thrust::device_ptr(data), + UniformGenerator(min, max, seed, diag_num, diag_step, diag_val)); + } } }; diff --git a/paddle/fluid/operators/uniform_random_op.h b/paddle/fluid/operators/uniform_random_op.h index d263dd03dd0de0d1b12925d0c3ec428b6730ef2e..6052e533643f3c4e5be977a87fceafa932892862 100644 --- a/paddle/fluid/operators/uniform_random_op.h +++ b/paddle/fluid/operators/uniform_random_op.h @@ -50,7 +50,10 @@ inline std::vector GetNewDataFromShapeTensor( } return vec_new_data; } else { - PADDLE_THROW("The dtype of shape tensor must be int32 or int64."); + PADDLE_THROW(platform::errors::InvalidArgument( + "Expected dtype of ShapeTensor must be int32, int64. 
But got " + "unsupport dtype: %s.", + paddle::framework::DataTypeToString(new_data_tensor->type()))); } } @@ -84,7 +87,11 @@ inline std::vector GetNewDataFromShapeTensorList( vec_new_shape.push_back(*tensor->data()); } } else { - PADDLE_THROW("The dtype of shape tensor must be int32 or int64."); + PADDLE_THROW(platform::errors::InvalidArgument( + "Expected dtype of ShapeTensorList of %d-th must be int32, int64. " + "But got " + "unsupport dtype: %s.", + i, paddle::framework::DataTypeToString(tensor->type()))); } } diff --git a/paddle/fluid/operators/utils.h b/paddle/fluid/operators/utils.h index e53981a53653a4830a39ceae47f4024bb757b039..aec995304a77118ecbf788ca3984c7e9da531f18 100644 --- a/paddle/fluid/operators/utils.h +++ b/paddle/fluid/operators/utils.h @@ -81,5 +81,26 @@ inline std::vector GetDataFromTensorList( } return vec_new_data; } + +inline framework::DDim GetShape(const framework::ExecutionContext& ctx) { + // 1. shape is a Tensor + if (ctx.HasInput("ShapeTensor")) { + auto* shape_tensor = ctx.Input("ShapeTensor"); + auto vec_shape = GetDataFromTensor(shape_tensor); + return framework::make_ddim(vec_shape); + } + + // 2. shape is a list/tuple containing Tensor + auto shape_tensor_list = ctx.MultiInput("ShapeTensorList"); + if (shape_tensor_list.size() > 0) { + auto vec_shape = GetDataFromTensorList(shape_tensor_list); + return framework::make_ddim(vec_shape); + } + + // 3. 
shape is a list/tuple without containing Tensor + auto vec_shape = ctx.Attr>("shape"); + return framework::make_ddim(vec_shape); +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/var_conv_2d_op.cc b/paddle/fluid/operators/var_conv_2d_op.cc index f8a29a52d7a3d9332b9dcb8189dfd7c1df902faa..db8b2c30501bd7f291b23728a26dcd3ea27e0ec5 100644 --- a/paddle/fluid/operators/var_conv_2d_op.cc +++ b/paddle/fluid/operators/var_conv_2d_op.cc @@ -78,21 +78,35 @@ void VarConv2dOP::InferShape(framework::InferShapeContext* ctx) const { platform::errors::NotFound("Col(Output) of VarConv2dOP is not found.")); auto x_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_EQ(x_dims.size(), 2, - "The rank of X(Input) can't be less than 2."); + PADDLE_ENFORCE_EQ( + x_dims.size(), 2, + platform::errors::InvalidArgument( + "The rank of X(Input) can't be less than 2, but received rank is %u.", + x_dims.size())); auto w_dims = ctx->GetInputDim("W"); - PADDLE_ENFORCE_EQ(w_dims.size(), 2, "W should be 2-D tensor"); + PADDLE_ENFORCE_EQ( + w_dims.size(), 2, + platform::errors::InvalidArgument( + "Input W should be a 2-D tensor, but its actual dimension is %u.", + w_dims.size())); int output_channel = ctx->Attrs().Get("OutputChannel"); int input_channel = ctx->Attrs().Get("InputChannel"); int kernel_h = ctx->Attrs().Get("KernelH"); int kernel_w = ctx->Attrs().Get("KernelW"); - PADDLE_ENFORCE_EQ(w_dims[0], output_channel, - "W dim[0] should be equal to OutputChannel"); + PADDLE_ENFORCE_EQ( + w_dims[0], output_channel, + platform::errors::InvalidArgument( + "Input W's dimension[0] should be equal to OutputChannel, the " + "dimension[0] is %d, OutputChannel is %d.", + w_dims[0], output_channel)); PADDLE_ENFORCE_EQ( w_dims[1], input_channel * kernel_h * kernel_w, - "W dim[1] should be equal to InputChannel * StrideH * StrideW"); + platform::errors::InvalidArgument( + "Input W's dimension[1] should be equal to InputChannel * StrideH * " + "StrideW, the dimension[1] is %d, 
expected value is %d.", + w_dims[1], input_channel * kernel_h * kernel_w)); if (ctx->IsRuntime()) { framework::Variable* x_var = @@ -103,10 +117,14 @@ void VarConv2dOP::InferShape(framework::InferShapeContext* ctx) const { platform::errors::InvalidArgument("The Input(X) Tensor of VarConv2dOP " "does not contain LoD information.")); - PADDLE_ENFORCE_GE(x_lod.size(), 1, "The Input(X)'s lod info is corrupted."); - PADDLE_ENFORCE_EQ( - x_dims[0], static_cast(x_lod[0].back()), - "The Input(X)'s lod info mismatches the actual tensor shape."); + PADDLE_ENFORCE_GE(x_lod.size(), 1, + platform::errors::InvalidArgument( + "The Input(X)'s lod info is corrupted.")); + PADDLE_ENFORCE_EQ(x_dims[0], static_cast(x_lod[0].back()), + platform::errors::InvalidArgument( + "The Input(X)'s lod info mismatches the actual " + "tensor shape, input lod is %s, tensor shape is %s.", + x_lod, x_dims)); framework::Variable* row_var = BOOST_GET(framework::Variable*, ctx->GetInputVarPtrs("ROW")[0]); diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 652b4dd47daa8aecdcae43e8c910d7dd61bbb64d..ef827fd74903afd007c864307e942749e3eb0bd1 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -136,6 +136,8 @@ cc_test(profiler_test SRCS profiler_test.cc DEPS profiler) nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor) cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor) +cc_test(bfloat16_test SRCS bfloat16_test.cc DEPS lod_tensor) + nv_test(test_limit_gpu_memory SRCS test_limit_gpu_memory.cu DEPS gpu_info flags) nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info) diff --git a/paddle/fluid/platform/bfloat16.h b/paddle/fluid/platform/bfloat16.h new file mode 100644 index 0000000000000000000000000000000000000000..742329abb2dae20437120c0d4ba5975d41b0a7c9 --- /dev/null +++ b/paddle/fluid/platform/bfloat16.h @@ -0,0 +1,439 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#if !defined(_WIN32) +#define PADDLE_ALIGN(x) __attribute__((aligned(x))) +#else +#define PADDLE_ALIGN(x) __declspec(align(x)) +#endif + +#include +#include "paddle/fluid/platform/hostdevice.h" +#include "unsupported/Eigen/CXX11/Tensor" + +namespace paddle { +namespace platform { + +struct PADDLE_ALIGN(2) bfloat16 { + public: + uint16_t x; + + bfloat16() = default; + bfloat16(const bfloat16& o) = default; + bfloat16& operator=(const bfloat16& o) = default; + bfloat16(bfloat16&& o) = default; + bfloat16& operator=(bfloat16&& o) = default; + ~bfloat16() = default; + + HOSTDEVICE inline explicit bfloat16(float val) { + std::memcpy(&x, reinterpret_cast(&val) + 2, 2); + } + + template + HOSTDEVICE inline explicit bfloat16(const T& val) + : x(bfloat16(static_cast(val)).x) {} + + HOSTDEVICE inline bfloat16& operator=(bool b) { + x = b ? 
0x3f80 : 0; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(int8_t val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(uint8_t val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(int16_t val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(uint16_t val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(int32_t val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(uint32_t val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(int64_t val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(uint64_t val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(float val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline bfloat16& operator=(double val) { + x = bfloat16(val).x; + return *this; + } + + HOSTDEVICE inline explicit operator float() const { + float val = 0.f; + uint16_t temp = x; + memcpy(reinterpret_cast(&val) + 2, reinterpret_cast(&temp), + 2); + return val; + } + + HOSTDEVICE inline explicit operator bool() const { return (x & 0x7fff) != 0; } + + HOSTDEVICE inline explicit operator int8_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator uint8_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator int16_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator uint16_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator int32_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator uint32_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator int64_t() const { + return 
static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator uint64_t() const { + return static_cast(static_cast(*this)); + } + + HOSTDEVICE inline explicit operator double() const { + return static_cast(static_cast(*this)); + } +}; + +HOSTDEVICE inline bfloat16 operator+(const bfloat16& a, const bfloat16& b) { + return bfloat16(static_cast(a) + static_cast(b)); +} + +HOSTDEVICE inline bfloat16 operator-(const bfloat16& a, const bfloat16& b) { + return bfloat16(static_cast(a) - static_cast(b)); +} + +HOSTDEVICE inline bfloat16 operator*(const bfloat16& a, const bfloat16& b) { + return bfloat16(static_cast(a) * static_cast(b)); +} + +HOSTDEVICE inline bfloat16 operator/(const bfloat16& a, const bfloat16& b) { + return bfloat16(static_cast(a) / static_cast(b)); +} + +HOSTDEVICE inline bfloat16 operator-(const bfloat16& a) { + bfloat16 res; + res.x = a.x ^ 0x8000; + return res; +} + +HOSTDEVICE inline bfloat16& operator+=(bfloat16& a, // NOLINT + const bfloat16& b) { + a = bfloat16(static_cast(a) + static_cast(b)); + return a; +} + +HOSTDEVICE inline bfloat16& operator-=(bfloat16& a, // NOLINT + const bfloat16& b) { + a = bfloat16(static_cast(a) - static_cast(b)); + return a; +} + +HOSTDEVICE inline bfloat16& operator*=(bfloat16& a, // NOLINT + const bfloat16& b) { + a = bfloat16(static_cast(a) * static_cast(b)); + return a; +} + +HOSTDEVICE inline bfloat16& operator/=(bfloat16& a, // NOLINT + const bfloat16& b) { + a = bfloat16(static_cast(a) / static_cast(b)); + return a; +} + +HOSTDEVICE inline bfloat16 raw_uint16_to_bfloat16(uint16_t a) { + bfloat16 res; + res.x = a; + return res; +} + +HOSTDEVICE inline bool operator==(const bfloat16& a, const bfloat16& b) { + return static_cast(a) == static_cast(b); +} + +HOSTDEVICE inline bool operator!=(const bfloat16& a, const bfloat16& b) { + return static_cast(a) != static_cast(b); +} + +HOSTDEVICE inline bool operator<(const bfloat16& a, const bfloat16& b) { + return static_cast(a) < static_cast(b); +} + 
+HOSTDEVICE inline bool operator<=(const bfloat16& a, const bfloat16& b) { + return static_cast(a) <= static_cast(b); +} + +HOSTDEVICE inline bool operator>(const bfloat16& a, const bfloat16& b) { + return static_cast(a) > static_cast(b); +} + +HOSTDEVICE inline bool operator>=(const bfloat16& a, const bfloat16& b) { + return static_cast(a) >= static_cast(b); +} + +HOSTDEVICE inline bool(isnan)(const bfloat16& a) { + return (a.x & 0x7FFF) > 0x7F80; +} + +HOSTDEVICE inline bool(isinf)(const bfloat16& a) { + return (a.x & 0x7F80) == 0x7F80; +} + +HOSTDEVICE inline bool(isfinite)(const bfloat16& a) { + return !((isnan)(a)) && !((isinf)(a)); +} + +inline std::ostream& operator<<(std::ostream& os, const bfloat16& a) { + os << a.x; + return os; +} + +} // namespace platform +} // namespace paddle + +namespace std { + +template <> +struct is_pod { + static const bool value = + is_trivial::value && + is_standard_layout::value; +}; + +template <> +struct is_floating_point + : std::integral_constant< + bool, std::is_same::type>::value> {}; +template <> +struct is_signed { + static const bool value = true; +}; + +template <> +struct is_unsigned { + static const bool value = false; +}; + +inline bool isnan(const paddle::platform::bfloat16& a) { + return paddle::platform::isnan(a); +} + +inline bool isinf(const paddle::platform::bfloat16& a) { + return paddle::platform::isinf(a); +} + +template <> +struct numeric_limits { + static const bool is_specialized = true; + static const bool is_signed = true; + static const bool is_integer = false; + static const bool is_exact = false; + static const bool has_infinity = true; + static const bool has_quiet_NaN = true; + static const bool has_signaling_NaN = true; + static const float_denorm_style has_denorm = denorm_present; + static const bool has_denorm_loss = false; + static const std::float_round_style round_style = std::round_to_nearest; + static const bool is_iec559 = false; + static const bool is_bounded = false; + static const 
bool is_modulo = false; + static const int digits = 8; + static const int digits10 = 2; + static const int max_digits10 = 9; + static const int radix = 2; + static const int min_exponent = -125; + static const int min_exponent10 = -37; + static const int max_exponent = 128; + static const int max_exponent10 = 38; + static const bool traps = true; + static const bool tinyness_before = false; + + static paddle::platform::bfloat16(min)() { + return paddle::platform::raw_uint16_to_bfloat16(0x007f); + } + static paddle::platform::bfloat16 lowest() { + return paddle::platform::raw_uint16_to_bfloat16(0xff7f); + } + static paddle::platform::bfloat16(max)() { + return paddle::platform::raw_uint16_to_bfloat16(0x7f7f); + } + static paddle::platform::bfloat16 epsilon() { + return paddle::platform::raw_uint16_to_bfloat16(0x3400); + } + static paddle::platform::bfloat16 round_error() { + return paddle::platform::bfloat16(0.5); + } + static paddle::platform::bfloat16 infinity() { + return paddle::platform::raw_uint16_to_bfloat16(0x7f80); + } + static paddle::platform::bfloat16 quiet_NaN() { + return paddle::platform::raw_uint16_to_bfloat16(0xffc1); + } + static paddle::platform::bfloat16 signaling_NaN() { + return paddle::platform::raw_uint16_to_bfloat16(0xff81); + } + static paddle::platform::bfloat16 denorm_min() { + return paddle::platform::raw_uint16_to_bfloat16(0x0001); + } +}; + +} // namespace std + +namespace Eigen { + +using bfloat16 = paddle::platform::bfloat16; + +template <> +struct NumTraits : GenericNumTraits { + enum { + IsSigned = true, + IsInteger = false, + IsComplex = false, + RequireInitialization = false + }; + + HOSTDEVICE static inline bfloat16 epsilon() { + return paddle::platform::raw_uint16_to_bfloat16(0x3400); + } + HOSTDEVICE static inline bfloat16 dummy_precision() { + return bfloat16(1e-5f); + } + HOSTDEVICE static inline bfloat16 highest() { + return paddle::platform::raw_uint16_to_bfloat16(0x7f7f); + } + HOSTDEVICE static inline bfloat16 lowest() { 
+ return paddle::platform::raw_uint16_to_bfloat16(0xff7f); + } + HOSTDEVICE static inline bfloat16 infinity() { + return paddle::platform::raw_uint16_to_bfloat16(0x7f80); + } + HOSTDEVICE static inline bfloat16 quiet_NaN() { + return paddle::platform::raw_uint16_to_bfloat16(0xffc1); + } +}; +namespace numext { + +template <> +HOSTDEVICE inline bool(isnan)(const bfloat16& a) { + return (paddle::platform::isnan)(a); +} + +template <> +HOSTDEVICE inline bool(isinf)(const bfloat16& a) { + return (paddle::platform::isinf)(a); +} + +template <> +HOSTDEVICE inline bool(isfinite)(const bfloat16& a) { + return (paddle::platform::isfinite)(a); +} + +template <> +HOSTDEVICE inline bfloat16 exp(const bfloat16& a) { + return bfloat16(::expf(static_cast(a))); +} + +template <> +HOSTDEVICE inline bfloat16 erf(const bfloat16& a) { + return bfloat16(::erff(static_cast(a))); +} + +template <> +HOSTDEVICE inline bfloat16 log(const bfloat16& a) { + return bfloat16(::logf(static_cast(a))); +} + +template <> +HOSTDEVICE inline bfloat16 tanh(const bfloat16& a) { + return bfloat16(::tanhf(static_cast(a))); +} + +template <> +HOSTDEVICE inline bfloat16 sqrt(const bfloat16& a) { + return bfloat16(::sqrtf(static_cast(a))); +} + +template <> +HOSTDEVICE inline bfloat16 ceil(const bfloat16& a) { + return bfloat16(::ceilf(static_cast(a))); +} + +template <> +HOSTDEVICE inline bfloat16 floor(const bfloat16& a) { + return bfloat16(::floorf(static_cast(a))); +} + +template <> +HOSTDEVICE inline bfloat16 round(const bfloat16& a) { + return bfloat16(::roundf(static_cast(a))); +} + +template <> +HOSTDEVICE inline bfloat16 pow(const bfloat16& a, const bfloat16& b) { + return bfloat16(::powf(static_cast(a), static_cast(b))); +} + +template <> +HOSTDEVICE inline bfloat16 abs(const bfloat16& a) { + return bfloat16(::fabs(static_cast(a))); +} + +} // namespace numext +} // namespace Eigen diff --git a/paddle/fluid/platform/bfloat16_test.cc b/paddle/fluid/platform/bfloat16_test.cc new file mode 100644 
index 0000000000000000000000000000000000000000..bdb508ee33630004daae132fcdcf71146a50e640 --- /dev/null +++ b/paddle/fluid/platform/bfloat16_test.cc @@ -0,0 +1,162 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/bfloat16.h" + +#include + +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h +#include "gtest/gtest.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/init.h" + +namespace paddle { +namespace platform { + +using bfloat16 = paddle::platform::bfloat16; + +TEST(bfloat16, conversion_cpu) { + // Conversion from float + EXPECT_EQ(bfloat16(1.0f).x, 0x3f80); + EXPECT_EQ(bfloat16(0.5f).x, 0x3f00); + EXPECT_EQ(bfloat16(0.33333f).x, 0x3eaa); + EXPECT_EQ(bfloat16(0.0f).x, 0x0000); + EXPECT_EQ(bfloat16(-0.0f).x, 0x8000); + EXPECT_EQ(bfloat16(65504.0f).x, 0x477f); + EXPECT_EQ(bfloat16(65536.0f).x, 0x4780); + + // Conversion from double + EXPECT_EQ(bfloat16(1.0).x, 0x3f80); + EXPECT_EQ(bfloat16(0.5).x, 0x3f00); + EXPECT_EQ(bfloat16(0.33333).x, 0x3eaa); + EXPECT_EQ(bfloat16(0.0).x, 0x0000); + EXPECT_EQ(bfloat16(-0.0).x, 0x8000); + EXPECT_EQ(bfloat16(65504.0).x, 0x477f); + EXPECT_EQ(bfloat16(65536.0).x, 0x4780); + + // Conversion from int + EXPECT_EQ(bfloat16(-1).x, 0xbf80); + EXPECT_EQ(bfloat16(0).x, 0x0000); + EXPECT_EQ(bfloat16(1).x, 0x3f80); + EXPECT_EQ(bfloat16(2).x, 0x4000); + 
EXPECT_EQ(bfloat16(3).x, 0x4040); + + // Conversion from bool + EXPECT_EQ(bfloat16(true).x, 0x3f80); + EXPECT_EQ(bfloat16(false).x, 0x0000); + + // Assignment operator + bfloat16 v_assign; + v_assign = bfloat16(0.f); + EXPECT_EQ(v_assign.x, 0x0000); + v_assign = 0.5f; + EXPECT_EQ(v_assign.x, 0x3f00); + v_assign = 0.33333; + EXPECT_EQ(v_assign.x, 0x3eaa); + v_assign = -1; + EXPECT_EQ(v_assign.x, 0xbf80); + + // Conversion operator + EXPECT_EQ(static_cast(bfloat16(0.5f)), 0.5f); + EXPECT_NEAR(static_cast(bfloat16(0.33333)), 0.33333, 0.01); + EXPECT_EQ(static_cast(bfloat16(-1)), -1); + EXPECT_EQ(static_cast(bfloat16(true)), true); +} + +TEST(bfloat16, arithmetic_cpu) { + EXPECT_NEAR(static_cast(bfloat16(1) + bfloat16(1)), 2, 0.001); + EXPECT_EQ(static_cast(bfloat16(5) + bfloat16(-5)), 0); + EXPECT_NEAR(static_cast(bfloat16(0.33333f) + bfloat16(0.66667f)), 1.0f, + 0.01); + EXPECT_EQ(static_cast(bfloat16(3) - bfloat16(5)), -2); + EXPECT_NEAR(static_cast(bfloat16(0.66667f) - bfloat16(0.33333f)), + 0.33334f, 0.01); + EXPECT_NEAR(static_cast(bfloat16(3.3f) * bfloat16(2.0f)), 6.6f, 0.01); + EXPECT_NEAR(static_cast(bfloat16(-2.1f) * bfloat16(-3.0f)), 6.3f, 0.1); + EXPECT_NEAR(static_cast(bfloat16(2.0f) / bfloat16(3.0f)), 0.66667f, + 0.01); + EXPECT_EQ(static_cast(bfloat16(1.0f) / bfloat16(2.0f)), 0.5f); + EXPECT_EQ(static_cast(-bfloat16(512.0f)), -512.0f); + EXPECT_EQ(static_cast(-bfloat16(-512.0f)), 512.0f); +} + +TEST(bfloat16, comparison_cpu) { + EXPECT_TRUE(bfloat16(1.0f) == bfloat16(1.0f)); + EXPECT_FALSE(bfloat16(-1.0f) == bfloat16(-0.5f)); + EXPECT_TRUE(bfloat16(1.0f) != bfloat16(0.5f)); + EXPECT_FALSE(bfloat16(-1.0f) != bfloat16(-1.0f)); + EXPECT_TRUE(bfloat16(1.0f) < bfloat16(2.0f)); + EXPECT_FALSE(bfloat16(-1.0f) < bfloat16(-1.0f)); + EXPECT_TRUE(bfloat16(1.0f) <= bfloat16(1.0f)); + EXPECT_TRUE(bfloat16(2.0f) > bfloat16(1.0f)); + EXPECT_FALSE(bfloat16(-2.0f) > bfloat16(-2.0f)); + EXPECT_TRUE(bfloat16(2.0f) >= bfloat16(2.0f)); +} + +TEST(bfloat16, lod_tensor_cpu) { 
+ framework::LoDTensor lod_tensor; + + std::vector input_data = {bfloat16(1.0f), bfloat16(0.5f), + bfloat16(0.33333f), bfloat16(0.0f)}; + EXPECT_EQ(input_data[0].x, 0x3f80); + EXPECT_EQ(input_data[1].x, 0x3f00); + EXPECT_EQ(input_data[2].x, 0x3eaa); + EXPECT_EQ(input_data[3].x, 0x0000); + + lod_tensor.Resize({4, 1}); + lod_tensor.set_lod(framework::LoD({{0, 2, 4}})); + bfloat16* data_ptr = lod_tensor.mutable_data(CPUPlace()); + + EXPECT_NE(data_ptr, nullptr); + EXPECT_EQ(input_data.size(), static_cast(lod_tensor.numel())); + for (size_t i = 0; i < input_data.size(); ++i) { + data_ptr[i] = input_data[i]; + EXPECT_EQ(data_ptr[i].x, input_data[i].x); + } +} + +TEST(bfloat16, floating) { + // compile time assert. + PADDLE_ENFORCE_EQ( + std::is_floating_point::value, true, + platform::errors::Fatal("std::is_floating_point with bfloat16 data type " + "should be equal to true but it is not")); +} + +TEST(bfloat16, print) { + bfloat16 a = bfloat16(1.0f); + std::cout << a << std::endl; +} + +// CPU test +TEST(bfloat16, isinf) { + bfloat16 a; + a.x = 0x7f80; + bfloat16 b = bfloat16(INFINITY); + bfloat16 c = static_cast(INFINITY); + EXPECT_EQ(std::isinf(a), true); + EXPECT_EQ(std::isinf(b), true); + EXPECT_EQ(std::isinf(c), true); +} + +TEST(bfloat16, isnan) { + bfloat16 a; + a.x = 0x7fff; + bfloat16 b = bfloat16(NAN); + bfloat16 c = static_cast(NAN); + EXPECT_EQ(std::isnan(a), true); + EXPECT_EQ(std::isnan(b), true); + EXPECT_EQ(std::isnan(c), true); +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/cuda_profiler.h b/paddle/fluid/platform/cuda_profiler.h index 957bdf1e698d0aedb86c5b0cb732ab545c260bcc..a9382f2c8adcb18e320ef44086a312f89c03ad09 100644 --- a/paddle/fluid/platform/cuda_profiler.h +++ b/paddle/fluid/platform/cuda_profiler.h @@ -24,7 +24,11 @@ namespace platform { void CudaProfilerInit(std::string output_file, std::string output_mode, std::string config_file) { - PADDLE_ENFORCE(output_mode == "kvp" || output_mode == "csv"); + 
PADDLE_ENFORCE(output_mode == "kvp" || output_mode == "csv", + platform::errors::InvalidArgument( + "Unsupported cuda profiler output mode, expect `kvp` or " + "`csv`, but received `%s`.", + output_mode)); cudaOutputMode_t mode = output_mode == "csv" ? cudaCSV : cudaKeyValuePair; PADDLE_ENFORCE_CUDA_SUCCESS( cudaProfilerInitialize(config_file.c_str(), output_file.c_str(), mode)); diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h index efb57e12fdbe650e74101355da73be929f072be7..4b9c5c429dabc32fad6f05e4f066ab063057e733 100644 --- a/paddle/fluid/platform/cudnn_helper.h +++ b/paddle/fluid/platform/cudnn_helper.h @@ -273,11 +273,123 @@ class ScopedTensorDescriptor { groups); } + inline cudnnTensorDescriptor_t descriptor(const cudnnDataType_t cudnn_type, + const std::vector& dim, + const std::vector& stride) { + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetTensorNdDescriptor( + desc_, cudnn_type, dim.size(), dim.data(), stride.data())); + return desc_; + } + + template + inline cudnnTensorDescriptor_t descriptor(const std::vector& dim, + const std::vector& stride) { + return descriptor(CudnnDataType::type, dim, stride); + } + + inline cudnnTensorDescriptor_t desc() { return desc_; } + private: cudnnTensorDescriptor_t desc_; DISABLE_COPY_AND_ASSIGN(ScopedTensorDescriptor); }; +#if CUDNN_VERSION >= 7201 +class ScopedRNNTensorDescriptor { + public: + ScopedRNNTensorDescriptor() { + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateRNNDataDescriptor(&desc_)); + } + + ~ScopedRNNTensorDescriptor() PADDLE_MAY_THROW { + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyRNNDataDescriptor(desc_)); + } + + inline cudnnRNNDataDescriptor_t descriptor( + const cudnnDataType_t cudnn_type, int max_seq_length, int batch_size, + int input_size, bool time_major, const std::vector& seq_length) { + static float padding_fill = 0.0f; + cudnnRNNDataLayout_t layout; + + if (time_major) { + layout = CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED; + } else { + layout = 
CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED; + } + + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetRNNDataDescriptor( + desc_, cudnn_type, layout, max_seq_length, batch_size, input_size, + seq_length.data(), static_cast(&padding_fill))); + + return desc_; + } + + template + inline cudnnRNNDataDescriptor_t descriptor( + int max_length, int batch_size, int input_size, bool time_major, + const std::vector& seq_length) { + return descriptor(CudnnDataType::type, max_length, batch_size, + input_size, time_major, seq_length); + } + + inline cudnnRNNDataDescriptor_t desc() { return desc_; } + + private: + cudnnRNNDataDescriptor_t desc_; + DISABLE_COPY_AND_ASSIGN(ScopedRNNTensorDescriptor); +}; +#endif + +class ScopedDropoutDescriptor { + public: + ScopedDropoutDescriptor() { + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateDropoutDescriptor(&desc_)); + } + ~ScopedDropoutDescriptor() PADDLE_MAY_THROW { + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyDropoutDescriptor(desc_)); + } + + inline cudnnDropoutDescriptor_t descriptor(const cudnnHandle_t& handle, + const platform::Place& place, + bool initialized, + float dropout_prob_, + framework::Tensor* dropout_state_, + int seed, size_t state_size) { + auto* dropout_state_data = dropout_state_->data(); + if (!initialized) { + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetDropoutDescriptor( + desc_, handle, dropout_prob_, dropout_state_data, state_size, seed)); + } else { + auto dropout_state_dims = dropout_state_->dims(); + state_size = dropout_state_dims[0]; + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnRestoreDropoutDescriptor( + desc_, handle, dropout_prob_, dropout_state_data, state_size, 0)); + } + return desc_; + } + inline cudnnDropoutDescriptor_t desc() { return desc_; } + + private: + cudnnDropoutDescriptor_t desc_; + DISABLE_COPY_AND_ASSIGN(ScopedDropoutDescriptor); +}; + +class ScopedRNNDescriptor { + public: + ScopedRNNDescriptor() { + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateRNNDescriptor(&desc_)); + } + 
~ScopedRNNDescriptor() PADDLE_MAY_THROW { + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyRNNDescriptor(desc_)); + } + + inline cudnnRNNDescriptor_t desc() { return desc_; } + + private: + cudnnRNNDescriptor_t desc_; + DISABLE_COPY_AND_ASSIGN(ScopedRNNDescriptor); +}; + class ScopedFilterDescriptor { public: ScopedFilterDescriptor() { @@ -314,6 +426,8 @@ class ScopedFilterDescriptor { kernel, groups); } + inline cudnnFilterDescriptor_t desc() { return desc_; } + private: cudnnFilterDescriptor_t desc_; DISABLE_COPY_AND_ASSIGN(ScopedFilterDescriptor); diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 29982c13c8ca88bc8b4a168f92e4116a283a97e8..34305c404b4df72ed28547e46817be00d6722a42 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -12,6 +12,7 @@ limitations under the License. */ #include "paddle/fluid/platform/device_context.h" #include #include +#include //NOLINT #include #include @@ -23,6 +24,7 @@ limitations under the License. 
*/ #endif #include "glog/logging.h" +#include "unsupported/Eigen/CXX11/ThreadPool" namespace paddle { namespace memory { @@ -131,16 +133,31 @@ DeviceContextPool::DeviceContextPool( CPUDeviceContext::CPUDeviceContext() { eigen_device_.reset(new Eigen::DefaultDevice()); + InitPoolDevice(); } CPUDeviceContext::CPUDeviceContext(CPUPlace place) : place_(place) { eigen_device_.reset(new Eigen::DefaultDevice()); + InitPoolDevice(); +} + +void CPUDeviceContext::InitPoolDevice() { + using EigenEnv = Eigen::StlThreadEnvironment; + using EigenThreadPool = Eigen::ThreadPoolTempl; + int num_threads = std::thread::hardware_concurrency(); + eigen_threadpool_.reset(new EigenThreadPool(num_threads)); + eigen_pool_device_.reset( + new Eigen::ThreadPoolDevice(eigen_threadpool_.get(), num_threads)); } Eigen::DefaultDevice* CPUDeviceContext::eigen_device() const { return eigen_device_.get(); } +Eigen::ThreadPoolDevice* CPUDeviceContext::eigen_pool_device() const { + return eigen_pool_device_.get(); +} + Place CPUDeviceContext::GetPlace() const { return place_; } #ifdef PADDLE_WITH_XPU diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 8bfdfc8a1c6033a79c197e1cd425197f77079bda..28d94627f9575573075beaa328a682314b5c3b71 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -41,6 +41,7 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/stream/cuda_stream.h" #endif +#define EIGEN_USE_THREADS #include "unsupported/Eigen/CXX11/Tensor" #ifdef PADDLE_WITH_XPU @@ -65,11 +66,17 @@ class CPUDeviceContext : public DeviceContext { Eigen::DefaultDevice* eigen_device() const; + Eigen::ThreadPoolDevice* eigen_pool_device() const; + Place GetPlace() const override; + inline void InitPoolDevice(); + private: CPUPlace place_; std::unique_ptr eigen_device_; + std::unique_ptr eigen_pool_device_; + std::unique_ptr eigen_threadpool_; }; template diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h index 7e32720c1d733411178c102d5c4500f722e7d005..562e7542012247c86add9e64f182d857ea969c60 100644 --- a/paddle/fluid/platform/dynload/cublas.h +++ b/paddle/fluid/platform/dynload/cublas.h @@ -38,14 +38,15 @@ extern void *cublas_dso_handle; */ #define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ struct DynLoad__##__name { \ - using FUNC_TYPE = decltype(&::__name); \ template \ - inline cublasStatus_t operator()(Args... args) { \ + inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ + using cublas_func = \ + decltype(::__name(std::declval()...)) (*)(Args...); \ std::call_once(cublas_dso_flag, []() { \ cublas_dso_handle = paddle::platform::dynload::GetCublasDsoHandle(); \ }); \ static void *p_##__name = dlsym(cublas_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ + return reinterpret_cast(p_##__name)(args...); \ } \ }; \ extern DynLoad__##__name __name diff --git a/paddle/fluid/platform/dynload/cudnn.cc b/paddle/fluid/platform/dynload/cudnn.cc index 44a03d6f14a3ba07d73cfbc944d8db9601394103..1166dc5e4ad93fa23ef00623de6777b78b56ea09 100644 --- a/paddle/fluid/platform/dynload/cudnn.cc +++ b/paddle/fluid/platform/dynload/cudnn.cc @@ -46,6 +46,10 @@ CUDNN_DNN_ROUTINE_EACH_R6(DEFINE_WRAP); CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP); #endif +#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7 +CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DEFINE_WRAP); +#endif + #ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R7 CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DEFINE_WRAP); #endif diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h index ebeb14e940e5fd904e506bca565c4aeae84c93cf..fba41417648ba606727d00e71f48766f47479989 100644 --- a/paddle/fluid/platform/dynload/cudnn.h +++ b/paddle/fluid/platform/dynload/cudnn.h @@ -180,6 +180,19 @@ CUDNN_DNN_ROUTINE_EACH_R6(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif +#if CUDNN_VERSION >= 7201 +#define CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(__macro) \ + __macro(cudnnCreateRNNDataDescriptor); \ + __macro(cudnnDestroyRNNDataDescriptor); \ + __macro(cudnnSetRNNDataDescriptor); \ + __macro(cudnnSetRNNPaddingMode); \ + __macro(cudnnRNNForwardTrainingEx); \ + __macro(cudnnRNNBackwardDataEx); \ + __macro(cudnnRNNBackwardWeightsEx); \ + __macro(cudnnRNNForwardInferenceEx); +CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +#endif + #if CUDNN_VERSION >= 7401 #define CUDNN_DNN_ROUTINE_EACH_AFTER_R7(__macro) \ 
__macro(cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize); \ diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index af8798a4b7cf5a8832ce9345cad45ce3096484e4..9116edd01b040e793d23c76a04b2c93ed4d2586b 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -521,3 +521,18 @@ DEFINE_int32( DEFINE_bool(sort_sum_gradient, false, "Sum gradients by the reverse order of " "the forward execution sequence."); + +/** + * Performance related FLAG + * Name: max_inplace_grad_add + * Since Version: 2.0.0 + * Value Range: int32, default=0 + * Example: + * Note: The maximum number of inplace grad_add. + */ +DEFINE_int32( + max_inplace_grad_add, 0, + "The maximum number of inplace grad_add. When doing " + "gradient accumulation, if the number of gradients need to that " + "less FLAGS_max_inplace_grad_add, than it will be use several grad_add" + "instead of sum. Default is 0."); diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index 3782eb684f21f8c09e9dac124082ae596fe5d1bc..b012a103ea3031efb381d7039b15e82b2af52bf7 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -161,6 +161,12 @@ inline mkldnn::memory::data_type MKLDNNGetDataType() { return mkldnn::memory::data_type::u8; } +template <> +inline mkldnn::memory::data_type +MKLDNNGetDataType() { + return mkldnn::memory::data_type::bf16; +} + inline void Reorder(mkldnn::memory src, mkldnn::memory dst, const mkldnn::engine& engine) { auto reorder_prim = mkldnn::reorder(src, dst); @@ -437,6 +443,13 @@ inline bool HasOpINT8DataType(const paddle::framework::OpDesc* op) { op->GetAttrIfExists("use_quantizer")); } +inline bool HasOpBFLOAT16DataType(const paddle::framework::OpDesc* op) { + return op->GetAttrIfExists("mkldnn_data_type") == "bfloat16"; +} + +inline bool HasOpFLOAT32DataType(const paddle::framework::OpDesc* op) { + return op->GetAttrIfExists("mkldnn_data_type") == "float32"; +} 
enum class RNNReorderType { PP_NTC, PP_TNC, NTC_PP, TNC_PP }; } // namespace platform diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index d733cf26ed209bcb86eaf2d366e45cfa0e7f9a90..92d9473141009216e3c7e64ccb793884dc67aadc 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -38,6 +38,7 @@ set(PYBIND_SRCS imperative.cc ir.cc inference_api.cc + compatible.cc generator_py.cc) if(WITH_GLOO) diff --git a/paddle/fluid/pybind/compatible.cc b/paddle/fluid/pybind/compatible.cc new file mode 100644 index 0000000000000000000000000000000000000000..971d230458db4bc2196ca529e01b0586da79567c --- /dev/null +++ b/paddle/fluid/pybind/compatible.cc @@ -0,0 +1,38 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/pybind/compatible.h" + +#include +#include + +#include "paddle/fluid/framework/op_version_registry.h" + +namespace py = pybind11; + +using paddle::framework::compatible::PassVersionCheckerRegistrar; + +namespace paddle { +namespace pybind { + +void BindCompatible(py::module* m) { + py::class_(*m, "PassVersionChecker") + .def_static("IsCompatible", [](const std::string& name) -> bool { + auto instance = PassVersionCheckerRegistrar::GetInstance(); + return instance.IsPassCompatible(name); + }); +} + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/compatible.h b/paddle/fluid/pybind/compatible.h new file mode 100644 index 0000000000000000000000000000000000000000..f9d4cf5888fee8f62ce2e64636da6b98542b1a75 --- /dev/null +++ b/paddle/fluid/pybind/compatible.h @@ -0,0 +1,23 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include + +namespace paddle { +namespace pybind { +void BindCompatible(pybind11::module *m); +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/generator_py.cc b/paddle/fluid/pybind/generator_py.cc index 90b7f501052530a306ba22ea6a244f0ef8fad563..67121e24089f7c6c5b8de985da89039eca85f094 100644 --- a/paddle/fluid/pybind/generator_py.cc +++ b/paddle/fluid/pybind/generator_py.cc @@ -59,6 +59,7 @@ void BindGenerator(py::module* m_ptr) { .def_property("_is_init_py", &framework::Generator::GetIsInitPy, &framework::Generator::SetIsInitPy); m.def("default_cpu_generator", &framework::DefaultCPUGenerator); -} // end Generator -} // end namespace pybind + m.def("default_cuda_generator", &framework::GetDefaultCUDAGenerator); +} +} // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/global_value_getter_setter.cc b/paddle/fluid/pybind/global_value_getter_setter.cc index 318178d5eb927e45fa6472a695ce57f4b2a058b8..894740e25c018b09f8604006ae06fa5b9dc14bf0 100644 --- a/paddle/fluid/pybind/global_value_getter_setter.cc +++ b/paddle/fluid/pybind/global_value_getter_setter.cc @@ -62,6 +62,7 @@ DECLARE_bool(use_system_allocator); // others DECLARE_bool(benchmark); DECLARE_int32(inner_op_parallelism); +DECLARE_int32(max_inplace_grad_add); DECLARE_string(tracer_profile_fname); #ifdef PADDLE_WITH_CUDA // cudnn @@ -348,7 +349,7 @@ static void RegisterGlobalVarGetterSetter() { FLAGS_init_allocated_mem, FLAGS_initial_cpu_memory_in_mb, FLAGS_memory_fraction_of_eager_deletion, FLAGS_use_pinned_memory, FLAGS_benchmark, FLAGS_inner_op_parallelism, FLAGS_tracer_profile_fname, - FLAGS_paddle_num_threads, FLAGS_use_mkldnn); + FLAGS_paddle_num_threads, FLAGS_use_mkldnn, FLAGS_max_inplace_grad_add); #ifdef PADDLE_WITH_CUDA REGISTER_PUBLIC_GLOBAL_VAR( diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index c6be52d1cd082316b02372613b938adc0fa9d6d8..be4d90597e1e1c647ac6750ee7cebdc2ede8a551 100644 
--- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -60,6 +60,9 @@ void BindAnalysisConfig(py::module *m); void BindAnalysisPredictor(py::module *m); void BindZeroCopyTensor(py::module *m); void BindPaddlePassBuilder(py::module *m); +void BindPaddleInferPredictor(py::module *m); +void BindPaddleInferTensor(py::module *m); +void BindPredictorPool(py::module *m); #ifdef PADDLE_WITH_MKLDNN void BindMkldnnQuantizerConfig(py::module *m); @@ -139,6 +142,15 @@ void ZeroCopyTensorCreate(ZeroCopyTensor &tensor, // NOLINT tensor.copy_from_cpu(static_cast(data.data())); } +template +void PaddleInferTensorCreate(paddle_infer::Tensor &tensor, // NOLINT + py::array_t data) { + std::vector shape; + std::copy_n(data.shape(), data.ndim(), std::back_inserter(shape)); + tensor.Reshape(std::move(shape)); + tensor.CopyFromCpu(static_cast(data.data())); +} + size_t PaddleGetDTypeSize(PaddleDType dt) { size_t size{0}; switch (dt) { @@ -183,6 +195,30 @@ py::array ZeroCopyTensorToNumpy(ZeroCopyTensor &tensor) { // NOLINT return array; } +py::array PaddleInferTensorToNumpy(paddle_infer::Tensor &tensor) { // NOLINT + py::dtype dt = PaddleDTypeToNumpyDType(tensor.type()); + auto tensor_shape = tensor.shape(); + py::array::ShapeContainer shape(tensor_shape.begin(), tensor_shape.end()); + py::array array(dt, std::move(shape)); + + switch (tensor.type()) { + case PaddleDType::INT32: + tensor.CopyToCpu(static_cast(array.mutable_data())); + break; + case PaddleDType::INT64: + tensor.CopyToCpu(static_cast(array.mutable_data())); + break; + case PaddleDType::FLOAT32: + tensor.CopyToCpu(static_cast(array.mutable_data())); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported data type. 
Now only supports INT32, INT64 and " + "FLOAT32.")); + } + return array; +} + py::bytes SerializePDTensorToBytes(PaddleTensor &tensor) { // NOLINT std::stringstream ss; paddle::inference::SerializePDTensorToStream(&ss, tensor); @@ -200,8 +236,11 @@ void BindInferenceApi(py::module *m) { BindNativePredictor(m); BindAnalysisConfig(m); BindAnalysisPredictor(m); + BindPaddleInferPredictor(m); BindZeroCopyTensor(m); + BindPaddleInferTensor(m); BindPaddlePassBuilder(m); + BindPredictorPool(m); #ifdef PADDLE_WITH_MKLDNN BindMkldnnQuantizerConfig(m); #endif @@ -209,8 +248,17 @@ void BindInferenceApi(py::module *m) { &paddle::CreatePaddlePredictor, py::arg("config")); m->def("create_paddle_predictor", &paddle::CreatePaddlePredictor, py::arg("config")); + m->def("create_predictor", [](const paddle_infer::Config &config) + -> std::unique_ptr { + auto pred = + std::unique_ptr( + new paddle_infer::Predictor(config)); + return std::move(pred); + }); m->def("paddle_dtype_size", &paddle::PaddleDtypeSize); m->def("paddle_tensor_to_bytes", &SerializePDTensorToBytes); + m->def("get_version", &paddle_infer::GetVersion); + m->def("get_num_bytes_of_data_type", &paddle_infer::GetNumBytesOfDataType); } namespace { @@ -448,6 +496,7 @@ void BindAnalysisConfig(py::module *m) { &AnalysisConfig::cpu_math_library_num_threads) .def("to_native_config", &AnalysisConfig::ToNativeConfig) .def("enable_quantizer", &AnalysisConfig::EnableMkldnnQuantizer) + .def("enable_mkldnn_bfloat16", &AnalysisConfig::EnableMkldnnBfloat16) #ifdef PADDLE_WITH_MKLDNN .def("quantizer_config", &AnalysisConfig::mkldnn_quantizer_config, py::return_value_policy::reference) @@ -524,6 +573,19 @@ void BindAnalysisPredictor(py::module *m) { py::arg("dir")); } +void BindPaddleInferPredictor(py::module *m) { + py::class_(*m, "PaddleInferPredictor") + .def(py::init()) + .def("get_input_names", &paddle_infer::Predictor::GetInputNames) + .def("get_output_names", &paddle_infer::Predictor::GetOutputNames) + .def("get_input_handle", 
&paddle_infer::Predictor::GetInputHandle) + .def("get_output_handle", &paddle_infer::Predictor::GetOutputHandle) + .def("run", &paddle_infer::Predictor::Run) + .def("clone", &paddle_infer::Predictor::Clone) + .def("clear_intermediate_tensor", + &paddle_infer::Predictor::ClearIntermediateTensor); +} + void BindZeroCopyTensor(py::module *m) { py::class_(*m, "ZeroCopyTensor") .def("reshape", &ZeroCopyTensor::Reshape) @@ -537,6 +599,26 @@ void BindZeroCopyTensor(py::module *m) { .def("type", &ZeroCopyTensor::type); } +void BindPaddleInferTensor(py::module *m) { + py::class_(*m, "PaddleInferTensor") + .def("reshape", &paddle_infer::Tensor::Reshape) + .def("copy_from_cpu", &PaddleInferTensorCreate) + .def("copy_from_cpu", &PaddleInferTensorCreate) + .def("copy_from_cpu", &PaddleInferTensorCreate) + .def("copy_to_cpu", &PaddleInferTensorToNumpy) + .def("shape", &paddle_infer::Tensor::shape) + .def("set_lod", &paddle_infer::Tensor::SetLoD) + .def("lod", &paddle_infer::Tensor::lod) + .def("type", &paddle_infer::Tensor::type); +} + +void BindPredictorPool(py::module *m) { + py::class_(*m, "PredictorPool") + .def(py::init()) + .def("retrive", &paddle_infer::services::PredictorPool::Retrive, + py::return_value_policy::reference); +} + void BindPaddlePassBuilder(py::module *m) { py::class_(*m, "PaddlePassBuilder") .def(py::init &>()) @@ -565,6 +647,7 @@ void BindPaddlePassBuilder(py::module *m) { .def("enable_cudnn", &PassStrategy::EnableCUDNN) .def("enable_mkldnn", &PassStrategy::EnableMKLDNN) .def("enable_mkldnn_quantizer", &PassStrategy::EnableMkldnnQuantizer) + .def("enable_mkldnn_bfloat16", &PassStrategy::EnableMkldnnBfloat16) .def("use_gpu", &PassStrategy::use_gpu); py::class_(*m, "CpuPassStrategy") @@ -572,14 +655,16 @@ void BindPaddlePassBuilder(py::module *m) { .def(py::init()) .def("enable_cudnn", &CpuPassStrategy::EnableCUDNN) .def("enable_mkldnn", &CpuPassStrategy::EnableMKLDNN) - .def("enable_mkldnn_quantizer", &CpuPassStrategy::EnableMkldnnQuantizer); + 
.def("enable_mkldnn_quantizer", &CpuPassStrategy::EnableMkldnnQuantizer) + .def("enable_mkldnn_bfloat16", &CpuPassStrategy::EnableMkldnnBfloat16); py::class_(*m, "GpuPassStrategy") .def(py::init<>()) .def(py::init()) .def("enable_cudnn", &GpuPassStrategy::EnableCUDNN) .def("enable_mkldnn", &GpuPassStrategy::EnableMKLDNN) - .def("enable_mkldnn_quantizer", &GpuPassStrategy::EnableMkldnnQuantizer); + .def("enable_mkldnn_quantizer", &GpuPassStrategy::EnableMkldnnQuantizer) + .def("enable_mkldnn_bfloat16", &GpuPassStrategy::EnableMkldnnBfloat16); } } // namespace } // namespace pybind diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index 256faf04ea6de5835f22113537caac49ca1dbab4..d3052ebd351ef4844d7563935172ed4b7eb1654c 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -43,6 +43,11 @@ std::map> op_ins_map = { {"nll_loss", {"X", "Label", "Weight"}}, {"bilinear_tensor_product", {"X", "Y", "Weight", "Bias"}}, {"gather", {"X", "Index", "Axis"}}, + {"roi_pool", {"X", "ROIs", "RoisNum"}}, + {"roi_align", {"X", "ROIs", "RoisNum"}}, + {"collect_fpn_proposals", + {"MultiLevelRois", "MultiLevelScores", "MultiLevelRoIsNum"}}, + {"distribute_fpn_proposals", {"FpnRois", "RoisNum"}}, }; // NOTE(zhiqiu): Like op_ins_map. 
@@ -63,6 +68,10 @@ std::map> op_outs_map = { {"Y", "MeanOut", "VarianceOut", "SavedMean", "SavedVariance", "ReserveSpace"}}, {"unique", {"Out", "Index", "Indices", "Counts"}}, + {"generate_proposals", {"RpnRois", "RpnRoiProbs", "RpnRoisNum"}}, + {"collect_fpn_proposals", {"FpnRois", "RoisNum"}}, + {"distribute_fpn_proposals", + {"MultiFpnRois", "RestoreIndex", "MultiLevelRoIsNum"}}, }; // NOTE(zhiqiu): Commonly, the outputs in auto-generated OP function are @@ -102,7 +111,10 @@ std::map> op_passing_outs_map = { {"fake_quantize_dequantize_moving_average_abs_max", {"Out", "OutScale", "OutAccum", "OutState"}}, {"fake_quantize_dequantize_abs_max", {"Out", "OutScale"}}, - {"amp_check_finite_and_scale", {"Out", "FoundInfinite"}}, + {"fake_channel_wise_quantize_dequantize_abs_max", {"Out", "OutScale"}}, + {"check_finite_and_unscale", {"Out", "FoundInfinite"}}, + {"update_loss_scaling", + {"Out", "LossScaling", "OutGoodSteps", "OutBadSteps"}}, }; // clang-format off diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index 9950eb9adc241ca5c82b4b0289dd57da4195e558..97056eca411f29e9a2c379cbcb2f88775242f692 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -184,6 +184,7 @@ void BindVarDsec(pybind11::module *m) { .value("FP16", pd::proto::VarType::FP16) .value("FP32", pd::proto::VarType::FP32) .value("FP64", pd::proto::VarType::FP64) + .value("BF16", pd::proto::VarType::BF16) .value("LOD_TENSOR", pd::proto::VarType::LOD_TENSOR) .value("SELECTED_ROWS", pd::proto::VarType::SELECTED_ROWS) .value("FEED_MINIBATCH", pd::proto::VarType::FEED_MINIBATCH) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 4b8f7c853ceaf2148722a9c65f38e0ec3d9f4df5..04087cb241c9cd4975773e646bc0ef6e1287518f 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ #include + #include #include #include @@ -22,6 +23,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/feed_fetch_type.h" @@ -60,6 +62,7 @@ limitations under the License. */ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/pybind/box_helper_py.h" +#include "paddle/fluid/pybind/compatible.h" #include "paddle/fluid/pybind/const_value.h" #include "paddle/fluid/pybind/data_set_py.h" #include "paddle/fluid/pybind/exception.h" @@ -2527,6 +2530,10 @@ All parameter, weight, gradient are variables in Paddle. "enable_inplace", [](const BuildStrategy &self) { return self.enable_inplace_; }, [](BuildStrategy &self, bool b) { self.enable_inplace_ = b; }) + .def_property( + "enable_addto", + [](const BuildStrategy &self) { return self.enable_addto_; }, + [](BuildStrategy &self, bool b) { self.enable_addto_ = b; }) .def_property( "fuse_all_reduce_ops", [](const BuildStrategy &self) { @@ -2619,6 +2626,7 @@ All parameter, weight, gradient are variables in Paddle. BindGraph(&m); BindNode(&m); BindInferenceApi(&m); + BindCompatible(&m); BindDataset(&m); BindGenerator(&m); #ifdef PADDLE_WITH_CRYPTO diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 4377a8c2cef5aab7a200955cd25830d448014817..5ee15073267b6eac8978022a70ead5d0f439c62f 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -26,6 +26,7 @@ limitations under the License. 
*/ #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/strided_memcpy.h" +#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/float16.h" #include "pybind11/numpy.h" @@ -104,6 +105,7 @@ struct ValidDTypeToPyArrayChecker { } DECLARE_VALID_DTYPE_TO_PY_ARRAY(platform::float16); +DECLARE_VALID_DTYPE_TO_PY_ARRAY(platform::bfloat16); DECLARE_VALID_DTYPE_TO_PY_ARRAY(float); DECLARE_VALID_DTYPE_TO_PY_ARRAY(double); DECLARE_VALID_DTYPE_TO_PY_ARRAY(bool); @@ -119,6 +121,9 @@ inline std::string TensorDTypeToPyDTypeStr( if (type == proto_type) { \ if (std::is_same::value) { \ return "e"; \ + } else if (std::is_same::value) { \ + /* NumPy character code of uint16 due to no support for bfloat16 */ \ + return "H"; \ } else { \ constexpr auto kIsValidDType = ValidDTypeToPyArrayChecker::kValue; \ PADDLE_ENFORCE_EQ( \ @@ -262,10 +267,10 @@ void SetTensorFromPyArray(framework::Tensor *self, const py::object &obj, SetTensorFromPyArrayT(self, array, place, zero_copy); } else if (py::isinstance>(array)) { - // TODO(cql): temporary keeping uint16, which is used for casting float16 - // before. It should be depracated later. 
- SetTensorFromPyArrayT(self, array, place, - zero_copy); + // since there is still no support for bfloat16 in NumPy, + // uint16 is used for casting bfloat16 + SetTensorFromPyArrayT(self, array, place, + zero_copy); } else if (py::isinstance>(array)) { SetTensorFromPyArrayT(self, array, place, zero_copy); } else { @@ -479,6 +484,8 @@ inline framework::Tensor *_sliceTensor(const framework::Tensor &self, switch (src_type) { case framework::proto::VarType::FP16: return _sliceAndConcat(self, obj, dim); + case framework::proto::VarType::BF16: + return _sliceAndConcat(self, obj, dim); case framework::proto::VarType::FP32: return _sliceAndConcat(self, obj, dim); case framework::proto::VarType::FP64: diff --git a/paddle/fluid/train/CMakeLists.txt b/paddle/fluid/train/CMakeLists.txt index 1f88eb2109aa23b6b60104451908b0a70c41c898..235d92ac4f9e88947cea04425b0916b8a0290979 100644 --- a/paddle/fluid/train/CMakeLists.txt +++ b/paddle/fluid/train/CMakeLists.txt @@ -27,10 +27,10 @@ function(train_test TARGET_NAME) endif() set_tests_properties(test_train_${TARGET_NAME}${arg} PROPERTIES DEPENDS test_${TARGET_NAME}) - set_tests_properties(test_train_${TARGET_NAME}${arg} - PROPERTIES LABELS "RUN_TYPE=DIST") - set_tests_properties(test_train_${TARGET_NAME}${arg} - PROPERTIES TIMEOUT 150) + if(NOT WIN32 AND NOT APPLE) + set_tests_properties(test_train_${TARGET_NAME}${arg} + PROPERTIES TIMEOUT 150) + endif() endforeach() endfunction(train_test) diff --git a/paddle/fluid/train/demo/README.md b/paddle/fluid/train/demo/README.md index bd53ab4b0c023b2591d792b504ab496a42d2835d..8a44c25aea9a0d7133ef915815d5e60227bd3e54 100644 --- a/paddle/fluid/train/demo/README.md +++ b/paddle/fluid/train/demo/README.md @@ -7,7 +7,7 @@ # WITH_MKLDNN=ON|OFF PADDLE_LIB=/paddle/lib/dir -cmake .. -DFLUID_INSTALL_DIR=$PADDLE_LIB \ +cmake .. 
-DPADDLE_INSTALL_DIR=$PADDLE_LIB \ -DCMAKE_BUILD_TYPE=Release \ -DWITH_GPU=OFF \ -DWITH_STYLE_CHECK=OFF \ @@ -41,7 +41,7 @@ cd build # WITH_MKLDNN=ON|OFF PADDLE_LIB=/paddle/lib/dir -# PADDLE_LIB is the same with FLUID_INSTALL_DIR when building the lib +# PADDLE_LIB is the same with PADDLE_INSTALL_DIR when building the lib cmake .. -DPADDLE_LIB=$PADDLE_LIB \ -DWITH_MKLDNN=OFF \ -DWITH_MKL=OFF diff --git a/paddle/fluid/train/demo/demo_trainer.cc b/paddle/fluid/train/demo/demo_trainer.cc index 1087f5672459506cc7b824127cd822c0df7ba566..1ef98720f83697715c05e868177faba489fd8760 100644 --- a/paddle/fluid/train/demo/demo_trainer.cc +++ b/paddle/fluid/train/demo/demo_trainer.cc @@ -29,7 +29,9 @@ namespace train { void ReadBinaryFile(const std::string& filename, std::string* contents) { std::ifstream fin(filename, std::ios::in | std::ios::binary); - PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s", filename); + PADDLE_ENFORCE_EQ( + fin.is_open(), true, + platform::errors::Unavailable("Failed to open file %s.", filename)); fin.seekg(0, std::ios::end); contents->clear(); contents->resize(fin.tellg()); @@ -70,7 +72,8 @@ int main() { } } - PADDLE_ENFORCE_NE(loss_name, "", "loss not found"); + PADDLE_ENFORCE_NE(loss_name, "", + platform::errors::NotFound("Loss name is not found.")); // init all parameters executor.Run(*startup_program, &scope, 0); diff --git a/paddle/fluid/train/demo/run.sh b/paddle/fluid/train/demo/run.sh index f7efb3b3b7d5d9bf45e4b728006d7e24daa4be74..2955e7574daa2d2e41bbade95c3c213917d07d4f 100755 --- a/paddle/fluid/train/demo/run.sh +++ b/paddle/fluid/train/demo/run.sh @@ -14,12 +14,12 @@ function download() { download # build demo trainer -fluid_install_dir=${PADDLE_ROOT}/build/fluid_install_dir +paddle_install_dir=${PADDLE_ROOT}/build/paddle_install_dir mkdir -p build cd build rm -rf * -cmake .. -DPADDLE_LIB=$fluid_install_dir \ +cmake .. 
-DPADDLE_LIB=$paddle_install_dir \ -DWITH_MKLDNN=$TURN_ON_MKL \ -DWITH_MKL=$TURN_ON_MKL make diff --git a/paddle/fluid/train/imdb_demo/README.md b/paddle/fluid/train/imdb_demo/README.md index ecc985e13f8a7a2e9d2da037b98ccd2d1574794c..28fd66710f80dda06b1c87266362cb969b42534c 100644 --- a/paddle/fluid/train/imdb_demo/README.md +++ b/paddle/fluid/train/imdb_demo/README.md @@ -11,7 +11,7 @@ PADDLE_ROOT=./Paddle cd Paddle mkdir build cd build -cmake -DFLUID_INFERENCE_INSTALL_DIR=$PADDLE_ROOT \ +cmake -DPADDLE_INFERENCE_INSTALL_DIR=$PADDLE_ROOT \ -DCMAKE_BUILD_TYPE=Release \ -DWITH_PYTHON=OFF \ -DWITH_MKL=OFF \ @@ -40,7 +40,7 @@ see: [IMDB Dataset of 50K Movie Reviews | Kaggle](https://www.kaggle.com/lakshmi mkdir build cd build rm -rf * - PADDLE_LIB=path/to/Paddle/build/fluid_install_dir + PADDLE_LIB=path/to/Paddle/build/paddle_install_dir cmake .. -DPADDLE_LIB=$PADDLE_LIB -DWITH_MKLDNN=OFF -DWITH_MKL=OFF make ``` diff --git a/paddle/fluid/train/imdb_demo/demo_trainer.cc b/paddle/fluid/train/imdb_demo/demo_trainer.cc index d45edd563f03d7a1b156d063d5e7296290d0eaba..a08069a57ca824f307b4bf8836237f573ab3c429 100644 --- a/paddle/fluid/train/imdb_demo/demo_trainer.cc +++ b/paddle/fluid/train/imdb_demo/demo_trainer.cc @@ -45,7 +45,9 @@ namespace train { void ReadBinaryFile(const std::string& filename, std::string* contents) { std::ifstream fin(filename, std::ios::in | std::ios::binary); - PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s", filename); + PADDLE_ENFORCE_EQ( + fin.is_open(), true, + platform::errors::Unavailable("Failed to open file %s.", filename)); fin.seekg(0, std::ios::end); contents->clear(); contents->resize(fin.tellg()); @@ -98,7 +100,11 @@ int main(int argc, char* argv[]) { file_vec.push_back(filename); } } - PADDLE_ENFORCE_GE(file_vec.size(), 1, "At least one file to train"); + PADDLE_ENFORCE_GE( + file_vec.size(), 1, + platform::errors::InvalidArgument( + "At least one file to train, but received number of file is %d.", + file_vec.size())); 
paddle::framework::InitDevices(false); const auto cpu_place = paddle::platform::CPUPlace(); paddle::framework::Executor executor(cpu_place); @@ -148,7 +154,9 @@ int main(int argc, char* argv[]) { const std::vector readers = dataset_ptr->GetReaders(); PADDLE_ENFORCE_EQ(readers.size(), 1, - "readers num should be equal to thread num"); + platform::errors::InvalidArgument( + "Readers num(%d) should be equal to thread num(1).", + readers.size())); readers[0]->SetPlace(paddle::platform::CPUPlace()); const std::vector& input_feed_names = readers[0]->GetUseSlotAlias(); diff --git a/paddle/fluid/train/test_train_recognize_digits.cc b/paddle/fluid/train/test_train_recognize_digits.cc index 45c438e8925b4e0a88e61ad509b88cd6226773a4..e7b698e1a34e267e392d696b67b92cd2e8c23f3b 100644 --- a/paddle/fluid/train/test_train_recognize_digits.cc +++ b/paddle/fluid/train/test_train_recognize_digits.cc @@ -51,7 +51,8 @@ void Train() { } } - PADDLE_ENFORCE_NE(loss_name, "", "loss not found"); + PADDLE_ENFORCE_NE(loss_name, "", + platform::errors::NotFound("Loss name is not found.")); // prepare data auto x_var = scope.Var("img"); diff --git a/paddle/http.log b/paddle/http.log new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index cfb59a04f8147f5c09aa08a01bcd304bf8ccc120..524c086c07925c880dfb46a70a1f930686bae867 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -21,14 +21,53 @@ rem ================================================= rem -------clean up environment----------- set work_dir=%cd% -if exist build rmdir build /s/q -mkdir build +wmic process where name="op_function_generator.exe" call terminate 2>NUL + +rem ------initialize common variable------ +if not defined CUDA_TOOLKIT_ROOT_DIR set CUDA_TOOLKIT_ROOT_DIR="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0" +if not defined BRANCH set BRANCH=develop 
+if not defined TENSORRT_ROOT set TENSORRT_ROOT="C:/TensorRT-5.1.5.0" +if not defined WITH_MKL set WITH_MKL=ON +if not defined WITH_GPU set WITH_GPU=OFF +if not defined WITH_AVX set WITH_AVX=ON +if not defined WITH_TESTING set WITH_TESTING=ON +if not defined WITH_PYTHON set WITH_PYTHON=ON +if not defined ON_INFER set ON_INFER=ON +if not defined WITH_INFERENCE_API_TEST set WITH_INFERENCE_API_TEST=ON +if not defined WITH_STATIC_LIB set WITH_STATIC_LIB=ON +if not defined WITH_CACHE set WITH_CACHE=ON +if not defined WITH_TPCACHE set WITH_TPCACHE=ON + + +rem -------set cache build work directory----------- +if "%WITH_CACHE%"=="OFF" ( + rmdir build /s/q + goto :mkbuild +) + +for /F %%# in ('wmic os get localdatetime^|findstr 20') do set datetime=%%# +set day_now=%datetime:~6,2% +set day_before=-1 +set /p day_before= day.txt + type day.txt + rmdir build /s/q +) +git diff origin/develop --stat --name-only | findstr "cmake CMakeLists.txt paddle_build.bat" +if %ERRORLEVEL% EQU 0 ( + rmdir build /s/q +) + +:mkbuild +if not exist build ( + mkdir build +) cd /d build -tree . +dir . 
dir paddle\fluid\pybind\Release -taskkill /f /im op_function_generator.exe 2>NUL -rem ------initialize the virtual environment------ +rem ------initialize the python environment------ if not defined PYTHON_ROOT set PYTHON_ROOT=C:\Python37 set PATH=%PYTHON_ROOT%;%PYTHON_ROOT%\Scripts;%PATH% @@ -39,7 +78,7 @@ rem %PYTHON_EXECUTABLE% -m pip install virtualenv rem %PYTHON_EXECUTABLE% -m virtualenv paddle_winci rem call paddle_winci\Scripts\activate.bat -rem ------pre install requirement---------- +rem ------pre install python requirement---------- where python where pip pip install --upgrade pip --user @@ -52,15 +91,17 @@ if %ERRORLEVEL% NEQ 0 ( exit /b 7 ) -rem ------initialize common variable------ -if not defined CUDA_TOOLKIT_ROOT_DIR set CUDA_TOOLKIT_ROOT_DIR="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0" -if not defined BRANCH set BRANCH=develop -if not defined WITH_AVX set WITH_AVX=ON -if not defined WITH_TESTING set WITH_TESTING=ON -if not defined WITH_PYTHON set WITH_PYTHON=ON -if not defined ON_INFER set ON_INFER=ON -if not defined WITH_INFERENCE_API_TEST set WITH_INFERENCE_API_TEST=OFF -if not defined WITH_TPCACHE set WITH_TPCACHE=ON +rem ------pre install clcache and init config---------- +pip install clcache +:: set USE_CLCACHE to enable clcache +set USE_CLCACHE=1 +:: In some scenarios, CLCACHE_HARDLINK can save one file copy. 
+set CLCACHE_HARDLINK=1 +:: If it takes more than 1000s to obtain the right to use the cache, an error will be reported +set CLCACHE_OBJECT_CACHE_TIMEOUT_MS=1000000 +:: set maximum cache size to 20G +clcache.exe -M 21474836480 + rem ------set cache third_party------ set cache_dir=%work_dir:Paddle=cache% @@ -101,6 +142,7 @@ exit /b 1 :CASE_wincheck_mkl set WITH_MKL=ON set WITH_GPU=OFF +set MSVC_STATIC_CRT=ON call :cmake || goto cmake_error call :build || goto build_error call :test_whl_pacakage || goto test_whl_pacakage_error @@ -112,9 +154,13 @@ goto:success :CASE_wincheck_openblas set WITH_MKL=OFF set WITH_GPU=ON +set MSVC_STATIC_CRT=OFF +rem Temporarily turn off WITH_INFERENCE_API_TEST on GPU due to compile hang +set WITH_INFERENCE_API_TEST=OFF call :cmake || goto cmake_error call :build || goto build_error call :test_whl_pacakage || goto test_whl_pacakage_error +:: call :test_inference || goto test_inference_error goto:success rem "Other configurations are added here" @@ -133,12 +179,14 @@ set start=%start:~4,10% echo cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_TOOLKIT_ROOT_DIR% ^ -DON_INFER=%ON_INFER% -DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^ --DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% +-DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% -DWITH_STATIC_LIB=%WITH_STATIC_LIB% ^ +-DTENSORRT_ROOT=%TENSORRT_ROOT% -DMSVC_STATIC_CRT=%MSVC_STATIC_CRT% cmake .. 
-G "Visual Studio 14 2015 Win64" -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_TOOLKIT_ROOT_DIR% ^ -DON_INFER=%ON_INFER% -DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^ --DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% +-DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% -DWITH_STATIC_LIB=%WITH_STATIC_LIB% ^ +-DTENSORRT_ROOT=%TENSORRT_ROOT% -DMSVC_STATIC_CRT=%MSVC_STATIC_CRT% goto:eof :cmake_error @@ -153,6 +201,7 @@ echo Step 2. Buile Paddle ... echo ======================================== call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" amd64 +for /F %%# in ('wmic cpu get NumberOfLogicalProcessors^|findstr [0-9]') do set /a PARALLEL_PROJECT_COUNT=%%#*8/10 set build_times=1 :build_tp echo Build third_party the %build_times% time: @@ -171,7 +220,7 @@ echo Build third_party successfully! set build_times=1 :build_paddle echo Build Paddle the %build_times% time: -msbuild /m /p:Configuration=Release /verbosity:minimal paddle.sln +msbuild /m:%PARALLEL_PROJECT_COUNT% /p:TrackFileAccess=false /p:CLToolExe=clcache.exe /p:CLToolPath=%PYTHON_ROOT%\Scripts /p:Configuration=Release /verbosity:minimal paddle.sln if %ERRORLEVEL% NEQ 0 ( set /a build_times=%build_times%+1 if %build_times% GTR 2 ( @@ -200,10 +249,10 @@ echo ======================================== for /F %%# in ('wmic os get localdatetime^|findstr 20') do set end=%%# set end=%end:~4,10% call :timestamp "%start%" "%end%" "Build" -tree /F %cd%\fluid_inference_install_dir\paddle -%cache_dir%\tools\busybox64.exe du -h -d 0 %cd%\fluid_inference_install_dir\paddle\lib > lib_size.txt +tree /F %cd%\paddle_inference_install_dir\paddle +%cache_dir%\tools\busybox64.exe du -h -d 0 %cd%\paddle_inference_install_dir\paddle\lib > lib_size.txt set /p libsize=< lib_size.txt -for /F %%i in ("%libsize%") do echo "Windows FLuid_Inference 
Size: %%i" +for /F %%i in ("%libsize%") do echo "Windows Paddle_Inference Size: %%i" %cache_dir%\tools\busybox64.exe du -h -d 0 %cd%\python\dist > whl_size.txt set /p whlsize=< whl_size.txt for /F %%i in ("%whlsize%") do echo "Windows PR whl Size: %%i" @@ -216,7 +265,7 @@ pip install -U %PADDLE_WHL_FILE_WIN% --user if %ERRORLEVEL% NEQ 0 ( call paddle_winci\Scripts\deactivate.bat 2>NUL echo pip install whl package failed! - exit /b 3 + exit /b 1 ) python %work_dir%\paddle\scripts\installation_validate.py @@ -225,7 +274,7 @@ goto:eof :test_whl_pacakage_error call paddle_winci\Scripts\deactivate.bat 2>NUL echo Test import paddle failed, will exit! -exit /b 3 +exit /b 1 rem --------------------------------------------------------------------------------------------- :unit_test @@ -242,12 +291,17 @@ dir %THIRD_PARTY_PATH:/=\%\install\mklml\lib dir %THIRD_PARTY_PATH:/=\%\install\mkldnn\bin dir %THIRD_PARTY_PATH:/=\%\install\warpctc\bin -set PATH=%THIRD_PARTY_PATH:/=\%\install\openblas\lib;%THIRD_PARTY_PATH:/=\%\install\openblas\bin;%THIRD_PARTY_PATH:/=\%\install\zlib\bin;%THIRD_PARTY_PATH:/=\%\install\mklml\lib;%THIRD_PARTY_PATH:/=\%\install\mkldnn\bin;%THIRD_PARTY_PATH:/=\%\install\warpctc\bin;%PATH% -ctest.exe --output-on-failure -C Release -j 8 +set PATH=%THIRD_PARTY_PATH:/=\%\install\openblas\lib;%THIRD_PARTY_PATH:/=\%\install\openblas\bin;^ +%THIRD_PARTY_PATH:/=\%\install\zlib\bin;%THIRD_PARTY_PATH:/=\%\install\mklml\lib;^ +%THIRD_PARTY_PATH:/=\%\install\mkldnn\bin;%THIRD_PARTY_PATH:/=\%\install\warpctc\bin;%PATH% +ctest.exe --output-on-failure -C Release -j 8 --repeat until-pass:4 after-timeout:4 goto:eof :unit_test_error call paddle_winci\Scripts\deactivate.bat 2>NUL +for /F %%# in ('wmic os get localdatetime^|findstr 20') do set end=%%# +set end=%end:~4,10% +call :timestamp "%start%" "%end%" "TestCases Total" echo Running unit tests failed, will exit! 
exit /b 8 @@ -262,13 +316,13 @@ set end=%end:~4,10% call :timestamp "%start%" "%end%" "TestCases Total" cd %work_dir%\paddle\fluid\inference\api\demo_ci -%cache_dir%\tools\busybox64.exe bash run.sh %work_dir:\=/% %WITH_MKL% %WITH_GPU% %cache_dir:\=/%/inference_demo +%cache_dir%\tools\busybox64.exe bash run.sh %work_dir:\=/% %WITH_MKL% %WITH_GPU% %cache_dir:\=/%/inference_demo %TENSORRT_ROOT%/include %TENSORRT_ROOT%/lib %MSVC_STATIC_CRT% goto:eof :test_inference_error call paddle_winci\Scripts\deactivate.bat 2>NUL echo Testing fluid library for inference failed! -exit /b 5 +exit /b 1 rem --------------------------------------------------------------------------------------------- :check_change_of_unittest @@ -399,9 +453,10 @@ taskkill /f /im git-remote-https.exe 2>NUL taskkill /f /im vctip.exe 2>NUL taskkill /f /im cvtres.exe 2>NUL taskkill /f /im rc.exe 2>NUL -taskkill /f /im op_function_generator.exe 2>NUL +wmic process where name="op_function_generator.exe" call terminate 2>NUL taskkill /f /im python.exe 2>NUL call paddle_winci\Scripts\deactivate.bat 2>NUL +del %PADDLE_WHL_FILE_WIN% taskkill /f /im python.exe 2>NUL echo Windows CI run successfully! 
exit /b 0 diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 6414b78172bbb3848ea8444fbef5a81fb990a4a1..69303013d2a41a049276c0d1b03b9d902b555d23 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -121,6 +121,18 @@ function cmake_base() { else exit 1 fi + elif [ "$1" == "cp38-cp38" ]; then + if [ -d "/Library/Frameworks/Python.framework/Versions/3.8" ]; then + export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.8/lib/ + export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.8/lib/ + export PATH=/Library/Frameworks/Python.framework/Versions/3.8/bin/:${PATH} + PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.8/bin/python3 + -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.8/include/python3.8/ + -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.8/lib/libpython3.8.dylib" + pip3.8 install --user -r ${PADDLE_ROOT}/python/requirements.txt + else + exit 1 + fi fi # delete `gym` to avoid modifying requirements.txt in *.whl sed -i .bak "/^gym$/d" ${PADDLE_ROOT}/python/requirements.txt @@ -128,18 +140,18 @@ function cmake_base() { if [ "$1" != "" ]; then echo "using python abi: $1" if [ "$1" == "cp27-cp27m" ]; then - export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs4/lib:} + export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs2/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.15-ucs4/lib:} export PATH=/opt/python/cp27-cp27m/bin/:${PATH} PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27m/bin/python -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27m/include/python2.7 - -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs2/lib/libpython2.7.so" + -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.15-ucs2/lib/libpython2.7.so" pip install -r ${PADDLE_ROOT}/python/requirements.txt elif [ "$1" == 
"cp27-cp27mu" ]; then - export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs2/lib:} + export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs4/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.15-ucs2/lib:} export PATH=/opt/python/cp27-cp27mu/bin/:${PATH} PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27mu/bin/python -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27mu/include/python2.7 - -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs4/lib/libpython2.7.so" + -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.15-ucs4/lib/libpython2.7.so" pip install -r ${PADDLE_ROOT}/python/requirements.txt elif [ "$1" == "cp27-cp27m-gcc82" ]; then export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs2/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.15-ucs4/lib:} @@ -176,6 +188,13 @@ function cmake_base() { -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.7.0/include/python3.7m -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.7.0/lib/libpython3.so" pip3.7 install -r ${PADDLE_ROOT}/python/requirements.txt + elif [ "$1" == "cp38-cp38" ]; then + export LD_LIBRARY_PATH=/opt/_internal/cpython-3.8.0/lib/:${LD_LIBRARY_PATH} + export PATH=/opt/_internal/cpython-3.8.0/bin/:${PATH} + export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.8.0/bin/python3.8 + -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.8.0/include/python3.8 + -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.8.0/lib/libpython3.so" + pip3.8 install -r ${PADDLE_ROOT}/python/requirements.txt fi else pip install -r ${PADDLE_ROOT}/python/requirements.txt @@ -273,7 +292,7 @@ function cmake_gen() { function abort(){ echo "Your change doesn't follow PaddlePaddle's code style." 1>&2 echo "Please use pre-commit to check what is wrong." 
1>&2 - exit 1 + exit 4 } function check_style() { @@ -296,14 +315,14 @@ function check_style() { commit_files=on for file_name in `git diff --numstat upstream/$BRANCH |awk '{print $NF}'`;do if ! pre-commit run --files $file_name ; then - git diff commit_files=off fi done if [ $commit_files == 'off' ];then echo "code format error" - exit 1 + git diff 2>&1 + exit 4 fi trap : 0 } @@ -343,12 +362,12 @@ function build_size() { Calculate /paddle/build size and PR whl size ============================================ EOF - if [ "$1" == "fluid_inference" ]; then + if [ "$1" == "paddle_inference" ]; then cd ${PADDLE_ROOT}/build - cp -r fluid_inference_install_dir fluid_inference - tar -czf fluid_inference.tgz fluid_inference - buildSize=$(du -h --max-depth=0 ${PADDLE_ROOT}/build/fluid_inference.tgz |awk '{print $1}') - echo "FLuid_Inference Size: $buildSize" + cp -r paddle_inference_install_dir paddle_inference + tar -czf paddle_inference.tgz paddle_inference + buildSize=$(du -h --max-depth=0 ${PADDLE_ROOT}/build/paddle_inference.tgz |awk '{print $1}') + echo "Paddle_Inference Size: $buildSize" else SYSTEM=`uname -s` if [ "$SYSTEM" == "Darwin" ]; then @@ -514,6 +533,8 @@ EOF pip3.6 uninstall -y paddlepaddle elif [ "$1" == "cp37-cp37m" ]; then pip3.7 uninstall -y paddlepaddle + elif [ "$1" == "cp38-cp38" ]; then + pip3.8 uninstall -y paddlepaddle fi set -ex @@ -527,9 +548,53 @@ EOF pip3.6 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl elif [ "$1" == "cp37-cp37m" ]; then pip3.7 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl + elif [ "$1" == "cp38-cp38" ]; then + pip3.8 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl fi + tmpfile_rand=`date +%s%N` + tmpfile=$tmp_dir/$tmpfile_rand + set +e ut_startTime_s=`date +%s` - ctest --output-on-failure -j $2;mactest_error=$? 
+ ctest --output-on-failure -j $2 | tee $tmpfile + failed_test_lists='' + collect_failed_tests + set +x + mactest_error=0 + retry_unittests_record='' + retry_time=3 + exec_times=0 + exec_time_array=('first' 'second' 'third') + if [ -n "$failed_test_lists" ];then + mactest_error=1 + while ( [ $exec_times -lt $retry_time ] && [ -n "${failed_test_lists}" ] ) + do + retry_unittests_record="$retry_unittests_record$failed_test_lists" + failed_test_lists_ult=`echo "${failed_test_lists}"` + read retry_unittests <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(" | sed 's/(//' | sed 's/- //' ) + echo "=========================================" + echo "This is the ${exec_time_array[$exec_times]} time to re-run" + echo "=========================================" + echo "The following unittest will be re-run:" + echo "${retry_unittests}" + echo "=========================================" + + retry_unittests_regular='' + for line in ${retry_unittests[@]} ; + do + if [[ "$retry_unittests_regular" == "" ]];then + retry_unittests_regular="^$line$" + else + retry_unittests_regular="$retry_unittests_regular|^$line$" + fi + done + rm -f $tmp_dir/* + failed_test_lists='' + ctest -R "($retry_unittests_regular)" --output-on-failure -j $2 | tee $tmpfile + collect_failed_tests + exec_times=$[$exec_times+1] + done + fi + #mactest_error=$? 
ut_endTime_s=`date +%s` echo "Mac testCase Time: $[ $ut_endTime_s - $ut_startTime_s ]s" paddle version @@ -537,7 +602,21 @@ EOF export http_proxy=$my_proxy export https_proxy=$my_proxy if [ "$mactest_error" != 0 ];then - exit 8; + if [[ "$failed_test_lists" == "" ]]; then + echo "========================================" + echo "There are failed tests, which have been successful after re-run:" + echo "========================================" + echo "The following tests have been re-ran:" + echo "${retry_unittests_record}" + else + failed_test_lists_ult=`echo "${failed_test_lists}"` + echo "========================================" + echo "Summary Failed Tests... " + echo "========================================" + echo "The following tests FAILED: " + echo "${failed_test_lists_ult}" + exit 8; + fi fi fi } @@ -561,9 +640,11 @@ function fetch_upstream_develop_if_not_exist() { function generate_upstream_develop_api_spec() { fetch_upstream_develop_if_not_exist cur_branch=`git branch | grep \* | cut -d ' ' -f2` + git checkout . 
git checkout -b develop_base_pr upstream/$BRANCH cmake_gen $1 build $2 + cp ${PADDLE_ROOT}/python/requirements.txt /tmp git checkout $cur_branch generate_api_spec "$1" "DEV" @@ -584,7 +665,12 @@ function generate_api_spec() { cd ${PADDLE_ROOT}/build/.check_api_workspace virtualenv .${spec_kind}_env source .${spec_kind}_env/bin/activate - pip install -r ${PADDLE_ROOT}/python/requirements.txt + + if [ "$spec_kind" == "DEV" ]; then + pip install -r /tmp/requirements.txt + else + pip install -r ${PADDLE_ROOT}/python/requirements.txt + fi pip --no-cache-dir install ${PADDLE_ROOT}/build/python/dist/*whl spec_path=${PADDLE_ROOT}/paddle/fluid/API_${spec_kind}.spec python ${PADDLE_ROOT}/tools/print_signatures.py paddle > $spec_path @@ -603,7 +689,7 @@ function generate_api_spec() { awk -F '(' '{print $NF}' $spec_path >${spec_path}.doc awk -F '(' '{$NF="";print $0}' $spec_path >${spec_path}.api - if [ "$1" == "cp35-cp35m" ] || [ "$1" == "cp36-cp36m" ] || [ "$1" == "cp37-cp37m" ]; then + if [ "$1" == "cp35-cp35m" ] || [ "$1" == "cp36-cp36m" ] || [ "$1" == "cp37-cp37m" ] || [ "$1" == "cp38-cp38" ]; then # Use sed to make python2 and python3 sepc keeps the same sed -i 's/arg0: str/arg0: unicode/g' $spec_path sed -i "s/\(.*Transpiler.*\).__init__ (ArgSpec(args=\['self'].*/\1.__init__ /g" $spec_path @@ -873,6 +959,10 @@ function parallel_test_base_gpu() { EOF set +x + precison_cases="" + if [ ${PRECISION_TEST:-OFF} == "ON" ]; then + precision_cases=`python $PADDLE_ROOT/tools/get_pr_ut.py` + fi EXIT_CODE=0; test_cases=$(ctest -N -V) # get all test cases exclusive_tests='' # cases list which would be run exclusively @@ -902,10 +992,23 @@ set +x echo $testcase" will only run at night." 
continue fi + if [ ${PRECISION_TEST:-OFF} == "ON" ] && [[ "$precision_cases" != "" ]]; then + will_test="false" + for case in $precision_cases; do + if [[ $testcase == $case ]]; then + will_test="true" + break + fi + done + if [[ $will_test == "false" ]]; then + echo $testcase" won't run in PRECISION_TEST mode." + continue + fi + fi if [[ "$is_multicard" == "" ]]; then # trick: treat all test case with prefix "test_dist" as dist case, and would run on 2 GPUs - read is_multicard <<< $(echo "$testcase"|grep -oEi "test_dist") + read is_multicard <<< $(echo "$testcase"|grep -oEi "test_dist_") fi if [[ "$is_exclusive" != "" ]]; then @@ -1020,8 +1123,6 @@ set +x done fi - - if [[ "$EXIT_CODE" != "0" ]]; then if [[ "$failed_test_lists" == "" ]]; then echo "========================================" @@ -1166,21 +1267,25 @@ EOF ref_paddle35=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp35-cp35m-linux_x86_64.whl ref_paddle36=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp36-cp36m-linux_x86_64.whl ref_paddle37=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp37-cp37m-linux_x86_64.whl + ref_paddle38=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp38-cp38-linux_x86_64.whl ref_paddle2_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp27-cp27mu-linux_x86_64.whl ref_paddle35_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp35-cp35m-linux_x86_64.whl ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp36-cp36m-linux_x86_64.whl ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp37-cp37m-linux_x86_64.whl + ref_paddle38_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp38-cp38-linux_x86_64.whl if [[ ${PADDLE_BRANCH} != "0.0.0" && ${WITH_MKL} == "ON" && ${WITH_GPU} == "ON" ]]; then ref_paddle2=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp27-cp27mu-linux_x86_64.whl ref_paddle35=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp35-cp35m-linux_x86_64.whl 
ref_paddle36=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp36-cp36m-linux_x86_64.whl ref_paddle37=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp37-cp37m-linux_x86_64.whl + ref_paddle38=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp38-cp38-linux_x86_64.whl ref_paddle2_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp27-cp27mu-linux_x86_64.whl ref_paddle35_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp35-cp35m-linux_x86_64.whl ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp36-cp36m-linux_x86_64.whl ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp37-cp37m-linux_x86_64.whl + ref_paddle38_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp38-cp38-linux_x86_64.whl fi #ref_paddle2_mv1="" @@ -1285,6 +1390,22 @@ EOF apt-get clean -y && \ rm -f ${ref_paddle37} && \ ldconfig +EOF + cat >> ${PADDLE_ROOT}/build/Dockerfile < /dev/null && \ + make -j8 > /dev/null && make altinstall > /dev/null && cd ../ && rm Python-3.8.0.tgz + RUN apt-get install -y libgtk2.0-dev dmidecode python3-tk && ldconfig && \ + pip3.8 install opencv-python && wget ${ref_web}/${ref_paddle38} && pip3.8 install ${ref_paddle38_whl}; apt-get install -f -y && \ + apt-get clean -y && \ + rm -f ${ref_paddle38} && \ + ldconfig EOF cat >> ${PADDLE_ROOT}/build/Dockerfile <&2 exit 5 fi } @@ -1399,15 +1520,25 @@ function summary_check_problems() { set +x local check_style_code=$1 local example_code=$2 + local check_style_info=$3 + local example_info=$4 if [ $check_style_code -ne 0 -o $example_code -ne 0 ];then echo "========================================" echo "summary problems:" + if [ $check_style_code -ne 0 -a $example_code -ne 0 ];then + echo "There are 2 errors: Code format error and Example 
code error." + else + [ $check_style_code -ne 0 ] && echo "There is 1 error: Code format error." + [ $example_code -ne 0 ] && echo "There is 1 error: Example code error." + fi echo "========================================" if [ $check_style_code -ne 0 ];then - echo "- Check code style failed! Please check the log and fix problems." + echo "*****Code format error***** Please fix it according to the diff information:" + echo "$check_style_info" | grep "code format error" -A $(echo "$check_style_info" | wc -l) fi if [ $example_code -ne 0 ];then - echo "- Check example code failed! Please check the log and fix problems." + echo "*****Example code error***** Please fix the error listed in the information:" + echo "$example_info" | grep "API check -- Example Code" -A $(echo "$example_info" | wc -l) fi [ $check_style_code -ne 0 ] && exit $check_style_code [ $example_code -ne 0 ] && exit $example_code @@ -1421,21 +1552,24 @@ function main() { init if [ "$CMD" != "assert_file_approvals" ];then python ${PADDLE_ROOT}/tools/summary_env.py + bash ${PADDLE_ROOT}/tools/get_cpu_info.sh fi case $CMD in build_only) cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} ;; build_and_check) - $(check_style >&2) + set +e + check_style_info=$(check_style) check_style_code=$? generate_upstream_develop_api_spec ${PYTHON_ABI:-""} ${parallel_number} cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} check_sequence_op_unittest generate_api_spec ${PYTHON_ABI:-""} "PR" - $(example >&2) + set +e + example_info=$(example) example_code=$? 
- summary_check_problems $check_style_code $example_code + summary_check_problems $check_style_code $example_code "$check_style_info" "$example_info" assert_api_spec_approvals ;; build) diff --git a/paddle/scripts/windows_build/build.bat b/paddle/scripts/windows_build/build.bat index 65d44877d12554c73f7d93dafb9cecb9fb55e60a..6f99c23ccd262f3cf15b1cac6b1c56a9cc2c79d8 100644 --- a/paddle/scripts/windows_build/build.bat +++ b/paddle/scripts/windows_build/build.bat @@ -118,8 +118,8 @@ call:Build echo PACKAGE INFERENCE LIBRARY mkdir inference_dist -%PYTHON_DIR%\python.exe -c "import shutil;shutil.make_archive('inference_dist/fluid_inference_install_dir', 'zip', root_dir='fluid_inference_install_dir')" -%PYTHON_DIR%\python.exe -c "import shutil;shutil.make_archive('inference_dist/fluid_install_dir', 'zip', root_dir='fluid_install_dir')" +%PYTHON_DIR%\python.exe -c "import shutil;shutil.make_archive('inference_dist/paddle_inference_install_dir', 'zip', root_dir='paddle_inference_install_dir')" +%PYTHON_DIR%\python.exe -c "import shutil;shutil.make_archive('inference_dist/paddle_install_dir', 'zip', root_dir='paddle_install_dir')" echo BUILD INFERENCE LIBRARY COMPLETE goto :END diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 5f1ccf3f858287066e36abf9412ba1114c526e61..e749cf88b6a49846b678c1c4258d2b3c2a8c01a4 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -49,6 +49,7 @@ import paddle.optimizer import paddle.metric import paddle.device import paddle.incubate.complex as complex +import paddle.regularizer # TODO: define alias in tensor and framework directory @@ -75,6 +76,8 @@ from .tensor.creation import full_like #DEFINE_ALIAS from .tensor.creation import triu #DEFINE_ALIAS from .tensor.creation import tril #DEFINE_ALIAS from .tensor.creation import meshgrid #DEFINE_ALIAS +from .tensor.creation import empty #DEFINE_ALIAS +from .tensor.creation import empty_like #DEFINE_ALIAS from .tensor.linalg import matmul #DEFINE_ALIAS from 
.tensor.linalg import dot #DEFINE_ALIAS # from .tensor.linalg import einsum #DEFINE_ALIAS @@ -87,6 +90,7 @@ from .tensor.linalg import cholesky #DEFINE_ALIAS # from .tensor.linalg import tensordot #DEFINE_ALIAS from .tensor.linalg import bmm #DEFINE_ALIAS from .tensor.linalg import histogram #DEFINE_ALIAS +from .tensor.linalg import mv #DEFINE_ALIAS from .tensor.logic import equal #DEFINE_ALIAS from .tensor.logic import greater_equal #DEFINE_ALIAS from .tensor.logic import greater_than #DEFINE_ALIAS @@ -200,7 +204,6 @@ from .tensor.math import prod #DEFINE_ALIAS from .tensor.random import standard_normal from .tensor.random import normal from .tensor.random import uniform #DEFINE_ALIAS -from .tensor.random import shuffle #DEFINE_ALIAS from .tensor.random import randn #DEFINE_ALIAS from .tensor.random import rand #DEFINE_ALIAS from .tensor.random import randint #DEFINE_ALIAS @@ -217,6 +220,8 @@ from .tensor.search import index_select #DEFINE_ALIAS from .tensor.search import nonzero #DEFINE_ALIAS from .tensor.search import sort #DEFINE_ALIAS from .framework.random import manual_seed #DEFINE_ALIAS +from .framework.random import get_cuda_rng_state #DEFINE_ALIAS +from .framework.random import set_cuda_rng_state #DEFINE_ALIAS from .framework import Variable #DEFINE_ALIAS from .framework import ParamAttr #DEFINE_ALIAS from .framework import create_global_var #DEFINE_ALIAS @@ -230,6 +235,7 @@ from .framework import grad #DEFINE_ALIAS from .framework import no_grad #DEFINE_ALIAS from .framework import save #DEFINE_ALIAS from .framework import load #DEFINE_ALIAS +from .framework import SaveLoadConfig #DEFINE_ALIAS from .framework import DataParallel #DEFINE_ALIAS from .framework import NoamDecay #DEFINE_ALIAS @@ -267,5 +273,8 @@ from . 
import static # high-level api from .hapi import Model from .hapi import callbacks +from .hapi import summary import paddle.text import paddle.vision + +disable_static() diff --git a/python/paddle/dataset/flowers.py b/python/paddle/dataset/flowers.py index 969ad3c922f9c15b2e39f71ae4359cd3d2fcdcce..bb60c58211c237c56bc89741e5d3cde11aa68e81 100644 --- a/python/paddle/dataset/flowers.py +++ b/python/paddle/dataset/flowers.py @@ -37,7 +37,7 @@ from .common import download import tarfile import scipy.io as scio from paddle.dataset.image import * -from paddle.reader import * +from paddle.reader import map_readers, xmap_readers from paddle import compat as cpt import os import numpy as np diff --git a/python/paddle/dataset/uci_housing.py b/python/paddle/dataset/uci_housing.py index 5bc9c1444d2b34f057cd92782eb50e5fc23916eb..f7930d34f93e21bf3f832da828fb0036742b5091 100644 --- a/python/paddle/dataset/uci_housing.py +++ b/python/paddle/dataset/uci_housing.py @@ -74,7 +74,8 @@ def load_data(filename, feature_num=14, ratio=0.8): data = data.reshape(data.shape[0] // feature_num, feature_num) maximums, minimums, avgs = data.max(axis=0), data.min(axis=0), data.sum( axis=0) / data.shape[0] - feature_range(maximums[:-1], minimums[:-1]) + # if you want to print the distribution of input data, you could use function of feature_range + #feature_range(maximums[:-1], minimums[:-1]) for i in six.moves.range(feature_num - 1): data[:, i] = (data[:, i] - avgs[i]) / (maximums[i] - minimums[i]) offset = int(data.shape[0] * ratio) diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py index b7357eef7ad9a3abae7f9c1c09fdc00b409061ad..27c82227316309b370aefe5e0550230c3f703c8c 100644 --- a/python/paddle/distributed/__init__.py +++ b/python/paddle/distributed/__init__.py @@ -21,6 +21,7 @@ from .parallel import get_rank from .parallel import get_world_size from paddle.fluid.dygraph.parallel import prepare_context #DEFINE_ALIAS from paddle.fluid.dygraph.parallel 
import ParallelEnv #DEFINE_ALIAS +from paddle.distributed.fleet.dataset import * from . import collective from .collective import * @@ -30,11 +31,8 @@ __all__ = ["spawn"] # dygraph parallel apis __all__ += [ - "init_parallel_env", - "get_rank", - "get_world_size", - "prepare_context", - "ParallelEnv", + "init_parallel_env", "get_rank", "get_world_size", "prepare_context", + "ParallelEnv", "InMemoryDataset", "QueueDataset" ] # collective apis diff --git a/python/paddle/distributed/cloud_utils.py b/python/paddle/distributed/cloud_utils.py index 345b783d60bb79e99c98c4e9d212aa11cbe91dcc..5b7268e4b64fe34e6376819a7ac5659d1a5f5959 100644 --- a/python/paddle/distributed/cloud_utils.py +++ b/python/paddle/distributed/cloud_utils.py @@ -19,7 +19,7 @@ from paddle.distributed.utils import get_cluster, logger def get_cloud_cluster(args_node_ips, args_node_ip, args_port, selected_gpus): """ - args_node_ips, args_node_ip:string + args_node_ips:string, args_node_ip:string, args_port: int, selected_gpus:list """ #you can automatically get ip info while using paddlecloud multi nodes mode. 
node_ips = os.getenv("PADDLE_TRAINERS") @@ -31,6 +31,9 @@ def get_cloud_cluster(args_node_ips, args_node_ip, args_port, selected_gpus): node_rank = os.getenv("PADDLE_TRAINER_ID") assert node_rank is not None, "PADDLE_TRAINER_ID should not be None" + paddle_ports_num = int(os.getenv("TRAINER_PORTS_NUM")) + assert paddle_ports_num is not None, "TRAINER_PORTS_NUM should not be None" + node_ips = node_ips.split(",") num_nodes = len(node_ips) node_rank = int(node_rank) @@ -47,32 +50,47 @@ automatically got from PADDLE_TRAINERS(multi nodes) or POD_IP(single node).\ Your input cluster_node_ips: {} doesn't equals to IPs: {} from \ paddlecloud environment.".format(args_node_ips, node_ips)) - started_port = args_port - print("num_nodes:", num_nodes) - if num_nodes > 1: - try: - paddle_port = int(os.getenv("PADDLE_PORT", "")) - paddle_port_num = int(os.getenv("TRAINER_PORTS_NUM", "")) - - if paddle_port_num >= len( - selected_gpus) and paddle_port != args_port: - logger.warning("Use Cloud specified port:{}.".format( - paddle_port)) - started_port = paddle_port - - except Exception as e: - print(e) - pass - - if started_port is None: - started_port = 6170 - - logger.debug("parsed from args:node_ips:{} \ - node_ip:{} node_rank:{} started_port:{}" - .format(node_ips, node_ip, node_rank, started_port)) - - ports = [x for x in range(started_port, started_port + len(selected_gpus))] - cluster, pod = get_cluster(node_ips, node_ip, ports, selected_gpus) + # DISTRIBUTED_TRAINER_ENDPOINTS: new environment since paddlecloud 1.8.4 + # e.g: DISTRIBUTED_TRAINER_ENDPOINTS="ip1:port1,ip1:port2,ip1:port3,ip1:port4,ip2:port5,ip2:port6,ip2:port7,ip2:port8" + trainer_endpoints = os.getenv("DISTRIBUTED_TRAINER_ENDPOINTS") + if trainer_endpoints is None: + started_port = args_port + if num_nodes > 1: + try: + paddle_port = int(os.getenv("PADDLE_PORT", "")) + + if paddle_ports_num >= len( + selected_gpus) and paddle_port != args_port: + logger.warning("Use Cloud specified port:{}.".format( + 
paddle_port)) + started_port = paddle_port + + except Exception as e: + print(e) + pass + + if started_port is None: + started_port = 6170 + ports = [ + x for x in range(started_port, started_port + len(selected_gpus)) + ] + trainer_endpoints = [] + for ip in node_ips: + trainer_endpoints.append(["%s:%d" % (ip, port) for port in ports]) + else: + trainer_endpoints_ori = trainer_endpoints.split(",") + trainer_endpoints = [] + assert num_nodes * paddle_ports_num == len(trainer_endpoints_ori) + for i in range(num_nodes): + trainer_endpoints.append(trainer_endpoints_ori[ + i * paddle_ports_num:(i + 1) * paddle_ports_num]) + + logger.debug("parsed from args: node_ips:{} \ + node_ip:{} node_rank:{} trainer_endpoints:{}" + .format(node_ips, node_ip, node_rank, trainer_endpoints)) + + cluster, pod = get_cluster(node_ips, node_ip, trainer_endpoints, + selected_gpus) return cluster, cluster.pods[node_rank] diff --git a/python/paddle/distributed/fleet/__init__.py b/python/paddle/distributed/fleet/__init__.py index 42ac68ba1a64de54f029878ceab08435c924d087..f3ee09a6d9ec1b22171253a920b26bbf98afd36e 100644 --- a/python/paddle/distributed/fleet/__init__.py +++ b/python/paddle/distributed/fleet/__init__.py @@ -23,7 +23,6 @@ from .dataset import * __all__ = [ "DistributedStrategy", "UtilBase", - "DatasetFactory", "UserDefinedRoleMaker", "PaddleCloudRoleMaker", "Fleet", @@ -40,6 +39,7 @@ server_num = fleet.server_num server_index = fleet.server_index server_endpoints = fleet.server_endpoints is_server = fleet.is_server +set_util = fleet.set_util util = fleet.util barrier_worker = fleet.barrier_worker init_worker = fleet.init_worker @@ -50,3 +50,10 @@ distributed_optimizer = fleet.distributed_optimizer save_inference_model = fleet.save_inference_model save_persistables = fleet.save_persistables minimize = fleet.minimize +distributed_model = fleet.distributed_model +step = fleet.step +clear_grad = fleet.clear_grad +set_lr = fleet.set_lr +get_lr = fleet.get_lr +state_dict = 
fleet.state_dict +set_state_dict = fleet.set_state_dict diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py index 26063d1b8a9225aff63628bb37f433ec95257dc7..f1c836468daf36db753c67a3e09757be728d37a7 100755 --- a/python/paddle/distributed/fleet/base/distributed_strategy.py +++ b/python/paddle/distributed/fleet/base/distributed_strategy.py @@ -15,10 +15,26 @@ import paddle from paddle.distributed.fleet.proto import distributed_strategy_pb2 from paddle.fluid.framework import Variable, set_flags, core +from paddle.fluid.wrapped_decorator import wrap_decorator import google.protobuf.text_format +import google.protobuf __all__ = ["DistributedStrategy"] +non_auto_func_called = True + + +def __non_auto_func_called__(func): + def __impl__(*args, **kwargs): + global non_auto_func_called + non_auto_func_called = False + return func(*args, **kwargs) + + return __impl__ + + +is_strict_auto = wrap_decorator(__non_auto_func_called__) + def get_msg_dict(msg): res_dict = {} @@ -118,7 +134,7 @@ class DistributedStrategy(object): strategy = fleet.DistributedStrategy() strategy.dgc = True strategy.recompute = True - strategy.recompute_configs = {"checkpoint": ["x"]} + strategy.recompute_configs = {"checkpoints": ["x"]} strategy.save_to_prototxt("dist_strategy.prototxt") """ with open(output, "w") as fout: @@ -133,7 +149,7 @@ class DistributedStrategy(object): import paddle.distributed.fleet as fleet strategy = fleet.DistributedStrategy() - strategy.load_from_prototxt("dist_strategy.protoxt") + strategy.load_from_prototxt("dist_strategy.prototxt") """ with open(pb_file, 'r') as f: self.strategy = google.protobuf.text_format.Merge( @@ -147,6 +163,7 @@ class DistributedStrategy(object): Examples: .. 
code-block:: python + import paddle exe_strategy = paddle.fluid.ExecutionStrategy() exe_strategy.num_threads = 10 exe_strategy.num_iteration_per_drop_scope = 10 @@ -163,6 +180,7 @@ class DistributedStrategy(object): return execution_strategy @execution_strategy.setter + @is_strict_auto def execution_strategy(self, strategy): fields = self.strategy.execution_strategy.DESCRIPTOR.fields for f in fields: @@ -179,6 +197,7 @@ class DistributedStrategy(object): Examples: .. code-block:: python + import paddle build_strategy = paddle.fluid.BuildStrategy() build_strategy.enable_sequential_execution = True build_strategy.fuse_elewise_add_act_ops = True @@ -201,6 +220,7 @@ class DistributedStrategy(object): return build_strategy @build_strategy.setter + @is_strict_auto def build_strategy(self, strategy): fields = self.strategy.build_strategy.DESCRIPTOR.fields for f in fields: @@ -235,6 +255,7 @@ class DistributedStrategy(object): return self.strategy.a_sync @a_sync.setter + @is_strict_auto def a_sync(self, flag): if isinstance(flag, bool): self.strategy.a_sync = flag @@ -252,14 +273,19 @@ class DistributedStrategy(object): a dict. 
**Notes**: - **Detailed arguments for a_sync_configs** - **k_step**: number of local optimization updates before communication - **max_merge_var_num**: maximum number of merged gradients before communication - **send_queue_size**: a buffer size of worker communication - **independent_recv_thread**: if we are using independent recv thread for communication - **thread_pool_size**: number of thread pool - **send_wait_times**: waiting time for sending gradients - **runtime_split_send_recv**: if we are using Tensor split for send and recv during runtime + k_step(int): number of local optimization updates before communication + + max_merge_var_num(int): maximum number of merged gradients before communication + + send_queue_size(int): a buffer size of worker communication + + independent_recv_thread(bool): if we are using independent recv thread for communication + + thread_pool_size(int): number of thread pool + + send_wait_times(int): waiting time for sending gradients + + runtime_split_send_recv(bool): if we are using Tensor split for send and recv during runtime Examples: .. code-block:: python @@ -270,15 +296,17 @@ class DistributedStrategy(object): strategy = fleet.DistributedStrategy() strategy.a_sync = True # by default this is True - configs = {"k_step": 10000, "send_queue_size": 32} + configs = {"k_steps": 1024, "send_queue_size": 32} strategy.a_sync_configs = configs # code block for defining loss and local optimizer # sgd = fleet.distributed_optimizer(optimizer, strategy) + """ return get_msg_dict(self.strategy.a_sync_configs) @a_sync_configs.setter + @is_strict_auto def a_sync_configs(self, configs): check_configs_key(self.strategy.a_sync_configs, configs, "a_sync_configs") @@ -301,6 +329,7 @@ class DistributedStrategy(object): return self.strategy.amp @amp.setter + @is_strict_auto def amp(self, flag): if isinstance(flag, bool): self.strategy.amp = flag @@ -314,14 +343,21 @@ class DistributedStrategy(object): settings that can be configured through a dict. 
**Notes**: - **init_loss_scaling(float)**: The initial loss scaling factor. Default 32768. - **use_dynamic_loss_scaling(bool)**: Whether to use dynamic loss scaling. Default True. - **incr_every_n_steps(int)**: Increases loss scaling every n consecutive steps with finite gradients. Default 1000. - **decr_every_n_nan_or_inf(int)**: Decreases loss scaling every n accumulated steps with nan or inf gradients. Default 2. - **incr_ratio(float)**: The multiplier to use when increasing the loss scaling. Default 2.0. - **decr_ratio(float)**: The less-than-one-multiplier to use when decreasing the loss scaling. Default 0.5. - **custom_white_list(list[str])**: Users' custom white list which always execution fp16. - **custom_black_list(list[str])**: Users' custom black list which forbidden execution fp16. + init_loss_scaling(float): The initial loss scaling factor. Default 32768. + + use_dynamic_loss_scaling(bool): Whether to use dynamic loss scaling. Default True. + + incr_every_n_steps(int): Increases loss scaling every n consecutive steps with finite gradients. Default 1000. + + decr_every_n_nan_or_inf(int): Decreases loss scaling every n accumulated steps with nan or inf gradients. Default 2. + + incr_ratio(float): The multiplier to use when increasing the loss scaling. Default 2.0. + + decr_ratio(float): The less-than-one-multiplier to use when decreasing the loss scaling. Default 0.5. + + custom_white_list(list[str]): Users' custom white list which always execution fp16. + + custom_black_list(list[str]): Users' custom black list which forbidden execution fp16. Examples: .. 
code-block:: python @@ -336,6 +372,7 @@ class DistributedStrategy(object): return get_msg_dict(self.strategy.amp_configs) @amp_configs.setter + @is_strict_auto def amp_configs(self, configs): check_configs_key(self.strategy.amp_configs, configs, "amp_configs") assign_configs_value(self.strategy.amp_configs, configs) @@ -373,6 +410,7 @@ class DistributedStrategy(object): return self.strategy.sync_nccl_allreduce @sync_nccl_allreduce.setter + @is_strict_auto def sync_nccl_allreduce(self, flag): if isinstance(flag, bool): self.strategy.sync_nccl_allreduce = flag @@ -396,6 +434,7 @@ class DistributedStrategy(object): return self.strategy.use_hierarchical_allreduce @use_hierarchical_allreduce.setter + @is_strict_auto def use_hierarchical_allreduce(self, flag): if isinstance(flag, bool): self.strategy.use_hierarchical_allreduce = flag @@ -420,6 +459,7 @@ class DistributedStrategy(object): return self.strategy.hierarchical_allreduce_inter_nranks @hierarchical_allreduce_inter_nranks.setter + @is_strict_auto def hierarchical_allreduce_inter_nranks(self, value): if isinstance(value, int): self.strategy.hierarchical_allreduce_inter_nranks = value @@ -446,6 +486,7 @@ class DistributedStrategy(object): return self.strategy.sync_batch_norm @sync_batch_norm.setter + @is_strict_auto def sync_batch_norm(self, flag): if isinstance(flag, bool): self.strategy.sync_batch_norm = flag @@ -468,6 +509,7 @@ class DistributedStrategy(object): return self.strategy.fuse_all_reduce_ops @fuse_all_reduce_ops.setter + @is_strict_auto def fuse_all_reduce_ops(self, flag): if isinstance(flag, bool): self.strategy.fuse_all_reduce_ops = flag @@ -491,6 +533,7 @@ class DistributedStrategy(object): return self.strategy.fuse_grad_size_in_MB @fuse_grad_size_in_MB.setter + @is_strict_auto def fuse_grad_size_in_MB(self, value): if isinstance(value, int): self.strategy.fuse_grad_size_in_MB = value @@ -502,6 +545,7 @@ class DistributedStrategy(object): return self.strategy.fuse_grad_size_in_TFLOPS 
@_fuse_grad_size_in_TFLOPS.setter + @is_strict_auto def _fuse_grad_size_in_TFLOPS(self, value): if isinstance(value, float): self.strategy.fuse_grad_size_in_TFLOPS = value @@ -528,6 +572,7 @@ class DistributedStrategy(object): return self.strategy.nccl_comm_num @nccl_comm_num.setter + @is_strict_auto def nccl_comm_num(self, value): if isinstance(value, int): self.strategy.nccl_comm_num = value @@ -535,6 +580,7 @@ class DistributedStrategy(object): print("WARNING: nccl_comm_num should have value of int type") @recompute.setter + @is_strict_auto def recompute(self, flag): if isinstance(flag, bool): self.strategy.recompute = flag @@ -553,12 +599,13 @@ class DistributedStrategy(object): import paddle.distributed.fleet as fleet strategy = fleet.DistributedStrategy() strategy.recompute = True - strategy.recompute_configs = {"checkpionts": ["x", "y"]} + strategy.recompute_configs = {"checkpoints": ["x", "y"]} """ return get_msg_dict(self.strategy.recompute_configs) @recompute_configs.setter + @is_strict_auto def recompute_configs(self, configs): check_configs_key(self.strategy.recompute_configs, configs, "checkpoint_configs") @@ -583,6 +630,7 @@ class DistributedStrategy(object): return self.strategy.pipeline @pipeline.setter + @is_strict_auto def pipeline(self, flag): if isinstance(flag, bool): self.strategy.pipeline = flag @@ -603,6 +651,7 @@ class DistributedStrategy(object): **Notes**: **Detailed arguments for pipeline_configs** + **micro_batch**: the number of small batches in each user defined batch Examples: @@ -618,6 +667,7 @@ class DistributedStrategy(object): return get_msg_dict(self.strategy.pipeline_configs) @pipeline_configs.setter + @is_strict_auto def pipeline_configs(self, configs): check_configs_key(self.strategy.pipeline_configs, configs, "pipeline_configs") @@ -626,10 +676,10 @@ class DistributedStrategy(object): @property def localsgd(self): """ - Indicating whether we are using Local SGD training. 
For more details, please refer to - [Don't Use Large Mini-Batches, Use Local SGD](https://arxiv.org/pdf/1808.07217.pdf), + Indicating whether we are using Local SGD training. Default Value: False + For more details, please refer to + `Don't Use Large Mini-Batches, Use Local SGD `_. - Default Value: False Examples: .. code-block:: python @@ -642,6 +692,7 @@ class DistributedStrategy(object): return self.strategy.localsgd @localsgd.setter + @is_strict_auto def localsgd(self, flag): if isinstance(flag, bool): self.strategy.localsgd = flag @@ -655,13 +706,8 @@ class DistributedStrategy(object): setting that can be configured through a dict. **Notes**: - **k_steps(int)**: The local steps for training before parameter - synchronization. Default 1. If strategy.auto is set True, the - local steps will be calculated automatically during training. - The algorithm is referenced in this paper: - [Adaptive Communication Strategies to Achieve the Best Error-Runtime Trade-off in Local-Update SGD](https://arxiv.org/pdf/1810.08313.pdf). - In this case, k_steps indicates the first local steps which - is suggested setting to 1. + k_steps(int) The local steps for training before parameter synchronization. Default 1. + begin_step(int) The step of begining training by localsgd. Default 1. Examples: .. code-block:: python @@ -669,17 +715,76 @@ class DistributedStrategy(object): import paddle.distributed.fleet as fleet strategy = fleet.DistributedStrategy() strategy.localsgd = True - strategy.localsgd_configs = {"k_steps": 4} + strategy.localsgd_configs = {"k_steps": 4, + "begin_step": 30} """ return get_msg_dict(self.strategy.localsgd_configs) @localsgd_configs.setter + @is_strict_auto def localsgd_configs(self, configs): check_configs_key(self.strategy.localsgd_configs, configs, "localsgd_configs") assign_configs_value(self.strategy.localsgd_configs, configs) + @property + def adaptive_localsgd(self): + """ + Indicating whether we are using Adaptive Local SGD training. 
Default Value: False + For more details, please refer to `Adaptive Communication Strategies to Achieve + the Best Error-Runtime Trade-off in Local-Update SGD `_. + + + Examples: + .. code-block:: python + + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.adaptive_localsgd = True # by default this is false + + """ + return self.strategy.localsgd + + @adaptive_localsgd.setter + @is_strict_auto + def adaptive_localsgd(self, flag): + if isinstance(flag, bool): + self.strategy.localsgd = flag + else: + print("WARNING: adaptive_localsgd should have value of bool type") + + @property + def adaptive_localsgd_configs(self): + """ + Set AdaptiveLocalSGD training configurations. AdaptiveLocalSGD has a configurable + setting that can be configured through a dict. + + **Notes**: + init_k_steps(int) The initial steps for training before adaptive localsgd. + Then, the adaptive localsgd method will modify init_k_steps automatically. + Default 1. + begin_step(int) The step of beginning training by adaptive localsgd. Default 1. + + Examples: + .. code-block:: python + + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.adaptive_localsgd = True + strategy.adaptive_localsgd_configs = {"init_k_steps": 1, + "begin_step": 30} + """ + + return get_msg_dict(self.strategy.adaptive_localsgd_configs) + + @adaptive_localsgd_configs.setter + @is_strict_auto + def adaptive_localsgd_configs(self, configs): + check_configs_key(self.strategy.adaptive_localsgd_configs, configs, + "adaptive_localsgd_configs") + assign_configs_value(self.strategy.adaptive_localsgd_configs, configs) + @property def dgc(self): """ @@ -699,6 +804,7 @@ class DistributedStrategy(object): return self.strategy.dgc @dgc.setter + @is_strict_auto def dgc(self, flag): if isinstance(flag, bool): self.strategy.dgc = flag @@ -712,14 +818,16 @@ class DistributedStrategy(object): settings that can be configured through a dict.
**Notes**: - **rampup_begin_step(int)**: The beginning step from which gradient compression is implemented. Default 0. - **rampup_step(int)**: Time steps used in sparsity warm-up periods. Default is 1. - For example, if the sparsity is [0.75, 0.9375, 0.984375, 0.996, 0.999], and the rampup_step is 100, - it will use 0.75 at 0~19 steps, and 0.9375 at 20~39 steps, and so on. And when reach sparsity array - ends, it will use 0.999 then and after. - **sparsity(list[float])**: Get top important element from gradient tensor, the ratio is (1 - sparsity). - Default is [0.999]. For example, if the sparsity is [0.99, 0.999], the top [1%, 0.1%] important - element will be transmitted. + rampup_begin_step(int): The beginning step from which gradient compression is implemented. Default 0. + + rampup_step(int): Time steps used in sparsity warm-up periods. Default is 1. \ + For example, if the sparsity is [0.75, 0.9375, 0.984375, 0.996, 0.999], and the rampup_step is 100, \ + it will use 0.75 at 0~19 steps, and 0.9375 at 20~39 steps, and so on. And when reach sparsity array \ + ends, it will use 0.999 then and after. + + sparsity(list[float]): Get top important element from gradient tensor, the ratio is (1 - sparsity). \ + Default is [0.999]. For example, if the sparsity is [0.99, 0.999], the top [1%, 0.1%] important \ + element will be transmitted. Examples: .. code-block:: python @@ -732,6 +840,7 @@ class DistributedStrategy(object): return get_msg_dict(self.strategy.dgc_configs) @dgc_configs.setter + @is_strict_auto def dgc_configs(self, configs): check_configs_key(self.strategy.dgc_configs, configs, "dgc_configs") assign_configs_value(self.strategy.dgc_configs, configs) @@ -749,7 +858,8 @@ class DistributedStrategy(object): to model parameters. Examples: - .. code-block:: python + .. 
code-block:: python + import paddle.distributed.fleet as fleet strategy = fleet.DistributedStrategy() strategy.gradient_merge = True @@ -758,6 +868,7 @@ class DistributedStrategy(object): return self.strategy.gradient_merge @gradient_merge.setter + @is_strict_auto def gradient_merge(self, flag): if isinstance(flag, bool): self.strategy.gradient_merge = flag @@ -768,11 +879,15 @@ class DistributedStrategy(object): def gradient_merge_configs(self): """ the key-value configs of distribute_strategy - Keys: - k_steps (int): the update period of the parameters - avg (bool): whether to average the gradients of each mini-batch, - the default value is `True` - Example: + + **Note**: + k_steps(int): the update period of the parameters. + + avg(bool): whether to average the gradients of each mini-batch, the default value is `True` + + Examples: + .. code-block:: python + import paddle.distributed.fleet as fleet strategy = fleet.DistributedStrategy() strategy.gradient_merge = True @@ -781,6 +896,7 @@ class DistributedStrategy(object): return get_msg_dict(self.strategy.gradient_merge_configs) @gradient_merge_configs.setter + @is_strict_auto def gradient_merge_configs(self, configs): check_configs_key(self.strategy.gradient_merge_configs, configs, "gradient_configs") @@ -805,6 +921,7 @@ class DistributedStrategy(object): return self.strategy.lars @lars.setter + @is_strict_auto def lars(self, flag): if isinstance(flag, bool): self.strategy.lars = flag @@ -826,6 +943,7 @@ class DistributedStrategy(object): Examples: .. 
code-block:: python + import paddle.distributed.fleet as fleet strategy = fleet.DistributedStrategy() strategy.lars = True @@ -839,6 +957,7 @@ class DistributedStrategy(object): return get_msg_dict(self.strategy.lars_configs) @lars_configs.setter + @is_strict_auto def lars_configs(self, configs): check_configs_key(self.strategy.lars_configs, configs, "lars_configs") assign_configs_value(self.strategy.lars_configs, configs) @@ -864,6 +983,7 @@ class DistributedStrategy(object): return self.strategy.lamb @lamb.setter + @is_strict_auto def lamb(self, flag): if isinstance(flag, bool): self.strategy.lamb = flag @@ -882,6 +1002,7 @@ class DistributedStrategy(object): Examples: .. code-block:: python + import paddle.distributed.fleet as fleet strategy = fleet.DistributedStrategy() strategy.lamb = True @@ -893,15 +1014,21 @@ class DistributedStrategy(object): return get_msg_dict(self.strategy.lamb_configs) @lamb_configs.setter + @is_strict_auto def lamb_configs(self, configs): check_configs_key(self.strategy.lamb_configs, configs, "lamb_configs") assign_configs_value(self.strategy.lamb_configs, configs) @property def elastic(self): + """ + Indicating whether we want to do current distributed training on clusters with elastic resources. + Currently, this configuration is not valid. + """ return self.strategy.elastic @elastic.setter + @is_strict_auto def elastic(self, flag): if isinstance(flag, bool): self.strategy.elastic = flag @@ -910,6 +1037,25 @@ class DistributedStrategy(object): @property def auto(self): + """ + Indicating whether we are using auto-parallel configuration + This feature is currently an experimental feature. Currently, + auto-parallelism can be used only when a user does not set any other + strategy configs except auto. For details, please reference the following + code example + Default Value: False + + Examples: + ..
code-block:: python + + import paddle + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.auto = True + + optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy) + """ return self.strategy.auto @auto.setter @@ -921,9 +1067,27 @@ class DistributedStrategy(object): @property def cudnn_exhaustive_search(self): + """ + Indicating whether to use exhaustive search method to choose convolution algorithms. + Exhaustive search attempts all cuDNN algorithms to choose the fastest algorithm. + This method is time-consuming, the chosen algorithm will be cached for the given layer specifications. + Once the layer specifications (like batch size, feature map size) are changed, it will search again. + Default Value: True + + Examples: + .. code-block:: python + + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.cudnn_exhaustive_search = False + + optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy) + """ return self.strategy.cudnn_exhaustive_search @cudnn_exhaustive_search.setter + @is_strict_auto def cudnn_exhaustive_search(self, flag): if isinstance(flag, bool): self.strategy.cudnn_exhaustive_search = flag @@ -934,9 +1098,28 @@ class DistributedStrategy(object): @property def conv_workspace_size_limit(self): + """ + The workspace limit size in MB unit for choosing cuDNN convolution algorithms. + The inner function of cuDNN obtains the fastest suited algorithm that fits within this memory limit. + Usually, large workspace size may lead to choose faster algorithms, + but significant increasing memory workspace. Users need to trade-off between memory and speed. + Default Value: 4000 + + Examples: + ..
code-block:: python + + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.conv_workspace_size_limit = 1024 + + optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy) + + """ return self.strategy.conv_workspace_size_limit @conv_workspace_size_limit.setter + @is_strict_auto def conv_workspace_size_limit(self, value): if isinstance(value, int): self.strategy.conv_workspace_size_limit = value @@ -947,9 +1130,26 @@ class DistributedStrategy(object): @property def cudnn_batchnorm_spatial_persistent(self): + """ + Indicates whether to use the mode CUDNN_BATCHNORM_SPATIAL_PERSISTENT function in batchnorm. + This is only useful in cudnn. + Default Value: True + + Examples: + .. code-block:: python + + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.cudnn_batchnorm_spatial_persistent = True + + optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy) + + """ return self.strategy.cudnn_batchnorm_spatial_persistent @cudnn_batchnorm_spatial_persistent.setter + @is_strict_auto def cudnn_batchnorm_spatial_persistent(self, flag): if isinstance(flag, bool): self.strategy.cudnn_batchnorm_spatial_persistent = flag @@ -981,8 +1181,98 @@ class DistributedStrategy(object): if core.globals().is_public(key): core.globals()[key] = values[i] + def _is_strict_auto(self): + global non_auto_func_called + if self.strategy.auto and non_auto_func_called: + return True + return False + def __repr__(self): + spacing = 2 + max_k = 38 + max_v = 38 + + length = max_k + max_v + spacing + + h1_format = " " + "|{{:^{}s}}|\n".format(length) + h2_format = " " + "|{{:>{}s}}{}{{:^{}s}}|\n".format(max_k, " " * + spacing, max_v) + + border = " +" + "".join(["="] * length) + "+" + line = " +" + "".join(["-"] * length) + "+" + + draws = border + "\n" + draws += h1_format.format("") + draws += 
h1_format.format("DistributedStrategy Overview") + draws += h1_format.format("") + fields = self.strategy.DESCRIPTOR.fields + str_res = "" + + env_draws = line + "\n" for f in fields: - print("{}: {}".format(f.name, f.default_value)) - return str(self.strategy) + if "build_strategy" in f.name or "execution_strategy" in f.name: + continue + if "_configs" in f.name: + continue + else: + if isinstance(getattr(self.strategy, f.name), bool): + if hasattr(self.strategy, f.name + "_configs"): + if getattr(self.strategy, f.name): + draws += border + "\n" + draws += h1_format.format( + "{} = True, please check {}_configs".format( + f.name, f.name)) + draws += line + "\n" + my_configs = getattr(self.strategy, + f.name + "_configs") + config_fields = my_configs.DESCRIPTOR.fields + for ff in config_fields: + if isinstance( + getattr(my_configs, ff.name), + google.protobuf.pyext._message. + RepeatedScalarContainer): + values = getattr(my_configs, ff.name) + for i, v in enumerate(values): + if i == 0: + draws += h2_format.format(ff.name, + str(v)) + else: + draws += h2_format.format("", + str(v)) + else: + draws += h2_format.format( + ff.name, + str(getattr(my_configs, ff.name))) + else: + env_draws += h2_format.format( + f.name, str(getattr(self.strategy, f.name))) + else: + env_draws += h2_format.format( + f.name, str(getattr(self.strategy, f.name))) + + result_res = draws + border + "\n" + h1_format.format( + "Environment Flags, Communication Flags") + result_res += env_draws + + build_strategy_str = border + "\n" + build_strategy_str += h1_format.format("Build Strategy") + build_strategy_str += line + "\n" + + fields = self.strategy.build_strategy.DESCRIPTOR.fields + for f in fields: + build_strategy_str += h2_format.format( + f.name, str(getattr(self.strategy.build_strategy, f.name))) + build_strategy_str += border + "\n" + + execution_strategy_str = h1_format.format("Execution Strategy") + execution_strategy_str += line + "\n" + + fields = 
self.strategy.execution_strategy.DESCRIPTOR.fields + for f in fields: + execution_strategy_str += h2_format.format( + f.name, str(getattr(self.strategy.execution_strategy, f.name))) + execution_strategy_str += border + "\n" + + result_res += build_strategy_str + execution_strategy_str + return result_res diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py index 282ac29d6f9dafb4eb3b83471157464620326348..d00faac838504f5d68e9d44d9ffa9f25c7bf2ee5 100644 --- a/python/paddle/distributed/fleet/base/fleet_base.py +++ b/python/paddle/distributed/fleet/base/fleet_base.py @@ -13,7 +13,11 @@ # limitations under the License. from __future__ import print_function +import copy +import warnings import paddle +from paddle.fluid.framework import dygraph_only +from paddle.fluid import compiler from .role_maker import UserDefinedRoleMaker, PaddleCloudRoleMaker, RoleMakerBase from .strategy_compiler import StrategyCompiler from .distributed_strategy import DistributedStrategy @@ -21,6 +25,7 @@ from .meta_optimizer_factory import MetaOptimizerFactory from .runtime_factory import RuntimeFactory from .util_factory import UtilFactory from paddle.fluid.wrapped_decorator import wrap_decorator +from paddle.fluid.dygraph import parallel_helper def _inited_runtime_handler_(func): @@ -35,7 +40,24 @@ def _inited_runtime_handler_(func): return __impl__ +def _is_non_distributed_check_(func): + def __impl__(*args, **kwargs): + cls = args[0] + + if cls._role_maker is not None and cls._role_maker._is_non_distributed( + ) is True: + warnings.warn( + "%s() function doesn't work when use non_distributed fleet." 
% + (func.__name__)) + return + + return func(*args, **kwargs) + + return __impl__ + + inited_runtime_handler = wrap_decorator(_inited_runtime_handler_) +is_non_distributed_check = wrap_decorator(_is_non_distributed_check_) class Fleet(object): @@ -158,8 +180,15 @@ class Fleet(object): raise ValueError( "`role_maker` should be subclass of `RoleMakerBase`, but got {}". format(type(role_maker))) + self._role_maker._generate_role() + self.strategy_compiler = StrategyCompiler() - return None + if paddle.fluid.framework.in_dygraph_mode(): + if parallel_helper._is_parallel_ctx_initialized(): + warnings.warn( + "The dygraph parallel environment has been initialized.") + else: + paddle.distributed.init_parallel_env() def is_first_worker(self): """ @@ -178,7 +207,7 @@ class Fleet(object): fleet.is_first_worker() """ - return self._role_maker.is_first_worker() + return self._role_maker._is_first_worker() def worker_index(self): """ @@ -195,7 +224,7 @@ class Fleet(object): fleet.worker_index() """ - return self._role_maker.worker_index() + return self._role_maker._worker_index() def worker_num(self): """ @@ -203,7 +232,7 @@ class Fleet(object): Returns: int: worker numbers - + Examples: .. 
code-block:: python @@ -212,7 +241,7 @@ class Fleet(object): fleet.worker_num() """ - return self._role_maker.worker_num() + return self._role_maker._worker_num() def is_worker(self): """ @@ -230,7 +259,7 @@ class Fleet(object): fleet.is_worker() """ - return self._role_maker.is_worker() + return self._role_maker._is_worker() def worker_endpoints(self, to_string=False): """ @@ -247,13 +276,10 @@ class Fleet(object): fleet.worker_endpoints() """ - ''' if to_string: - return ",".join(self._role_maker.get_trainer_endpoints()) + return ",".join(self._role_maker._get_trainer_endpoints()) else: - return self._role_maker.get_trainer_endpoints() - ''' - return ["127.0.0.1:1001", "127.0.0.1:1002"] + return self._role_maker._get_trainer_endpoints() def server_num(self): """ @@ -268,7 +294,7 @@ class Fleet(object): fleet.init() fleet.server_num() """ - return len(self._role_maker.get_pserver_endpoints()) + return len(self._role_maker._get_pserver_endpoints()) def server_index(self): """ @@ -285,7 +311,7 @@ class Fleet(object): fleet.server_index() """ - return self._role_maker.server_index() + return self._role_maker._server_index() def server_endpoints(self, to_string=False): """ @@ -304,9 +330,9 @@ class Fleet(object): """ if to_string: - return ",".join(self._role_maker.get_pserver_endpoints()) + return ",".join(self._role_maker._get_pserver_endpoints()) else: - return self._role_maker.get_pserver_endpoints() + return self._role_maker._get_pserver_endpoints() def is_server(self): """ @@ -324,10 +350,12 @@ class Fleet(object): fleet.is_server() """ - return self._role_maker.is_server( + return self._role_maker._is_server( ) or self._role_maker._is_heter_worker() - @property + def set_util(self, util): + self._util = util + def util(self): """ Utility functions that can be used under certain runtime @@ -348,16 +376,6 @@ class Fleet(object): """ return self._util - @util.setter - def util(self, util): - """ - Set Utility functions for userd-defined runtime - - Returns: - None 
- """ - self._util = util - def barrier_worker(self): """ barrier all workers @@ -365,8 +383,9 @@ class Fleet(object): Returns: None """ - self._role_maker.barrier_worker() + self._role_maker._barrier("worker") + @is_non_distributed_check @inited_runtime_handler def init_worker(self): """ @@ -391,6 +410,7 @@ class Fleet(object): """ self._runtime_handle._init_worker() + @is_non_distributed_check @inited_runtime_handler def init_server(self, *args, **kwargs): """ @@ -416,6 +436,7 @@ class Fleet(object): """ self._runtime_handle._init_server(*args, **kwargs) + @is_non_distributed_check @inited_runtime_handler def run_server(self): """ @@ -440,6 +461,7 @@ class Fleet(object): """ self._runtime_handle._run_server() + @is_non_distributed_check @inited_runtime_handler def stop_worker(self): """ @@ -564,12 +586,351 @@ class Fleet(object): """ self.user_defined_optimizer = optimizer + if paddle.fluid.framework.in_dygraph_mode(): + return self + if strategy == None: strategy = DistributedStrategy() self.user_defined_strategy = strategy self.valid_strategy = None return self + @dygraph_only + def distributed_model(self, model): + """ + Return distributed data parallel model (Only work in dygraph mode) + + Args: + model (Layer): the user-defind model which inherits Layer. + + Returns: + distributed data parallel model which inherits Layer. + + Examples: + + .. code-block:: python + + import paddle + import paddle.nn as nn + from paddle.distributed import fleet + + class LinearNet(nn.Layer): + def __init__(self): + super(LinearNet, self).__init__() + self._linear1 = nn.Linear(10, 10) + self._linear2 = nn.Linear(10, 1) + + def forward(self, x): + return self._linear2(self._linear1(x)) + + # 1. enable dynamic mode + paddle.disable_static() + + # 2. initialize fleet environment + fleet.init(is_collective=True) + + # 3. 
create layer & optimizer + layer = LinearNet() + loss_fn = nn.MSELoss() + adam = paddle.optimizer.Adam( + learning_rate=0.001, parameters=layer.parameters()) + + # 4. get data_parallel model using fleet + adam = fleet.distributed_optimizer(adam) + dp_layer = fleet.distributed_model(layer) + + # 5. run layer + inputs = paddle.randn([10, 10], 'float32') + outputs = dp_layer(inputs) + labels = paddle.randn([10, 1], 'float32') + loss = loss_fn(outputs, labels) + + print("loss:", loss.numpy()) + + loss = dp_layer.scale_loss(loss) + loss.backward() + dp_layer.apply_collective_grads() + + adam.step() + adam.clear_grad() + + + """ + assert model is not None + self.model = paddle.DataParallel(model) + return self.model + + @dygraph_only + def state_dict(self): + """ + Get state dict information from optimizer. + (Only work in dygraph mode) + + Returns: + state_dict(dict) : dict contains all the Tensor used by optimizer + + Examples: + .. code-block:: python + + import numpy as np + import paddle + from paddle.distributed import fleet + + paddle.disable_static() + fleet.init(is_collective=True) + + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.fluid.dygraph.to_variable(value) + + layer = paddle.nn.Linear(13, 5) + adam = paddle.optimizer.Adam(learning_rate=0.01, parameters=layer.parameters()) + + adam = fleet.distributed_optimizer(adam) + dp_layer = fleet.distributed_model(layer) + state_dict = adam.state_dict() + """ + # imitate target optimizer retrieval + return self.user_defined_optimizer.state_dict() + + @dygraph_only + def set_state_dict(self, state_dict): + """ + Load optimizer state dict. + (Only work in dygraph mode) + + Args: + state_dict(dict) : Dict contains all the Tensor needed by optimizer + + Returns: + None + + Examples: + .. 
code-block:: python + + import numpy as np + import paddle + from paddle.distributed import fleet + + paddle.disable_static() + fleet.init(is_collective=True) + + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.fluid.dygraph.to_variable(value) + + layer = paddle.nn.Linear(13, 5) + adam = paddle.optimizer.Adam(learning_rate=0.01, parameters=layer.parameters()) + + adam = fleet.distributed_optimizer(adam) + dp_layer = fleet.distributed_model(layer) + state_dict = adam.state_dict() + paddle.framework.save(state_dict, "paddle_dy") + para_state_dict, opti_state_dict = paddle.framework.load( "paddle_dy") + adam.set_state_dict(opti_state_dict) + """ + # imitate target optimizer retrieval + return self.user_defined_optimizer.set_state_dict(state_dict) + + @dygraph_only + def set_lr(self, value): + """ + Set the value of the learning rate manually in the optimizer. + (Only work in dygraph mode) + + Args: + value (float|Tensor): the value of learning rate + + Returns: + None + + Examples: + .. code-block:: python + + import numpy as np + import paddle + from paddle.distributed import fleet + + paddle.disable_static() + fleet.init(is_collective=True) + + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.fluid.dygraph.to_variable(value) + + layer = paddle.nn.Linear(13, 5) + adam = paddle.optimizer.Adam(learning_rate=0.01, parameters=layer.parameters()) + + adam = fleet.distributed_optimizer(adam) + dp_layer = fleet.distributed_model(layer) + + lr_list = [0.2, 0.3, 0.4, 0.5, 0.6] + for i in range(5): + adam.set_lr(lr_list[i]) + lr = adam.get_lr() + print("current lr is {}".format(lr)) + # Print: + # current lr is 0.2 + # current lr is 0.3 + # current lr is 0.4 + # current lr is 0.5 + # current lr is 0.6 + """ + # imitate target optimizer retrieval + return self.user_defined_optimizer.set_lr(value) + + @dygraph_only + def get_lr(self): + """ + Get current step learning rate. 
+ (Only work in dygraph mode) + + Returns: + float: The learning rate of the current step. + + Examples: + .. code-block:: python + + import numpy as np + import paddle + from paddle.distributed import fleet + + paddle.disable_static() + fleet.init(is_collective=True) + + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.fluid.dygraph.to_variable(value) + + layer = paddle.nn.Linear(13, 5) + adam = paddle.optimizer.Adam(learning_rate=0.01, parameters=layer.parameters()) + + adam = fleet.distributed_optimizer(adam) + dp_layer = fleet.distributed_model(layer) + + lr = adam.get_lr() + print(lr) # 0.01 + """ + # imitate target optimizer retrieval + return self.user_defined_optimizer.get_lr() + + @dygraph_only + def step(self): + """ + Execute the optimizer once. + (Only work in dygraph mode) + + Returns: + None + + Examples: + .. code-block:: python + + import paddle + import paddle.nn as nn + from paddle.distributed import fleet + + class LinearNet(nn.Layer): + def __init__(self): + super(LinearNet, self).__init__() + self._linear1 = nn.Linear(10, 10) + self._linear2 = nn.Linear(10, 1) + + def forward(self, x): + return self._linear2(self._linear1(x)) + + # 1. enable dynamic mode + paddle.disable_static() + + # 2. initialize fleet environment + fleet.init(is_collective=True) + + # 3. create layer & optimizer + layer = LinearNet() + loss_fn = nn.MSELoss() + adam = paddle.optimizer.Adam( + learning_rate=0.001, parameters=layer.parameters()) + + # 4. get data_parallel model using fleet + adam = fleet.distributed_optimizer(adam) + dp_layer = fleet.distributed_model(layer) + + # 5. 
run layer + inputs = paddle.randn([10, 10], 'float32') + outputs = dp_layer(inputs) + labels = paddle.randn([10, 1], 'float32') + loss = loss_fn(outputs, labels) + + print("loss:", loss.numpy()) + + loss = dp_layer.scale_loss(loss) + loss.backward() + dp_layer.apply_collective_grads() + + adam.step() + adam.clear_grad() + + + """ + # imitate target optimizer retrieval + return self.user_defined_optimizer.step() + + @dygraph_only + def clear_grad(self): + """ + Clear the gradients of all optimized parameters for model. + (Only work in dygraph mode) + + Returns: + None + + Examples: + .. code-block:: python + + import paddle + import paddle.nn as nn + from paddle.distributed import fleet + + class LinearNet(nn.Layer): + def __init__(self): + super(LinearNet, self).__init__() + self._linear1 = nn.Linear(10, 10) + self._linear2 = nn.Linear(10, 1) + + def forward(self, x): + return self._linear2(self._linear1(x)) + + # 1. enable dynamic mode + paddle.disable_static() + + # 2. initialize fleet environment + fleet.init(is_collective=True) + + # 3. create layer & optimizer + layer = LinearNet() + loss_fn = nn.MSELoss() + adam = paddle.optimizer.Adam( + learning_rate=0.001, parameters=layer.parameters()) + + # 4. get data_parallel model using fleet + adam = fleet.distributed_optimizer(adam) + dp_layer = fleet.distributed_model(layer) + + # 5. 
run layer + inputs = paddle.randn([10, 10], 'float32') + outputs = dp_layer(inputs) + labels = paddle.randn([10, 1], 'float32') + loss = loss_fn(outputs, labels) + + print("loss:", loss.numpy()) + + loss = dp_layer.scale_loss(loss) + loss.backward() + dp_layer.apply_collective_grads() + + adam.step() + adam.clear_grad() + + """ + # imitate target optimizer retrieval + return self.user_defined_optimizer.clear_grad() + def minimize(self, loss, startup_program=None, @@ -593,8 +954,8 @@ class Fleet(object): tuple: tuple (optimize_ops, params_grads), A list of operators appended by minimize and a list of (param, grad) variable pairs, param is ``Parameter``, grad is the gradient value corresponding to the parameter. - The returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to - indicate program pruning. If so, the program will be pruned by ``feed`` and + The returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to + indicate program pruning. If so, the program will be pruned by ``feed`` and ``fetch_list`` before run, see details in ``Executor``. 
Examples: @@ -619,6 +980,11 @@ class Fleet(object): # for more examples, please reference https://github.com/PaddlePaddle/FleetX """ + if paddle.fluid.framework.in_dygraph_mode(): + # imitate target optimizer retrieval + target_opt = self.user_defined_optimizer + return target_opt.minimize(loss) + context = {} # cache original feed forward program self.origin_main_program = loss.block.program @@ -640,6 +1006,18 @@ class Fleet(object): MetaOptimizerFactory()._get_valid_meta_optimizers( self.user_defined_optimizer) + context["user_defined_strategy"] = copy.copy(self.user_defined_strategy) + + # trigger the auto-parallel in very strict condition + # strategy = DistributedStrategy() + # strategy.auto = True + # optimizer = paddle.optimizer.SGD(learning_rate=0.1) + # optimizer = fleet.distributed_optimizer(optimizer, strategy) + if self.user_defined_strategy._is_strict_auto(): + # turn on all the strategy for each optimizer + for opt in distributed_optimizer_list: + opt._enable_strategy(self.user_defined_strategy, context) + valid_optimizer_list = [] valid_graph_optimizer_list = [] can_not_apply_optimizer_list = [] @@ -672,6 +1050,20 @@ class Fleet(object): optimize_ops = [] params_grads = [] + if self._role_maker._is_non_distributed() and not self._is_collective: + if self._runtime_handle is None: + self._runtime_handle = RuntimeFactory()._create_runtime(context) + + compiled_program = compiler.CompiledProgram( + self.origin_main_program).with_data_parallel( + loss_name=loss.name, share_vars_from=None) + loss.block.program._graph = compiled_program + return self.user_defined_optimizer.minimize( + loss, + startup_program=startup_program, + parameter_list=parameter_list, + no_grad_set=no_grad_set) + if meta_optimizer: optimize_ops, params_grads = meta_optimizer.minimize( loss, diff --git a/python/paddle/distributed/fleet/base/role_maker.py b/python/paddle/distributed/fleet/base/role_maker.py index 
25f2d0dd3f45855d9f337c6b7154db9cb5bbae45..f66f013e4dbaadd534d6859b7ba6530779c82a3b 100644 --- a/python/paddle/distributed/fleet/base/role_maker.py +++ b/python/paddle/distributed/fleet/base/role_maker.py @@ -13,18 +13,332 @@ # limitations under the License. """Defination of Role Makers.""" import os +import time import numpy as np import warnings from multiprocessing import Process, Manager -import paddle.fluid as fluid -#__all__ = ['UserDefinedRoleMaker', 'PaddleCloudRoleMaker'] +import paddle.fluid as fluid class Role: WORKER = 1 SERVER = 2 HETER_WORKER = 3 + ALL = 4 + + +class Gloo(object): + """ + Gloo is a universal class for barrier and collective communication + """ + + class RENDEZVOUS: + HDFS = 1 + FILE = 2 + HTTP = 3 + + def __init__(self): + self._worker_comm = None + self._server_comm = None + self._nodes_comm = None + + self._comm_world = ["worker", "server", "all"] + self._err_init = "gloo is not initialized, will not communicator with other nodes" + self._err_type = "gloo initialized error, please check arguments" + self._err_world = "argument error, comm_world must in {}".format( + self._comm_world) + + self._is_initialized = False + self._init_timeout_seconds = 3600 + self._run_timeout_seconds = 9999999 + + self._rendezvous = None + self._role = None + self._iface = None + + self._role_id = -1 + self._worker_num = -1 + self._server_num = -1 + self._need_init_all = False + + def init(self, + rendezvous, + role, + role_id, + worker_num, + server_num, + need_init_all=False, + kwargs=None): + + self._rendezvous = rendezvous + self._role = role + self._role_id = role_id + self._worker_num = worker_num + self._server_num = server_num + self._need_init_all = need_init_all + self._iface = self.__get_default_iface() + self._prefix = kwargs.get("store.prefix", "") + + if self._rendezvous == Gloo.RENDEZVOUS.HDFS: + dfs_name = kwargs.get("dfs.name", "") + dfs_ugi = kwargs.get("dfs.ugi", "") + dfs_path = kwargs.get("dfs.path", "") + + if not dfs_name or not 
dfs_ugi or not dfs_path: + raise ValueError(self._err_type) + self._init_dfs(dfs_name, dfs_ugi, dfs_path, self._prefix) + + elif self._rendezvous == Gloo.RENDEZVOUS.FILE: + fs_path = kwargs.get("dfs.path", "") + + if not fs_path: + raise ValueError(self._err_type) + self._init_fs(fs_path, self._prefix) + + elif self._rendezvous == Gloo.RENDEZVOUS.HTTP: + ip = kwargs.get("http.host", "") + port = kwargs.get("http.port", "") + + if not ip or not port: + raise ValueError(self._err_type) + self._init_http(ip, port, self._prefix) + + else: + raise ValueError(self._err_type) + + self._is_initialized = True + + def _init_fs(self, fs_path, prefix): + def init(rank, nodes, role): + gloo = fluid.core.Gloo() + gloo.set_rank(rank) + gloo.set_size(nodes) + gloo.set_prefix(prefix) + gloo.set_iface(self._iface) + gloo.set_timeout_seconds(self._init_timeout_seconds, + self._run_timeout_seconds) + gloo.set_hdfs_store(os.path.join(fs_path, role), "", "") + gloo.init() + return gloo + + if self._role == Role.WORKER: + rank, nodes = self._get_rank_nodes(Role.WORKER) + gloo = init(rank, nodes, "WORKER") + self._worker_comm = gloo + else: + rank, nodes = self._get_rank_nodes(Role.SERVER) + gloo = init(rank, nodes, "SERVER") + self._server_comm = gloo + + if self._need_init_all: + rank, nodes = self._get_rank_nodes(Role.ALL) + gloo = init(rank, nodes, "ALL") + self._nodes_comm = gloo + + def _init_dfs(self, dfs_name, dfs_ugi, dfs_path, prefix): + def init(rank, nodes, role): + gloo = fluid.core.Gloo() + gloo.set_rank(rank) + gloo.set_size(nodes) + gloo.set_prefix(prefix) + gloo.set_iface(self._iface) + gloo.set_timeout_seconds(self._init_timeout_seconds, + self._run_timeout_seconds) + gloo.set_hdfs_store(os.path.join(dfs_path, role), dfs_name, dfs_ugi) + gloo.init() + return gloo + + if self._role == Role.WORKER: + rank, nodes = self._get_rank_nodes(Role.WORKER) + gloo = init(rank, nodes, "WORKER") + self._worker_comm = gloo + else: + rank, nodes = self._get_rank_nodes(Role.SERVER) + 
gloo = init(rank, nodes, "SERVER") + self._server_comm = gloo + + if self._need_init_all: + rank, nodes = self._get_rank_nodes(Role.ALL) + gloo = init(rank, nodes, "ALL") + self._nodes_comm = gloo + + def _init_http(self, ip, port, prefix): + def __start_kv_server(http_server_d, size_d): + from paddle.distributed.fleet.utils.http_server import KVServer + http_server = KVServer(port, size_d) + http_server.start() + wait_seconds = 5 + while http_server_d.get("running", + False) and not http_server.shoud_stop(): + time.sleep(wait_seconds) + http_server.stop() + + def init_kv_server(): + size_d = { + "trainer": self._worker_num, + "pserver": self._server_num, + "all": self._worker_num + self._server_num + } + + _http_server_d = {"running": True} + # child process for http server + _http_server = Process( + target=__start_kv_server, args=(_http_server_d, size_d)) + _http_server.daemon = True + # set running status to True + # start child process + _http_server.start() + + def init(rank, nodes, role): + gloo = fluid.core.Gloo() + gloo.set_rank(rank) + gloo.set_size(nodes) + gloo.set_prefix(prefix) + gloo.set_iface(self._iface) + gloo.set_timeout_seconds(self._init_timeout_seconds, + self._run_timeout_seconds) + gloo.set_http_store(ip, port, role) + return gloo + + port = int(port) + + if self._role == Role.SERVER and self._role_id == 0: + init_kv_server() + + if self._role == Role.WORKER: + rank, nodes = self._get_rank_nodes(Role.WORKER) + gloo = init(rank, nodes, "WORKER") + self._worker_comm = gloo + else: + rank, nodes = self._get_rank_nodes(Role.SERVER) + gloo = init(rank, nodes, "SERVER") + self._server_comm = gloo + + if self._need_init_all: + rank, nodes = self._get_rank_nodes(Role.ALL) + gloo = init(rank, nodes, "ALL") + self._nodes_comm = gloo + + def _get_rank_nodes(self, role): + nodes = 0 + rank = -1 + + if role == Role.WORKER: + nodes = self._worker_num + rank = self._role_id + elif role == Role.SERVER: + nodes = self._server_num + rank = self._role_id + 
elif role == Role.ALL: + nodes = self._worker_num + self._server_num + + if self._role == Role.WORKER: + rank = self._role_id + else: + rank = self._worker_num + self._role_id + else: + ValueError(self._err_type) + + return rank, nodes + + def __get_default_iface(self): + """ + get default physical interface + """ + default1 = self.__get_default_iface_from_gateway() + default2 = self.__get_default_iface_from_interfaces() + return default2 if default1 == "lo" else default1 + + def __get_default_iface_from_gateway(self): + """ + get default physical interface + """ + import netifaces + gateways = netifaces.gateways() + if gateways.get(netifaces.AF_INET) != None: + gateway = gateways[netifaces.AF_INET] + if len(gateway) > 0 and len(gateway[0]) > 1: + return gateway[0][1] + return "lo" + + def __get_default_iface_from_interfaces(self): + """ + get default physical interface + """ + import netifaces + for intf_name in netifaces.interfaces(): + addresses = netifaces.ifaddresses(intf_name) + if netifaces.AF_INET in addresses: + ipv4_addresses = addresses[netifaces.AF_INET] + for ipv4_address in ipv4_addresses: + if 'broadcast' in ipv4_address: + return intf_name + return "lo" + + def barrier(self, comm_world): + """ + dummy barrier, do nothing + """ + if not self._is_initialized: + warnings.warn(self._err_init) + return + + if comm_world not in self._comm_world: + raise ValueError(self._err_world) + + if comm_world == "worker": + self._worker_comm.barrier() + elif comm_world == "server": + self._server_comm.barrier() + else: + self._nodes_comm.barrier() + + def all_reduce(self, input, mode="sum", comm_world="worker"): + if not self._is_initialized: + warnings.warn(self._err_init) + return input + + if comm_world not in self._comm_world: + raise ValueError(self._err_world) + + input = np.array(input) + input_shape = input.shape + input_list = input.reshape(-1).tolist() + + self.barrier(comm_world) + + if comm_world == "worker": + ans = 
self._worker_comm.all_reduce(input_list, mode) + elif comm_world == "server": + ans = self._server_comm.all_reduce(input_list, mode) + else: + ans = self._nodes_comm.all_reduce(input_list, mode) + + output = np.array(ans).reshape(input_shape) + return output + + def all_gather(self, input, comm_world="worker"): + """ + dummy all gather, do nothing + Args: + obj(any): obj to do all gather + """ + if not self._is_initialized: + warnings.warn(self._err_init) + return input + + if comm_world not in self._comm_world: + raise ValueError(self._err_world) + + if comm_world == "worker": + output = self._worker_comm.all_gather(input) + elif comm_world == "server": + output = self._server_comm.all_gather(input) + else: + output = self._nodes_comm.all_gather(input) + + return output class RoleMakerBase(object): @@ -47,23 +361,19 @@ class RoleMakerBase(object): self._heter_trainer_device = "CPU" self._is_heter_parameter_server_mode = False - self._node_type = None - self._node_type_comm = None - self._all_comm = None - - def is_worker(self): + def _is_worker(self): """ return is_worker() of current process """ raise NotImplementedError("Please implement this method in child class") - def is_server(self): + def _is_server(self): """ return is_server() of current process """ raise NotImplementedError("Please implement this method in child class") - def is_first_worker(self): + def _is_first_worker(self): """ Check whether the node is the first instance of worker. Returns: @@ -72,7 +382,7 @@ class RoleMakerBase(object): """ raise NotImplementedError("Please implement this method in child class") - def worker_num(self): + def _worker_num(self): """ Get current total worker number. @@ -81,7 +391,7 @@ class RoleMakerBase(object): """ raise NotImplementedError("Please implement this method in child class") - def server_num(self): + def _server_num(self): """ Get current total server number. 
@@ -90,7 +400,7 @@ class RoleMakerBase(object): """ raise NotImplementedError("Please implement this method in child class") - def worker_index(self): + def _worker_index(self): """ Get current worker id. @@ -99,7 +409,7 @@ class RoleMakerBase(object): """ raise NotImplementedError("Please implement this method in child class") - def server_index(self): + def _server_index(self): """ Get current server id. @@ -108,7 +418,7 @@ class RoleMakerBase(object): """ raise NotImplementedError("Please implement this method in child class") - def role_id(self): + def _role_id(self): """ Get current id. @@ -117,7 +427,7 @@ class RoleMakerBase(object): """ raise NotImplementedError("Please implement this method in child class") - def node_num(self): + def _node_num(self): """ Get the training node number Returns: @@ -125,13 +435,13 @@ class RoleMakerBase(object): """ raise NotImplementedError("Please implement this method in child class") - def get_trainer_endpoints(self): + def _get_trainer_endpoints(self): """ return trainer endpoints """ return self._worker_endpoints - def get_pserver_endpoints(self): + def _get_pserver_endpoints(self): """ return pserver endpoints """ @@ -142,19 +452,11 @@ class RoleMakerBase(object): self._role, self._current_id, self._worker_endpoints, self._server_endpoints) - def _all_gather(self, comm_world, input): - """ - - Args: - input(int|float): input value - - Returns: - return a list of values - """ - print("warning: RoleMakerBase does not have all gather.") + def _all_gather(self, input, comm_world="worker"): + print("warning: RoleMakerBase does not have all gather worker.") return None - def _all_reduce(self, comm_world, input, mode="sum"): + def _all_reduce(self, input, mode="sum", comm_world="worker"): """ Args: input(list/numpy.array): array of one dim @@ -221,164 +523,130 @@ class PaddleCloudRoleMaker(RoleMakerBase): def __init__(self, is_collective=False, **kwargs): super(PaddleCloudRoleMaker, self).__init__() self._is_collective = 
is_collective - self._init_gloo = False # default no init gloo - self._kwargs = kwargs + self._non_distributed = False + + self._kwargs = kwargs self._role_is_generated = False self._server_endpoints = None self._worker_endpoints = None - self._node_type_comm = None - self._all_comm = None - - if not self._is_collective: - self._hdfs_name = kwargs.get("hdfs_name", "") - self._hdfs_ugi = kwargs.get("hdfs_ugi", "") - self._hdfs_path = kwargs.get("path", "").rstrip("/") - self._init_timeout_seconds = kwargs.get("init_timeout_seconds", - 3600) - self._run_timeout_seconds = kwargs.get("run_timeout_seconds", - 9999999) - ip_port = kwargs.get("http_ip_port", "") - self._http_ip_port = [] - self._http_server = None - # if ip_port is not empty, it will use http instead of hdfs - if ip_port != "": - self._http_ip_port = ip_port.split(":") - # it's for communication between processes - self._manager = Manager() - # global dict to store status - self._http_server_d = self._manager.dict() - # set running status of http server - self._http_server_d["running"] = False - self._iface = self.__get_default_iface() - # this environment variable can be empty - self._prefix = os.getenv("SYS_JOB_ID", "") + self._gloo = Gloo() # gloo instance def _barrier(self, comm_world): - if isinstance(comm_world, fluid.core.Gloo): - comm_world.barrier() - else: - print("warning: must init Gloo before using _barrier() function") - - def _all_gather(self, comm_world, input): - if isinstance(comm_world, fluid.core.Gloo): - self._barrier(comm_world) - output = comm_world.all_gather(input) - return output - else: - print("warning: must init Gloo before using _all_gather() function") - return None - - def _all_reduce(self, comm_world, input, mode="sum"): - if isinstance(comm_world, fluid.core.Gloo): + self._gloo.barrier(comm_world) - input = np.array(input) + def _all_gather(self, input, comm_world="worker"): + return self._gloo.all_gather(input, comm_world) - input_shape = input.shape - input_list = 
input.reshape(-1).tolist() + def _all_reduce(self, input, mode="sum", comm_world="worker"): + return self._gloo.all_reduce(input, mode, comm_world) - self._barrier(comm_world) - ans = comm_world.all_reduce(input_list, mode) - output = np.array(ans).reshape(input_shape) - return output - else: - print("warning: must init Gloo before using _all_reduce() function") - return None - - def is_worker(self): + def _is_worker(self): """ whether current process is worker """ if not self._role_is_generated: - self.generate_role() + self._generate_role() return self._role == Role.WORKER - def is_server(self): + def _is_server(self): """ whether current process is server """ if not self._role_is_generated: - self.generate_role() + self._generate_role() return self._role == Role.SERVER - def is_first_worker(self): + def _is_first_worker(self): """ whether current process is worker of rank 0 """ if not self._role_is_generated: - self.generate_role() + self._generate_role() return self._role == Role.WORKER and self._current_id == 0 - def worker_index(self): + def _worker_index(self): """ get index of current worker """ if not self._role_is_generated: - self.generate_role() + self._generate_role() return self._current_id - def server_index(self): + def _server_index(self): """ get index of current server """ if not self._role_is_generated: - self.generate_role() + self._generate_role() return self._current_id - def role_id(self): + def _role_id(self): """ get index of current node """ + if not self._role_is_generated: + self._generate_role() return self._current_id - def worker_num(self): + def _worker_num(self): """ retrun the current number of worker """ if not self._role_is_generated: - self.generate_role() + self._generate_role() return self._trainers_num - def server_num(self): + def _server_num(self): """ return the current number of server """ if not self._role_is_generated: - self.generate_role() - return self._trainers_num + self._generate_role() + return 
len(self._get_pserver_endpoints( + )) if self._get_pserver_endpoints() is not None else 0 - def node_num(self): + def _node_num(self): """ return the training node number """ if not self._role_is_generated: - self.generate_role() - return self._node_num + self._generate_role() + return self._nodes_num - def get_trainer_endpoints(self): + def _get_trainer_endpoints(self): """ get endpoint of all trainers """ if not self._role_is_generated: - self.generate_role() + self._generate_role() return self._worker_endpoints - def get_pserver_endpoints(self): + def _get_pserver_endpoints(self): """ get endpoint of all pservers """ if not self._role_is_generated: - self.generate_role() + self._generate_role() return self._server_endpoints + def _is_non_distributed(self): + """ + Return True if indispensable environment for fleetrun is not found + (use python-run to launch fleet-code directly) + """ + if not self._role_is_generated: + self._generate_role() + return self._non_distributed + def _heter_worker_num(self): """ get heter worker nums """ if not self._role_is_generated: - self.generate_role() + self._generate_role() return self._heter_trainers_num def _is_heter_worker(self): @@ -386,35 +654,34 @@ class PaddleCloudRoleMaker(RoleMakerBase): whether current process is heter worker """ if not self._role_is_generated: - self.generate_role() + self._generate_role() return self._role == Role.HETER_WORKER - def _get_rank(self): - """ - get current rank in all workers and pservers - """ - if not self._role_is_generated: - self.generate_role() - return self._rank - - def _get_size(self): - """ - get total num of all workers and pservers - """ - if not self._role_is_generated: - self.generate_role() - return self._size - def _ps_env(self): try: # Environment variable PADDLE_PSERVERS_IP_PORT_LIST must be set # format: string(ip:port,ip:port), eg. 
127.0.0.1:6001,127.0.0.1:6002 - self._server_endpoints = os.getenv("PADDLE_PSERVERS_IP_PORT_LIST", - "").split(",") - assert self._server_endpoints != "" - self._worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS", - "").split(",") - assert self._server_endpoints != "" + self._server_endpoints = os.getenv("PADDLE_PSERVERS_IP_PORT_LIST") + + if self._server_endpoints is None: + # back to non_distributed execution. + self._server_endpoints = "" + self._trainers_num = 1 + self._role = Role.WORKER + self._current_id = 0 + self._nodes_num = 1 + self._heter_trainers_num = 0 + self._heter_trainer_endpoints = None + self._non_distributed = True + return + + self._server_endpoints = self._server_endpoints.split(",") + + self._worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS") + if self._worker_endpoints: + self._worker_endpoints = self._worker_endpoints.split(",") + else: + self._worker_endpoints = [] trainers_num = int(os.environ["PADDLE_TRAINERS_NUM"]) training_role = os.environ["TRAINING_ROLE"] @@ -477,7 +744,7 @@ class PaddleCloudRoleMaker(RoleMakerBase): self._trainers_num = trainers_num self._role = role self._current_id = current_id - self._node_num = len( + self._nodes_num = len( set([x.split(':')[0] for x in self._worker_endpoints])) self._heter_trainers_num = heter_trainers_num self._heter_trainer_endpoints = heter_trainer_eplist @@ -486,147 +753,96 @@ class PaddleCloudRoleMaker(RoleMakerBase): self._current_id = int(os.getenv("PADDLE_TRAINER_ID", "0")) self._training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER") assert (self._training_role == "TRAINER") + self._role = Role.WORKER self._worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS") self._cur_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT") - assert self._worker_endpoints is not None, "can't find PADDLE_TRAINER_ENDPOINTS" + if self._worker_endpoints is None: + # back to non_distributed execution. 
+ self._worker_endpoints = "127.0.0.1:6170" + self._cur_endpoint = self._worker_endpoints + self._non_distributed = True self._worker_endpoints = self._worker_endpoints.split(",") self._trainers_num = len(self._worker_endpoints) - self._node_num = len( + self._nodes_num = len( set([x.split(':')[0] for x in self._worker_endpoints])) - def _init_gloo_env(self): - def init_gloo_instance(role="trainer"): - role = role.lower() - assert role in ["trainer", "pserver", "all"] - if role == "trainer": - all_list = self._worker_endpoints - rank = self._current_id - elif role == "pserver": - all_list = self._server_endpoints - rank = self._current_id - else: - all_list = self._worker_endpoints + self._server_endpoints - rank = all_list.index(self._cur_endpoint) - gloo = fluid.core.Gloo() - gloo.set_rank(rank) - gloo.set_size(len(all_list)) - gloo.set_prefix(self._prefix) - gloo.set_iface(self._iface) - gloo.set_timeout_seconds(self._init_timeout_seconds, - self._run_timeout_seconds) - if len(self._http_ip_port) != 0: - gloo.set_http_store(self._http_ip_port[0], - int(self._http_ip_port[1]), role) - else: - gloo.set_hdfs_store(self._hdfs_path + "/" + role, - self._hdfs_name, self._hdfs_ugi) - gloo.init() - return gloo - - # paddlecloud support gloo - if self._role == Role.WORKER: - if self._current_id == 0 and len(self._http_ip_port) != 0: - size_d = { - "trainer": len(self._worker_endpoints), - "pserver": len(self._server_endpoints), - "all": - len(self._worker_endpoints) + len(self._server_endpoints) - } - # child process for http server - self._http_server = Process( - target=self.__start_kv_server, - args=(self._http_server_d, size_d)) - self._http_server.daemon = True - # set running status to True - self._http_server_d["running"] = True - # start child process - self._http_server.start() - self._node_type = 1 - gloo = init_gloo_instance("trainer") - self._node_type_comm = gloo + def _gloo_init(self): + # PADDLE_WITH_GLOO 1: trainer barrier, 2: all barrier + use_gloo = 
int(os.getenv("PADDLE_WITH_GLOO", "0")) + if use_gloo not in [1, 2]: + return + + # PADDLE_GLOO_RENDEZVOUS 1: HDFS 2: FILE 3: HTTP + rendezvous_type = int(os.getenv("PADDLE_GLOO_RENDEZVOUS", "0")) + prefix = os.getenv("SYS_JOB_ID", "") + if rendezvous_type not in [ + Gloo.RENDEZVOUS.HDFS, Gloo.RENDEZVOUS.HTTP, Gloo.RENDEZVOUS.FILE + ]: + raise ValueError(self._gloo._err_type) + + need_init_all = True if use_gloo == 2 else False + + if rendezvous_type == Gloo.RENDEZVOUS.HDFS: + dfs_name = os.getenv("PADDLE_GLOO_FS_NAME", "") + dfs_ugi = os.getenv("PADDLE_GLOO_FS_UGI", "") + dfs_path = os.getenv("PADDLE_GLOO_FS_PATH", "") + kwargs = { + "dfs.name": dfs_name, + "dfs.ugi": dfs_ugi, + "dfs.path": dfs_path, + "store.prefix": prefix, + } + elif rendezvous_type == Gloo.RENDEZVOUS.HTTP: + ip = os.getenv("PADDLE_GLOO_HTTP_HOST", "") + port = os.getenv("PADDLE_GLOO_HTTP_PORT", "") + kwargs = { + "http.host": ip, + "http.port": port, + "store.prefix": prefix, + } else: - assert self._role == Role.SERVER - self._node_type = 0 - gloo = init_gloo_instance("pserver") - self._node_type_comm = gloo - - all_list = self._worker_endpoints + self._server_endpoints - self._rank = all_list.index(self._cur_endpoint) - self._size = len(all_list) - - gloo = init_gloo_instance("all") - self._all_comm = gloo + dfs_path = os.getenv("PADDLE_GLOO_FS_PATH", "") + kwargs = { + "dfs.path": dfs_path, + "store.prefix": prefix, + } + + if rendezvous_type == Gloo.RENDEZVOUS.HDFS: + type = "HDFS" + elif rendezvous_type == Gloo.RENDEZVOUS.HTTP: + type = "HTTP" + else: + type = "FILE" + print("Gloo init with {}: need_init_all: {}, args: {}".format( + type, need_init_all, kwargs)) - if self._http_server is not None: - # set running status to False - self._http_server_d["running"] = False - # wait until child process exits - self._http_server.join() + self._gloo.init( + rendezvous=rendezvous_type, + role=self._role, + role_id=self._role_id(), + worker_num=self._worker_num(), + server_num=self._server_num(), 
+ need_init_all=need_init_all, + kwargs=kwargs) - def generate_role(self): + def _generate_role(self): """ generate role for role maker """ if not self._role_is_generated: if not self._is_collective: self._ps_env() - if "PADDLE_WITH_GLOO" in os.environ: - self._init_gloo = bool(os.environ["PADDLE_WITH_GLOO"]) - if self._init_gloo: - self._init_gloo_env() else: self._collective_env() self._role_is_generated = True - - def __get_default_iface(self): - """ - get default physical interface - """ - default1 = self.__get_default_iface_from_gateway() - default2 = self.__get_default_iface_from_interfaces() - return default2 if default1 == "lo" else default1 - - def __get_default_iface_from_gateway(self): - """ - get default physical interface - """ - import netifaces - gateways = netifaces.gateways() - if gateways.get(netifaces.AF_INET) != None: - gateway = gateways[netifaces.AF_INET] - if len(gateway) > 0 and len(gateway[0]) > 1: - return gateway[0][1] - return "lo" - - def __get_default_iface_from_interfaces(self): - """ - get default physical interface - """ - import netifaces - for intf_name in netifaces.interfaces(): - addresses = netifaces.ifaddresses(intf_name) - if netifaces.AF_INET in addresses: - ipv4_addresses = addresses[netifaces.AF_INET] - for ipv4_address in ipv4_addresses: - if 'broadcast' in ipv4_address: - return intf_name - return "lo" - - def __start_kv_server(self, http_server_d, size_d): - from paddle.distributed.fleet.utils import KVServer - http_server = KVServer(int(self._http_ip_port[1]), size_d) - http_server.start() - wait_seconds = 5 - while http_server_d.get("running", - False) and not http_server.shoud_stop(): - time.sleep(wait_seconds) - http_server.stop() + self._gloo_init() class UserDefinedRoleMaker(PaddleCloudRoleMaker): def __init__(self, is_collective=False, init_gloo=False, **kwargs): super(UserDefinedRoleMaker, self).__init__( is_collective=is_collective, init_gloo=init_gloo, **kwargs) + self._init_gloo = init_gloo def 
_user_defined_ps_env(self): self._server_endpoints = self._kwargs.get("server_endpoints") @@ -645,26 +861,24 @@ class UserDefinedRoleMaker(PaddleCloudRoleMaker): self._cur_endpoint = self._worker_endpoints[self._current_id] elif self._role == Role.SERVER: self._cur_endpoint = self._server_endpoints[self._current_id] - self._node_num = len( + self._nodes_num = len( set([x.split(':')[0] for x in self._worker_endpoints])) def _user_defined_collective_env(self): self._worker_endpoints = self._kwargs.get("worker_endpoints") self._current_id = self._kwargs.get("current_id") self._trainers_num = len(self._worker_endpoints) - self._training_role = Role.Worker - self._node_num = len( + self._training_role = Role.WORKER + self._nodes_num = len( set([x.split(':')[0] for x in self._worker_endpoints])) - def generate_role(self): + def _generate_role(self): """ generate role for role maker """ if not self._role_is_generated: if not self._is_collective: self._user_defined_ps_env() - if self._init_gloo: - self._init_gloo_env() else: self._user_defined_collective_env() self._role_is_generated = True diff --git a/python/paddle/distributed/fleet/base/strategy_compiler.py b/python/paddle/distributed/fleet/base/strategy_compiler.py index 4097fc1237f8d7616101810f994c243dffb2cd67..29e10661888f8a7fd6e3c40ee356aad326c193a9 100644 --- a/python/paddle/distributed/fleet/base/strategy_compiler.py +++ b/python/paddle/distributed/fleet/base/strategy_compiler.py @@ -60,7 +60,7 @@ class StrategyCompiler(StrategyCompilerBase): def _get_valid_strategy(self, dist_strategy, can_not_apply_optimizer_list): import copy - valid_strategy = copy.copy(dist_strategy) + valid_strategy = copy.deepcopy(dist_strategy) invalid_optimizers = [] for candidate in self._meta_optimizer_candidates: is_valid = False diff --git a/python/paddle/distributed/fleet/base/util_factory.py b/python/paddle/distributed/fleet/base/util_factory.py index f5a6c417c0c45bea819c5832f98b5b6c9fabbd4b..efaa854c0879ddb57c7746cede68047ff82931a0 
100644 --- a/python/paddle/distributed/fleet/base/util_factory.py +++ b/python/paddle/distributed/fleet/base/util_factory.py @@ -16,20 +16,18 @@ """basic collective operations in python""" """remote file system""" -__all__ = ['UtilBase'] - -import numpy as np -import os - -import subprocess -from paddle.fluid import core -from collections import OrderedDict -import paddle.fluid as fluid -from google.protobuf import text_format -from paddle.fluid import debugger -from paddle.fluid.framework import Program -from paddle.fluid.proto import framework_pb2 from ..utils.fs import FS, LocalFS, HDFSClient +from paddle.fluid.proto import framework_pb2 +from paddle.fluid.framework import Program +from paddle.fluid import debugger +from google.protobuf import text_format +import paddle.fluid as fluid +from collections import OrderedDict +from paddle.fluid import core +import subprocess +import os +import numpy as np +__all__ = ['UtilBase'] class UtilFactory(object): @@ -53,76 +51,194 @@ class UtilBase(object): def _set_role_maker(self, role_maker): self.role_maker = role_maker - def set_file_system(self, fs_client): + def _set_file_system(self, fs_client): assert isinstance( fs_client, FS ), "fs_client must be the instance of paddle.distributed.fleet.utils.FS" self.fs_client = fs_client - def __check_comm_world(self, comm_world="worker"): - if not self.role_maker._role_is_generated: - self.role_maker.generate_role() - - _comm_world = None - comm_world_upper = comm_world.upper() - if comm_world_upper == "WORKER": - if not self.role_maker.is_worker(): - print( - "warning: current role is not worker in collective_func(comm_world=\"worker\")" - ) - _comm_world = self.role_maker._node_type_comm - elif comm_world_upper == "SERVER": - if not self.role_maker.is_server(): - print( - "warning: current role is not server in collective_func(comm_world=\"server\")" - ) - _comm_world = self.role_maker._node_type_comm - elif comm_world_upper == "ALL": - _comm_world = self.role_maker._all_comm 
- else: - raise ValueError( - "not support comm_world, please choose one from [worker, server, all]" - ) + def all_reduce(self, input, mode="sum", comm_world="worker"): + """ + All reduce `input` between specified collection. This is a distributed API. - return _comm_world + Args: + input (list|numpy.array): The input variable to do all_reduce between specified collection. + mode (str): "sum" or "min" or "max". + comm_world (str, optional): Collection used to execute all_reduce operation. Supported collections incude `worker` , `server` and `all` . The default is `worker` . - def all_reduce(self, input, mode, comm_world="worker"): - _comm_world = self.__check_comm_world(comm_world) - return self.role_maker._all_reduce(_comm_world, input, mode) + Returns: + output(Numpy.array|None): A numpy array with the same shape as the `input` . + + Examples: + .. code-block:: python + + # Save the following code in `train.py` , and then execute the command `fleetrun --server_num 2 --worker_num 2 train.py` . 
+ from paddle.distributed.fleet.base.util_factory import fleet_util + import paddle.distributed.fleet as fleet + from paddle.distributed.fleet import PaddleCloudRoleMaker + import sys + import numpy as np + + def train(): + role = PaddleCloudRoleMaker( + is_collective=False, + init_gloo=True, + path="./tmp_gloo") + fleet.init(role) + fleet_util._set_role_maker(role) + + if fleet.is_server(): + input = [1, 2] + output = fleet_util.all_reduce(input, "sum", "server") + print(output) + # [2, 4] + elif fleet.is_worker(): + input = np.array([3, 4]) + output = fleet_util.all_reduce(input, "sum", "worker") + print(output) + # [6, 8] + output = fleet_util.all_reduce(input, "sum", "all") + print(output) + # [8, 12] + if __name__ == "__main__": + train() + """ + return self.role_maker._all_reduce(input, mode, comm_world) def barrier(self, comm_world="worker"): - _comm_world = self.__check_comm_world(comm_world) - self.role_maker._barrier(_comm_world) + """ + Barrier between specified collection. + + Args: + comm_world (str, optional): Collection used to execute barrier operation. Supported collections incude `worker` , `server` and `all` . The default is `worker` . + + Examples: + + .. code-block:: python + # Save the following code in `train.py` , and then execute the command `fleetrun --server_num 2 --worker_num 2 train.py` . 
+ + from paddle.distributed.fleet.base.util_factory import fleet_util + import paddle.distributed.fleet as fleet + from paddle.distributed.fleet import PaddleCloudRoleMaker + import sys + + def train(): + role = PaddleCloudRoleMaker( + is_collective=False, + init_gloo=True, + path="./tmp_gloo") + fleet.init(role) + fleet_util._set_role_maker(role) + + if fleet.is_server(): + fleet_util.barrier("server") + print("all server arrive here") + elif fleet.is_worker(): + fleet_util.barrier("worker") + print("all server arrive here") + fleet_util.barrier("all") + print("all servers and workers arrive here") + + if __name__ == "__main__": + train() + """ + self.role_maker._barrier(comm_world) def all_gather(self, input, comm_world="worker"): - _comm_world = self.__check_comm_world(comm_world) - return self.role_maker._all_gather(_comm_world, input) + """ + All gather `input` between specified collection. + + Args: + input (Int|Float): The input variable to do all_gather between specified collection. + comm_world (str, optional): Collection used to execute all_reduce operation. Supported collections incude `worker` , `server` and `all` . The default is `worker` . + + Returns: + output (List): A list of gathered values. + + Examples: + + .. code-block:: python + + # Save the following code in `train.py` , and then execute the command `fleetrun --server_num 2 --worker_num 2 train.py` . 
+ from paddle.distributed.fleet.base.util_factory import fleet_util + import paddle.distributed.fleet as fleet + from paddle.distributed.fleet import PaddleCloudRoleMaker + import sys + + def train(): + role = PaddleCloudRoleMaker( + is_collective=False, + init_gloo=True, + path="./tmp_gloo") + fleet.init(role) + fleet_util._set_role_maker(role) + + if fleet.is_server(): + input = fleet.server_index() + output = fleet_util.all_gather(input, "server") + print(output) + # output = [0, 1] + elif fleet.is_worker(): + input = fleet.worker_index() + output = fleet_util.all_gather(input, "worker") + # output = [0, 1] + print(output) + output = fleet_util.all_gather(input, "all") + print(output) + # output = [0, 1, 0, 1] + + if __name__ == "__main__": + train() + """ + + return self.role_maker._all_gather(input, comm_world) - def broadcast(self): + def _broadcast(self): pass - def scatter(self): + def _scatter(self): pass def get_file_shard(self, files): """ - split files before distributed training, - example 1: files is [a, b, c ,d, e] and trainer_num = 2, then trainer - 0 gets [a, b, c] and trainer 1 gets [d, e]. - example 2: files is [a, b], and trainer_num = 3, then trainer 0 gets - [a], trainer 1 gets [b], trainer 2 gets [] + Split files before distributed training, and return filelist assigned to the current trainer. + + .. code-block:: text + + example 1: files is [a, b, c ,d, e] and trainer_num = 2, then trainer + 0 gets [a, b, c] and trainer 1 gets [d, e]. + example 2: files is [a, b], and trainer_num = 3, then trainer 0 gets + [a], trainer 1 gets [b], trainer 2 gets [] Args: - files(list): file list need to be read. + files(list): File list need to be read. Returns: - list: files belongs to this worker. + List: Files belong to this worker. + + Examples: + + .. 
code-block:: python + + from paddle.distributed.fleet.base.util_factory import fleet_util + import paddle.distributed.fleet.base.role_maker as role_maker + + role = role_maker.UserDefinedRoleMaker( + is_collective=False, + init_gloo=False, + current_id=0, + role=role_maker.Role.WORKER, + worker_endpoints=["127.0.0.1:6003", "127.0.0.1:6004"], + server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"]) + fleet_util._set_role_maker(role) + files = fleet_util.get_file_shard(["file1", "file2", "file3"]) + # files = ["file1", "file2"] """ if not isinstance(files, list): raise TypeError("files should be a list of file need to be read.") - trainer_id = self.role_maker.worker_index() - trainers = self.role_maker.worker_num() + trainer_id = self.role_maker._worker_index() + trainers = self.role_maker._worker_num() remainder = len(files) % trainers blocksize = int(len(files) / trainers) @@ -140,7 +256,31 @@ class UtilBase(object): return trainer_files[trainer_id] def print_on_rank(self, message, rank_id): - if self.role_maker.worker_index() != rank_id: + """ + Woker of rank `rank_id` print some message. + + Args: + message(str): Log to be printed. + rank_id(int): trainer id. + + Examples: + + .. 
code-block:: python + + from paddle.distributed.fleet.base.util_factory import fleet_util + import paddle.distributed.fleet.base.role_maker as role_maker + + role = role_maker.UserDefinedRoleMaker( + is_collective=False, + init_gloo=False, + current_id=0, + role=role_maker.Role.WORKER, + worker_endpoints=["127.0.0.1:6003", "127.0.0.1:6004"], + server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"]) + fleet_util._set_role_maker(role) + fleet_util.print_on_rank("I'm worker 0", 0) + """ + if self.role_maker._worker_index() != rank_id: return print(message) @@ -297,7 +437,7 @@ class UtilBase(object): with fluid.scope_guard(scope): inference_program, feed_target_names, fetch_targets = \ fluid.io.load_inference_model(config.dump_model_dir, exe, model_filename=model_filename, - params_filename=config.save_params_filename) + params_filename=config.save_params_filename) # check program vars and saved vars shape orig_para_shape = { diff --git a/python/paddle/distributed/fleet/cloud_utils.py b/python/paddle/distributed/fleet/cloud_utils.py index 49d66118d902e43f7ee0c4003c516081092b2a97..a1203bed85cadd859132ad67159b604c7b78916b 100644 --- a/python/paddle/distributed/fleet/cloud_utils.py +++ b/python/paddle/distributed/fleet/cloud_utils.py @@ -19,7 +19,7 @@ from paddle.distributed.fleet.launch_utils import get_cluster, logger def get_cloud_cluster(args_node_ips, selected_gpus, args_port=6170): """ - args_node_ips, args_node_ip:string + args_node_ips:string, selected_gpus:list, args_port: int """ #you can automatically get ip info while using paddlecloud multi nodes mode. 
node_ips = os.getenv("PADDLE_TRAINERS") @@ -31,6 +31,9 @@ def get_cloud_cluster(args_node_ips, selected_gpus, args_port=6170): node_rank = os.getenv("PADDLE_TRAINER_ID") assert node_rank is not None, "PADDLE_TRAINER_ID should not be None" + paddle_ports_num = int(os.getenv("TRAINER_PORTS_NUM")) + assert paddle_ports_num is not None, "TRAINER_PORTS_NUM should not be None" + node_ips = node_ips.split(",") num_nodes = len(node_ips) node_rank = int(node_rank) @@ -42,32 +45,47 @@ automatically got from PADDLE_TRAINERS(multi nodes) or POD_IP(single node).\ Your input cluster_node_ips: {} doesn't equals to IPs: {} from \ paddlecloud environment.".format(args_node_ips, node_ips)) - started_port = args_port - print("num_nodes:", num_nodes) - if num_nodes > 1: - try: - paddle_port = int(os.getenv("PADDLE_PORT", "")) - paddle_port_num = int(os.getenv("TRAINER_PORTS_NUM", "")) - - if paddle_port_num >= len( - selected_gpus) and paddle_port != args_port: - logger.warning("Use Cloud specified port:{}.".format( - paddle_port)) - started_port = paddle_port - - except Exception as e: - print(e) - pass - - if started_port is None: - started_port = 6170 - - logger.debug("parsed from args:node_ips:{} \ - node_ip:{} node_rank:{} started_port:{}" - .format(node_ips, node_ip, node_rank, started_port)) - - ports = [x for x in range(started_port, started_port + len(selected_gpus))] - cluster, pod = get_cluster(node_ips, node_ip, ports, selected_gpus) + # DISTRIBUTED_TRAINER_ENDPOINTS: new environment since paddlecloud 1.8.4 + # e.g: DISTRIBUTED_TRAINER_ENDPOINTS="ip1:port1,ip1:port2,ip1:port3,ip1:port4,ip2:port5,ip2:port6,ip2:port7,ip2:port8" + trainer_endpoints = os.getenv("DISTRIBUTED_TRAINER_ENDPOINTS") + if trainer_endpoints is None: + started_port = args_port + if num_nodes > 1: + try: + paddle_port = int(os.getenv("PADDLE_PORT", "")) + + if paddle_ports_num >= len( + selected_gpus) and paddle_port != args_port: + logger.warning("Use Cloud specified port:{}.".format( + paddle_port)) + 
started_port = paddle_port + + except Exception as e: + print(e) + pass + + if started_port is None: + started_port = 6170 + ports = [ + x for x in range(started_port, started_port + len(selected_gpus)) + ] + trainer_endpoints = [] + for ip in node_ips: + trainer_endpoints.append(["%s:%d" % (ip, port) for port in ports]) + else: + trainer_endpoints_ori = trainer_endpoints.split(",") + trainer_endpoints = [] + assert num_nodes * paddle_ports_num == len(trainer_endpoints_ori) + for i in range(num_nodes): + trainer_endpoints.append(trainer_endpoints_ori[ + i * paddle_ports_num:(i + 1) * paddle_ports_num]) + + logger.debug("parsed from args: node_ips:{} \ + node_ip:{} node_rank:{} trainer_endpoints:{}" + .format(node_ips, node_ip, node_rank, trainer_endpoints)) + + cluster, pod = get_cluster(node_ips, node_ip, trainer_endpoints, + selected_gpus) return cluster, cluster.pods[node_rank] @@ -75,7 +93,8 @@ def use_paddlecloud(): node_ips = os.getenv("PADDLE_TRAINERS") node_ip = os.getenv("POD_IP") node_rank = os.getenv("PADDLE_TRAINER_ID") - if node_ips is None or node_ip is None or node_rank is None: + paddle_ports_num = os.getenv("TRAINER_PORTS_NUM") + if node_ips is None or node_ip is None or node_rank is None or paddle_ports_num is None: return False else: return True diff --git a/python/paddle/distributed/fleet/dataset/dataset.py b/python/paddle/distributed/fleet/dataset/dataset.py index f6504cacd9680806a13b4bb815247124b7e6a23c..5bd971181ed34e53ec90a31eb7371071372d443a 100644 --- a/python/paddle/distributed/fleet/dataset/dataset.py +++ b/python/paddle/distributed/fleet/dataset/dataset.py @@ -14,54 +14,11 @@ """This is definition of dataset class, which is high performance IO.""" import paddle -import paddle.fluid as fluid from paddle.fluid.proto import data_feed_pb2 from google.protobuf import text_format import paddle.fluid.core as core -class DatasetFactory(object): - """ - DatasetFactory is a factory which create dataset by its name, - you can create "QueueDataset" 
or "InMemoryDataset", or "FileInstantDataset", - the default is "QueueDataset". - - Example: - .. code-block:: python - - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") - - """ - - def __init__(self): - """ Init. """ - pass - - def create_dataset(self, datafeed_class="QueueDataset"): - """ - Create "QueueDataset" or "InMemoryDataset", or "FileInstantDataset", - the default is "QueueDataset". - - Args: - datafeed_class(str): datafeed class name, QueueDataset or InMemoryDataset. - Default is QueueDataset. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset() - - """ - try: - dataset = globals()[datafeed_class]() - return dataset - except: - raise ValueError("datafeed class %s does not exist" % - datafeed_class) - - class DatasetBase(object): """ Base dataset class. """ @@ -75,96 +32,67 @@ class DatasetBase(object): self.thread_num = 1 self.filelist = [] - def set_pipe_command(self, pipe_command): + def init(self, + batch_size=1, + thread_num=1, + use_var=[], + pipe_command="cat", + input_type=0, + fs_name="", + fs_ugi="", + download_cmd="cat"): """ - Set pipe command of current dataset - A pipe command is a UNIX pipeline command that can be used only - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset() - dataset.set_pipe_command("python my_script.py") + should be called only once in user's python scripts to initialize setings of dataset instance. + Normally, it is called by InMemoryDataset or QueueDataset. Args: - pipe_command(str): pipe command + batch_size(int): batch size. It will be effective during training. default is 1. + thread_num(int): thread num, it is the num of readers. default is 1. + use_var(list): list of variables. Variables which you will use. default is []. + pipe_command(str): pipe command of current dataset. 
A pipe command is a UNIX pipeline command that can be used only. default is "cat" + input_type(int): the input type of generated input. 0 is for one sample, 1 is for one batch. defalut is 0. + fs_name(str): fs name. default is "". + fs_ugi(str): fs ugi. default is "". + download_cmd(str): customized download command. default is "cat" - """ - self.proto_desc.pipe_command = pipe_command - def set_rank_offset(self, rank_offset): """ - Set rank_offset for merge_pv. It set the message of Pv. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset() - dataset.set_rank_offset("rank_offset") - - Args: - rank_offset(str): rank_offset's name + self._set_batch_size(batch_size) + self._set_thread(thread_num) + self._set_use_var(use_var) + self._set_pipe_command(pipe_command) + self._set_input_type(input_type) + self._set_hdfs_config(fs_name, fs_ugi) + self._set_download_cmd(download_cmd) + def _set_pipe_command(self, pipe_command): """ - self.proto_desc.rank_offset = rank_offset + Set pipe command of current dataset + A pipe command is a UNIX pipeline command that can be used only - def set_fea_eval(self, record_candidate_size, fea_eval=True): - """ - set fea eval mode for slots shuffle to debug the importance level of - slots(features), fea_eval need to be set True for slots shuffle. - - Args: - record_candidate_size(int): size of instances candidate to shuffle - one slot - fea_eval(bool): whether enable fea eval mode to enable slots shuffle. - default is True. - Examples: .. 
code-block:: python - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") - dataset.set_fea_eval(1000000, True) + import paddle + dataset = paddle.distributed.fleet.dataset.DatasetBase() + dataset._set_pipe_command("python my_script.py") - """ - if fea_eval: - self.dataset.set_fea_eval(fea_eval, record_candidate_size) - self.fea_eval = fea_eval - - def slots_shuffle(self, slots): - """ - Slots Shuffle - Slots Shuffle is a shuffle method in slots level, which is usually used - in sparse feature with large scale of instances. To compare the metric, i.e. - auc while doing slots shuffle on one or several slots with baseline to - evaluate the importance level of slots(features). - Args: - slots(list[string]): the set of slots(string) to do slots shuffle. + pipe_command(str): pipe command - Examples: - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") - dataset.set_merge_by_lineid() - #suppose there is a slot 0 - dataset.slots_shuffle(['0']) """ - if self.fea_eval: - slots_set = set(slots) - self.dataset.slots_shuffle(slots_set) + self.proto_desc.pipe_command = pipe_command - def set_batch_size(self, batch_size): + def _set_batch_size(self, batch_size): """ Set batch size. Will be effective during training Examples: .. code-block:: python - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset() - dataset.set_batch_size(128) + import paddle + dataset = paddle.distributed.fleet.DatasetBase() + dataset._set_batch_size(128) Args: batch_size(int): batch size @@ -172,32 +100,16 @@ class DatasetBase(object): """ self.proto_desc.batch_size = batch_size - def set_pv_batch_size(self, pv_batch_size): - """ - Set pv batch size. It will be effective during enable_pv_merge - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset() - dataset.set_pv_batch(128) - Args: - pv_batch_size(int): pv batch size - - """ - self.proto_desc.pv_batch_size = pv_batch_size - - def set_thread(self, thread_num): + def _set_thread(self, thread_num): """ Set thread num, it is the num of readers. Examples: .. code-block:: python - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset() - dataset.set_thread(12) + import paddle + dataset = paddle.distributed.fleet.DatasetBase() + dataset._set_thread(12) Args: thread_num(int): thread num @@ -212,8 +124,8 @@ class DatasetBase(object): Examples: .. code-block:: python - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset() + import paddle + dataset = paddle.distributed.fleet.DatasetBase() dataset.set_filelist(['a.txt', 'b.txt']) Args: @@ -222,19 +134,19 @@ class DatasetBase(object): self.dataset.set_filelist(filelist) self.filelist = filelist - def set_input_type(self, input_type): + def _set_input_type(self, input_type): self.proto_desc.input_type = input_type - def set_use_var(self, var_list): + def _set_use_var(self, var_list): """ Set Variables which you will use. Examples: .. code-block:: python - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset() - dataset.set_use_var([data, label]) + import paddle + dataset = paddle.distributed.fleet.DatasetBase() + dataset._set_use_var([data, label]) Args: var_list(list): variable list @@ -253,19 +165,19 @@ class DatasetBase(object): slot_var.type = "uint64" else: raise ValueError( - "Currently, fluid.dataset only supports dtype=float32 and dtype=int64" + "Currently, paddle.distributed.fleet.dataset only supports dtype=float32 and dtype=int64" ) - def set_hdfs_config(self, fs_name, fs_ugi): + def _set_hdfs_config(self, fs_name, fs_ugi): """ Set hdfs config: fs name ad ugi Examples: .. 
code-block:: python - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset() - dataset.set_hdfs_config("my_fs_name", "my_fs_ugi") + import paddle + dataset = paddle.distributed.fleet.DatasetBase() + dataset._set_hdfs_config("my_fs_name", "my_fs_ugi") Args: fs_name(str): fs name @@ -273,16 +185,16 @@ class DatasetBase(object): """ self.dataset.set_hdfs_config(fs_name, fs_ugi) - def set_download_cmd(self, download_cmd): + def _set_download_cmd(self, download_cmd): """ Set customized download cmd: download_cmd Examples: .. code-block:: python - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset() - dataset.set_download_cmd("./read_from_afs") + import paddle + dataset = paddle.distributed.fleet.DatasetBase() + dataset._set_download_cmd("./read_from_afs") Args: download_cmd(str): customized download command @@ -297,22 +209,22 @@ class DatasetBase(object): if self.thread_num > len(self.filelist): self.thread_num = len(self.filelist) self.dataset.set_thread_num(self.thread_num) - self.dataset.set_data_feed_desc(self.desc()) + self.dataset.set_data_feed_desc(self._desc()) self.dataset.create_readers() def _finish_to_run(self): self.dataset.destroy_readers() - def desc(self): + def _desc(self): """ Returns a protobuf message for this DataFeedDesc Examples: .. code-block:: python - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset() - print(dataset.desc()) + import paddle + dataset = paddle.distributed.fleet.DatasetBase() + print(dataset._desc()) Returns: A string message @@ -330,10 +242,10 @@ class InMemoryDataset(DatasetBase): """ InMemoryDataset, it will load data into memory and shuffle data before training. 
- This class should be created by DatasetFactory Example: - dataset = paddle.fluid.DatasetFactory().create_dataset("InMemoryDataset") + import paddle + dataset = paddle.distributed.InMemoryDataset() """ def __init__(self): @@ -351,7 +263,229 @@ class InMemoryDataset(DatasetBase): self.merge_by_lineid = False self.fleet_send_sleep_seconds = None - def set_feed_type(self, data_feed_type): + def _init_distributed_settings(self, **kwargs): + """ + should be called only once in user's python scripts to initialize distributed-related setings of dataset instance + Args: + kwargs: Keyword arguments. Currently, we support following keys in **kwargs: + + merge_size(int): ins size to merge, if merge_size > 0, set merge by line id, + instances of same line id will be merged after shuffle, + you should parse line id in data generator. default is -1. + parse_ins_id(bool): Set if Dataset need to parse ins_id. default is False. + parse_content(bool): Set if Dataset need to parse content. default is False. + fleet_send_batch_size(int): Set fleet send batch size in one rpc, default is 1024 + fleet_send_sleep_seconds(int): Set fleet send sleep time, default is 0 + fea_eval(bool): Set if Dataset need to do feature importance evaluation using slots shuffle. + default is False. + candidate_size(int): if fea_eval is set True, set the candidate size used in slots shuffle. + + Examples: + .. 
code-block:: python + + import paddle + dataset = paddle.distributed.InMemoryDataset() + dataset.init( + batch_size=1, + thread_num=2, + input_type=1, + pipe_command="cat", + use_var=[]) + dataset._init_distributed_settings( + parse_ins_id=True, + parse_content=True, + fea_eval=True, + candidate_size=10000) + + """ + merge_size = kwargs.get("merge_size", -1) + if merge_size > 0: + self._set_merge_by_lineid(merge_size) + + parse_ins_id = kwargs.get("parse_ins_id", False) + self._set_parse_ins_id(parse_ins_id) + + parse_content = kwargs.get("parse_content", False) + self._set_parse_content(parse_content) + + fleet_send_batch_size = kwargs.get("fleet_send_batch_size", None) + if fleet_send_batch_size: + self._set_fleet_send_batch_size(fleet_send_batch_size) + + fleet_send_sleep_seconds = kwargs.get("fleet_send_sleep_seconds", None) + if fleet_send_sleep_seconds: + self._set_fleet_send_sleep_seconds(fleet_send_sleep_seconds) + + fea_eval = kwargs.get("fea_eval", False) + if fea_eval: + candidate_size = kwargs.get("candidate_size", 10000) + self._set_fea_eval(candidate_size, True) + + def update_settings(self, **kwargs): + """ + should be called in user's python scripts to update setings of dataset instance + Args: + kwargs: Keyword arguments. Currently, we support following keys in **kwargs, + including single node settings and advanced distributed related settings: + + batch_size(int): batch size. It will be effective during training. default is 1. + thread_num(int): thread num, it is the num of readers. default is 1. + use_var(list): list of variables. Variables which you will use. default is []. + input_type(int): the input type of generated input. 0 is for one sample, 1 is for one batch. defalut is 0. + fs_name(str): fs name. default is "". + fs_ugi(str): fs ugi. default is "". + pipe_command(str): pipe command of current dataset. A pipe command is a UNIX pipeline command that can be used only. default is "cat" + download_cmd(str): customized download command. 
default is "cat" + data_feed_type(str): data feed type used in c++ code. default is "MultiSlotInMemoryDataFeed". + queue_num(int): Dataset output queue num, training threads get data from queues. default is-1, which is set same as thread number in c++. + + merge_size(int): ins size to merge, if merge_size > 0, set merge by line id, + instances of same line id will be merged after shuffle, + you should parse line id in data generator. default is -1. + parse_ins_id(bool): Set if Dataset need to parse ins_id. default is False. + parse_content(bool): Set if Dataset need to parse content. default is False. + fleet_send_batch_size(int): Set fleet send batch size in one rpc, default is 1024 + fleet_send_sleep_seconds(int): Set fleet send sleep time, default is 0 + fea_eval(bool): Set if Dataset need to do feature importance evaluation using slots shuffle. + default is False. + candidate_size(int): if fea_eval is set True, set the candidate size used in slots shuffle. + + Examples: + .. code-block:: python + + import paddle + dataset = paddle.distributed.InMemoryDataset() + dataset.init( + batch_size=1, + thread_num=2, + input_type=1, + pipe_command="cat", + use_var=[]) + dataset._init_distributed_settings( + parse_ins_id=True, + parse_content=True, + fea_eval=True, + candidate_size=10000) + dataset.update_settings(batch_size=2) + + """ + for key in kwargs: + if key == "pipe_command": + self._set_pipe_command(kwargs[key]) + elif key == "batch_size": + self._set_batch_size(kwargs[key]) + elif key == "thread_num": + self._set_thread(kwargs[key]) + elif key == "use_var": + self._set_use_var(kwargs[key]) + elif key == "input_type": + self._set_input_type(kwargs[key]) + elif key == "fs_name" and "fs_ugi" in kwargs: + self._set_hdfs_config(kwargs[key], kwargs["fs_ugi"]) + elif key == "download_cmd": + self._set_download_cmd(kwargs[key]) + elif key == "merge_size" and kwargs.get("merge_size", -1) > 0: + self._set_merge_by_lineid(kwargs[key]) + elif key == "parse_ins_id": + 
self._set_parse_ins_id(kwargs[key]) + elif key == "parse_content": + self._set_parse_content(kwargs[key]) + elif key == "fleet_send_batch_size": + self._set_fleet_send_batch_size(kwargs[key]) + elif key == "fleet_send_sleep_seconds": + self._set_fleet_send_sleep_seconds(kwargs[key]) + elif key == "fea_eval" and kwargs[key] == True: + candidate_size = kwargs.get("candidate_size", 10000) + self._set_fea_eval(candidate_size, True) + + def init(self, **kwargs): + """ + should be called only once in user's python scripts to initialize setings of dataset instance + Args: + kwargs: Keyword arguments. Currently, we support following keys in **kwargs: + + batch_size(int): batch size. It will be effective during training. default is 1. + thread_num(int): thread num, it is the num of readers. default is 1. + use_var(list): list of variables. Variables which you will use. default is []. + input_type(int): the input type of generated input. 0 is for one sample, 1 is for one batch. defalut is 0. + fs_name(str): fs name. default is "". + fs_ugi(str): fs ugi. default is "". + pipe_command(str): pipe command of current dataset. A pipe command is a UNIX pipeline command that can be used only. default is "cat" + download_cmd(str): customized download command. default is "cat" + data_feed_type(str): data feed type used in c++ code. default is "MultiSlotInMemoryDataFeed". + queue_num(int): Dataset output queue num, training threads get data from queues. default is -1, which is set same as thread number in c++. + + Examples: + .. 
code-block:: python + + import paddle + with open("test_queue_dataset_run_a.txt", "w") as f: + data = "2 1 2 2 5 4 2 2 7 2 1 3\n" + data += "2 6 2 2 1 4 2 2 4 2 2 3\n" + data += "2 5 2 2 9 9 2 2 7 2 1 3\n" + data += "2 7 2 2 1 9 2 3 7 2 5 3\n" + f.write(data) + with open("test_queue_dataset_run_b.txt", "w") as f: + data = "2 1 2 2 5 4 2 2 7 2 1 3\n" + data += "2 6 2 2 1 4 2 2 4 2 2 3\n" + data += "2 5 2 2 9 9 2 2 7 2 1 3\n" + data += "2 7 2 2 1 9 2 3 7 2 5 3\n" + f.write(data) + + slots = ["slot1", "slot2", "slot3", "slot4"] + slots_vars = [] + for slot in slots: + var = fluid.data( + name=slot, shape=[None, 1], dtype="int64", lod_level=1) + slots_vars.append(var) + + dataset = paddle.distributed.InMemoryDataset() + dataset.init( + batch_size=1, + thread_num=2, + input_type=1, + pipe_command="cat", + use_var=slots_vars) + dataset.set_filelist( + ["test_queue_dataset_run_a.txt", "test_queue_dataset_run_b.txt"]) + dataset.load_into_memory() + + exe = fluid.Executor(fluid.CPUPlace() if not core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0)) + exe.run(fluid.default_startup_program()) + exe.train_from_dataset(fluid.default_main_program(), + dataset) + os.remove("./test_queue_dataset_run_a.txt") + os.remove("./test_queue_dataset_run_b.txt") + """ + batch_size = kwargs.get("batch_size", 1) + thread_num = kwargs.get("thread_num", 1) + use_var = kwargs.get("use_var", []) + input_type = kwargs.get("input_type", 0) + fs_name = kwargs.get("fs_name", "") + fs_ugi = kwargs.get("fs_ugi", "") + pipe_command = kwargs.get("pipe_command", "cat") + download_cmd = kwargs.get("download_cmd", "cat") + + super(InMemoryDataset, self).init( + batch_size=batch_size, + thread_num=thread_num, + use_var=use_var, + pipe_command=pipe_command, + input_type=input_type, + fs_name=fs_name, + fs_ugi=fs_ugi, + download_cmd=download_cmd) + + data_feed_type = kwargs.get("data_feed_type", + "MultiSlotInMemoryDataFeed") + self._set_feed_type(data_feed_type) + + if kwargs.get("queue_num", -1) > 0: + 
queue_num = kwargs.get("queue_num", -1) + self._set_queue_num(queue_num) + + def _set_feed_type(self, data_feed_type): """ Set data_feed_desc """ @@ -373,7 +507,7 @@ class InMemoryDataset(DatasetBase): self.dataset.set_parse_logkey(self.parse_logkey) self.dataset.set_merge_by_sid(self.merge_by_sid) self.dataset.set_enable_pv_merge(self.enable_pv_merge) - self.dataset.set_data_feed_desc(self.desc()) + self.dataset.set_data_feed_desc(self._desc()) self.dataset.create_channel() self.dataset.create_readers() @@ -387,7 +521,7 @@ class InMemoryDataset(DatasetBase): self.dataset.dynamic_adjust_channel_num(self.thread_num, False) self.dataset.dynamic_adjust_readers_num(self.thread_num) - def set_queue_num(self, queue_num): + def _set_queue_num(self, queue_num): """ Set Dataset output queue num, training threads get data from queues @@ -397,17 +531,17 @@ class InMemoryDataset(DatasetBase): Examples: .. code-block:: python - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") - dataset.set_queue_num(12) + import paddle + dataset = paddle.distributed.InMemoryDataset() + dataset._set_queue_num(12) """ self.is_user_set_queue_num = True self.queue_num = queue_num - def set_parse_ins_id(self, parse_ins_id): + def _set_parse_ins_id(self, parse_ins_id): """ - Set id Dataset need to parse insid + Set if Dataset need to parse insid Args: parse_ins_id(bool): if parse ins_id or not @@ -415,14 +549,14 @@ class InMemoryDataset(DatasetBase): Examples: .. code-block:: python - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") - dataset.set_parse_ins_id(True) + import paddle + dataset = paddle.distributed.InMemoryDataset() + dataset._set_parse_ins_id(True) """ self.parse_ins_id = parse_ins_id - def set_parse_content(self, parse_content): + def _set_parse_content(self, parse_content): """ Set if Dataset need to parse content @@ -432,120 +566,14 @@ class InMemoryDataset(DatasetBase): Examples: .. 
code-block:: python - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") - dataset.set_parse_content(True) + import paddle + dataset = paddle.distributed.InMemoryDataset() + dataset._set_parse_content(True) """ self.parse_content = parse_content - def set_parse_logkey(self, parse_logkey): - """ - Set if Dataset need to parse logkey - - Args: - parse_content(bool): if parse logkey or not - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") - dataset.set_parse_logkey(True) - - """ - self.parse_logkey = parse_logkey - - def set_merge_by_sid(self, merge_by_sid): - """ - Set if Dataset need to merge sid. If not, one ins means one Pv. - - Args: - merge_by_sid(bool): if merge sid or not - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") - dataset.set_merge_by_sid(True) - - """ - self.merge_by_sid = merge_by_sid - - def set_enable_pv_merge(self, enable_pv_merge): - """ - Set if Dataset need to merge pv. - - Args: - enable_pv_merge(bool): if enable_pv_merge or not - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") - dataset.set_enable_pv_merge(True) - - """ - self.enable_pv_merge = enable_pv_merge - - def preprocess_instance(self): - """ - Merge pv instance and convey it from input_channel to input_pv_channel. - It will be effective when enable_pv_merge_ is True. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") - filelist = ["a.txt", "b.txt"] - dataset.set_filelist(filelist) - dataset.load_into_memory() - dataset.preprocess_instance() - - """ - self.dataset.preprocess_instance() - - def set_current_phase(self, current_phase): - """ - Set current phase in train. 
It is useful for untest. - current_phase : 1 for join, 0 for update. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") - filelist = ["a.txt", "b.txt"] - dataset.set_filelist(filelist) - dataset.load_into_memory() - dataset.set_current_phase(1) - - """ - self.dataset.set_current_phase(current_phase) - - def postprocess_instance(self): - """ - Divide pv instance and convey it to input_channel. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") - filelist = ["a.txt", "b.txt"] - dataset.set_filelist(filelist) - dataset.load_into_memory() - dataset.preprocess_instance() - exe.train_from_dataset(dataset) - dataset.postprocess_instance() - - """ - self.dataset.postprocess_instance() - - def set_fleet_send_batch_size(self, fleet_send_batch_size=1024): + def _set_fleet_send_batch_size(self, fleet_send_batch_size=1024): """ Set fleet send batch size, default is 1024 @@ -555,14 +583,14 @@ class InMemoryDataset(DatasetBase): Examples: .. code-block:: python - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") - dataset.set_fleet_send_batch_size(800) + import paddle + dataset = paddle.distributed.InMemoryDataset() + dataset._set_fleet_send_batch_size(800) """ self.fleet_send_batch_size = fleet_send_batch_size - def set_fleet_send_sleep_seconds(self, fleet_send_sleep_seconds=0): + def _set_fleet_send_sleep_seconds(self, fleet_send_sleep_seconds=0): """ Set fleet send sleep time, default is 0 @@ -572,14 +600,14 @@ class InMemoryDataset(DatasetBase): Examples: .. 
code-block:: python - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") - dataset.set_fleet_send_sleep_seconds(2) + import paddle + dataset = paddle.distributed.InMemoryDataset() + dataset._set_fleet_send_sleep_seconds(2) """ self.fleet_send_sleep_seconds = fleet_send_sleep_seconds - def set_merge_by_lineid(self, merge_size=2): + def _set_merge_by_lineid(self, merge_size=2): """ Set merge by line id, instances of same line id will be merged after shuffle, you should parse line id in data generator. @@ -590,22 +618,22 @@ class InMemoryDataset(DatasetBase): Examples: .. code-block:: python - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") - dataset.set_merge_by_lineid() + import paddle + dataset = paddle.distributed.InMemoryDataset() + dataset._set_merge_by_lineid() """ self.dataset.set_merge_by_lineid(merge_size) self.merge_by_lineid = True self.parse_ins_id = True - def set_generate_unique_feasigns(self, generate_uni_feasigns, shard_num): + def _set_generate_unique_feasigns(self, generate_uni_feasigns, shard_num): self.dataset.set_generate_unique_feasigns(generate_uni_feasigns) self.gen_uni_feasigns = generate_uni_feasigns self.local_shard_num = shard_num - def generate_local_tables_unlock(self, table_id, fea_dim, read_thread_num, - consume_thread_num, shard_num): + def _generate_local_tables_unlock(self, table_id, fea_dim, read_thread_num, + consume_thread_num, shard_num): self.dataset.generate_local_tables_unlock( table_id, fea_dim, read_thread_num, consume_thread_num, shard_num) @@ -616,8 +644,8 @@ class InMemoryDataset(DatasetBase): Examples: .. 
code-block:: python - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + import paddle + dataset = paddle.distributed.InMemoryDataset() filelist = ["a.txt", "b.txt"] dataset.set_filelist(filelist) dataset.load_into_memory() @@ -635,8 +663,8 @@ class InMemoryDataset(DatasetBase): Examples: .. code-block:: python - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + import paddle + dataset = paddle.distributed.InMemoryDataset() filelist = ["a.txt", "b.txt"] dataset.set_filelist(filelist) dataset.preload_into_memory() @@ -656,8 +684,8 @@ class InMemoryDataset(DatasetBase): Examples: .. code-block:: python - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + import paddle + dataset = paddle.distributed.InMemoryDataset() filelist = ["a.txt", "b.txt"] dataset.set_filelist(filelist) dataset.preload_into_memory() @@ -673,8 +701,8 @@ class InMemoryDataset(DatasetBase): Examples: .. code-block:: python - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + import paddle + dataset = paddle.distributed.InMemoryDataset() filelist = ["a.txt", "b.txt"] dataset.set_filelist(filelist) dataset.load_into_memory() @@ -692,9 +720,9 @@ class InMemoryDataset(DatasetBase): Examples: .. code-block:: python - import paddle.fluid as fluid + import paddle from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + dataset = paddle.distributed.InMemoryDataset() filelist = ["a.txt", "b.txt"] dataset.set_filelist(filelist) dataset.load_into_memory() @@ -736,9 +764,9 @@ class InMemoryDataset(DatasetBase): Examples: .. 
code-block:: python - import paddle.fluid as fluid + import paddle from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + dataset = paddle.distributed.InMemoryDataset() filelist = ["a.txt", "b.txt"] dataset.set_filelist(filelist) dataset.load_into_memory() @@ -751,30 +779,6 @@ class InMemoryDataset(DatasetBase): """ self.dataset.release_memory() - def get_pv_data_size(self): - """ - Get memory data size of Pv, user can call this function to know the pv num - of ins in all workers after load into memory. - - Note: - This function may cause bad performance, because it has barrier - - Returns: - The size of memory pv data. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") - filelist = ["a.txt", "b.txt"] - dataset.set_filelist(filelist) - dataset.load_into_memory() - print dataset.get_pv_data_size() - - """ - return self.dataset.get_pv_data_size() - def get_memory_data_size(self, fleet=None): """ Get memory data size, user can call this function to know the num @@ -792,9 +796,9 @@ class InMemoryDataset(DatasetBase): Examples: .. code-block:: python - import paddle.fluid as fluid + import paddle from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + dataset = paddle.distributed.InMemoryDataset() filelist = ["a.txt", "b.txt"] dataset.set_filelist(filelist) dataset.load_into_memory() @@ -829,9 +833,9 @@ class InMemoryDataset(DatasetBase): Examples: .. 
code-block:: python - import paddle.fluid as fluid + import paddle from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + dataset = paddle.distributed.InMemoryDataset() filelist = ["a.txt", "b.txt"] dataset.set_filelist(filelist) dataset.load_into_memory() @@ -849,6 +853,51 @@ class InMemoryDataset(DatasetBase): return global_data_size[0] return local_data_size[0] + def _set_fea_eval(self, record_candidate_size, fea_eval=True): + """ + set fea eval mode for slots shuffle to debug the importance level of + slots(features), fea_eval need to be set True for slots shuffle. + + Args: + record_candidate_size(int): size of instances candidate to shuffle + one slot + fea_eval(bool): whether enable fea eval mode to enable slots shuffle. + default is True. + + Examples: + .. code-block:: python + + import paddle + dataset = paddle.distributed.InMemoryDataset() + dataset._set_fea_eval(1000000, True) + + """ + if fea_eval: + self.dataset.set_fea_eval(fea_eval, record_candidate_size) + self.fea_eval = fea_eval + + def slots_shuffle(self, slots): + """ + Slots Shuffle + Slots Shuffle is a shuffle method in slots level, which is usually used + in sparse feature with large scale of instances. To compare the metric, i.e. + auc while doing slots shuffle on one or several slots with baseline to + evaluate the importance level of slots(features). + + Args: + slots(list[string]): the set of slots(string) to do slots shuffle. + + Examples: + import paddle + dataset = paddle.distributed.InMemoryDataset() + dataset.set_merge_by_lineid() + #suppose there is a slot 0 + dataset.slots_shuffle(['0']) + """ + if self.fea_eval: + slots_set = set(slots) + self.dataset.slots_shuffle(slots_set) + class QueueDataset(DatasetBase): """ @@ -857,19 +906,24 @@ class QueueDataset(DatasetBase): Examples: .. 
code-block:: python - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("QueueDataset") + import paddle + dataset = paddle.distributed.QueueDataset() """ def __init__(self): """ Initialize QueueDataset - This class should be created by DatasetFactory """ super(QueueDataset, self).__init__() self.proto_desc.name = "MultiSlotDataFeed" + def init(self, **kwargs): + """ + should be called only once in user's python scripts to initialize setings of dataset instance + """ + super(QueueDataset, self).init(**kwargs) + def _prepare_to_run(self): """ Set data_feed_desc/thread num/filelist before run, @@ -881,57 +935,9 @@ class QueueDataset(DatasetBase): self.thread_num = 1 self.dataset.set_thread_num(self.thread_num) self.dataset.set_filelist(self.filelist) - self.dataset.set_data_feed_desc(self.desc()) + self.dataset.set_data_feed_desc(self._desc()) self.dataset.create_readers() - def local_shuffle(self): - """ - Local shuffle data. - - Local shuffle is not supported in QueueDataset - NotImplementedError will be raised - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("QueueDataset") - dataset.local_shuffle() - - Raises: - NotImplementedError: QueueDataset does not support local shuffle - - """ - raise NotImplementedError( - "QueueDataset does not support local shuffle, " - "please use InMemoryDataset for local_shuffle") - - def global_shuffle(self, fleet=None): - """ - Global shuffle data. - - Global shuffle is not supported in QueueDataset - NotImplementedError will be raised - - Args: - fleet(Fleet): fleet singleton. Default None. - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet - dataset = fluid.DatasetFactory().create_dataset("QueueDataset") - dataset.global_shuffle(fleet) - - Raises: - NotImplementedError: QueueDataset does not support global shuffle - - """ - raise NotImplementedError( - "QueueDataset does not support global shuffle, " - "please use InMemoryDataset for global_shuffle") - class FileInstantDataset(DatasetBase): """ @@ -940,35 +946,22 @@ class FileInstantDataset(DatasetBase): Examples: .. code-block:: python - import paddle.fluid as fluid - dataset = fluid.DatasetFactory.create_dataset("FileInstantDataset") + import paddle + dataset = paddle.distributed.fleet.FileInstantDataset() """ def __init__(self): """ Initialize FileInstantDataset - This class should be created by DatasetFactory """ super(FileInstantDataset, self).__init__() self.proto_desc.name = "MultiSlotFileInstantDataFeed" - def local_shuffle(self): + def init(self, **kwargs): """ - Local shuffle - FileInstantDataset does not support local shuffle + should be called only once in user's python scripts to initialize setings of dataset instance """ - raise NotImplementedError( - "FileInstantDataset does not support local shuffle, " - "please use InMemoryDataset for local_shuffle") - - def global_shuffle(self, fleet=None): - """ - Global shuffle - FileInstantDataset does not support global shuffle - """ - raise NotImplementedError( - "FileInstantDataset does not support global shuffle, " - "please use InMemoryDataset for global_shuffle") + super(FileInstantDataset, self).init(**kwargs) class BoxPSDataset(InMemoryDataset): @@ -978,19 +971,119 @@ class BoxPSDataset(InMemoryDataset): Examples: .. 
code-block:: python - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset") + import paddle + dataset = paddle.distributed.fleet.BoxPSDataset() """ def __init__(self): """ Initialize BoxPSDataset - This class should be created by DatasetFactory """ super(BoxPSDataset, self).__init__() self.boxps = core.BoxPS(self.dataset) self.proto_desc.name = "PaddleBoxDataFeed" + def init(self, **kwargs): + """ + should be called only once in user's python scripts to initialize setings of dataset instance + """ + super(BoxPSDataset, self).init(**kwargs) + + rank_offset = kwargs.get("rank_offset", "") + self._set_rank_offset(rank_offset) + pv_batch_size = kwargs.get("pv_batch_size", 1) + self._set_pv_batch_size(pv_batch_size) + parse_logkey = kwargs.get("parse_logkey", False) + self._set_parse_logkey(parse_logkey) + merge_by_sid = kwargs.get("merge_by_sid", False) + self._set_merge_by_sid(merge_by_sid) + enable_pv_merge = kwargs.get("enable_pv_merge", False) + self._set_enable_pv_merge(enable_pv_merge) + + def _set_rank_offset(self, rank_offset): + """ + Set rank_offset for merge_pv. It set the message of Pv. + + Examples: + .. code-block:: python + + import paddle + dataset = paddle.distributed.fleet.BoxPSDataset() + dataset._set_rank_offset("rank_offset") + + Args: + rank_offset(str): rank_offset's name + + """ + self.proto_desc.rank_offset = rank_offset + + def _set_pv_batch_size(self, pv_batch_size): + """ + Set pv batch size. It will be effective during enable_pv_merge + + Examples: + .. code-block:: python + + import paddle + dataset = paddle.distributed.fleet.BoxPSDataset() + dataset._set_pv_batch_size(128) + Args: + pv_batch_size(int): pv batch size + + """ + self.proto_desc.pv_batch_size = pv_batch_size + + def _set_parse_logkey(self, parse_logkey): + """ + Set if Dataset need to parse logkey + + Args: + parse_content(bool): if parse logkey or not + + Examples: + .. 
code-block:: python + + import paddle + dataset = paddle.distributed.fleet.BoxPSDataset() + dataset._set_parse_logkey(True) + + """ + self.parse_logkey = parse_logkey + + def _set_merge_by_sid(self, merge_by_sid): + """ + Set if Dataset need to merge sid. If not, one ins means one Pv. + + Args: + merge_by_sid(bool): if merge sid or not + + Examples: + .. code-block:: python + + import paddle + dataset = paddle.distributed.fleet.BoxPSDataset() + dataset._set_merge_by_sid(True) + + """ + self.merge_by_sid = merge_by_sid + + def _set_enable_pv_merge(self, enable_pv_merge): + """ + Set if Dataset need to merge pv. + + Args: + enable_pv_merge(bool): if enable_pv_merge or not + + Examples: + .. code-block:: python + + import paddle + dataset = paddle.distributed.fleet.BoxPSDataset() + dataset._set_enable_pv_merge(True) + + """ + self.enable_pv_merge = enable_pv_merge + def set_date(self, date): """ Workaround for date @@ -1008,8 +1101,8 @@ class BoxPSDataset(InMemoryDataset): Examples: .. code-block:: python - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset") + import paddle + dataset = paddle.distributed.fleet.BoxPSDataset() dataset.begin_pass() """ self.boxps.begin_pass() @@ -1021,8 +1114,8 @@ class BoxPSDataset(InMemoryDataset): Examples: .. code-block:: python - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset") + import paddle + dataset = paddle.distributed.fleet.BoxPSDataset() dataset.end_pass(True) """ self.boxps.end_pass(need_save_delta) @@ -1034,8 +1127,8 @@ class BoxPSDataset(InMemoryDataset): Examples: .. code-block:: python - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset") + import paddle + dataset = paddle.distributed.fleet.BoxPSDataset() filelist = ["a.txt", "b.txt"] dataset.set_filelist(filelist) dataset.preload_into_memory() @@ -1049,8 +1142,8 @@ class BoxPSDataset(InMemoryDataset): Examples: .. 
code-block:: python - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset") + import paddle + dataset = paddle.distributed.fleet.BoxPSDataset() filelist = ["a.txt", "b.txt"] dataset.set_filelist(filelist) dataset.load_into_memory() @@ -1064,8 +1157,8 @@ class BoxPSDataset(InMemoryDataset): Examples: .. code-block:: python - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset") + import paddle + dataset = paddle.distributed.fleet.BoxPSDataset() filelist = ["a.txt", "b.txt"] dataset.set_filelist(filelist) dataset.preload_into_memory() @@ -1093,11 +1186,90 @@ class BoxPSDataset(InMemoryDataset): slots(list[string]): the set of slots(string) to do slots shuffle. Examples: - import paddle.fluid as fluid - dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + import paddle + dataset = paddle.distributed.fleet.BoxPSDataset() dataset.set_merge_by_lineid() #suppose there is a slot 0 dataset.slots_shuffle(['0']) """ slots_set = set(slots) self.boxps.slots_shuffle(slots_set) + + def set_current_phase(self, current_phase): + """ + Set current phase in train. It is useful for untest. + current_phase : 1 for join, 0 for update. + + Examples: + .. code-block:: python + + import paddle + dataset = paddle.distributed.fleet.BoxPSDataset() + filelist = ["a.txt", "b.txt"] + dataset.set_filelist(filelist) + dataset.load_into_memory() + dataset.set_current_phase(1) + + """ + self.dataset.set_current_phase(current_phase) + + def get_pv_data_size(self): + """ + Get memory data size of Pv, user can call this function to know the pv num + of ins in all workers after load into memory. + + Note: + This function may cause bad performance, because it has barrier + + Returns: + The size of memory pv data. + + Examples: + .. 
code-block:: python + + import paddle + dataset = paddle.distributed.fleet.BoxPSDataset() + filelist = ["a.txt", "b.txt"] + dataset.set_filelist(filelist) + dataset.load_into_memory() + print dataset.get_pv_data_size() + + """ + return self.dataset.get_pv_data_size() + + def preprocess_instance(self): + """ + Merge pv instance and convey it from input_channel to input_pv_channel. + It will be effective when enable_pv_merge_ is True. + + Examples: + .. code-block:: python + + import paddle + dataset = paddle.distributed.fleet.BoxPSDataset() + filelist = ["a.txt", "b.txt"] + dataset.set_filelist(filelist) + dataset.load_into_memory() + dataset.preprocess_instance() + + """ + self.dataset.preprocess_instance() + + def postprocess_instance(self): + """ + Divide pv instance and convey it to input_channel. + + Examples: + .. code-block:: python + + import paddle + dataset = paddle.distributed.fleet.BoxPSDataset() + filelist = ["a.txt", "b.txt"] + dataset.set_filelist(filelist) + dataset.load_into_memory() + dataset.preprocess_instance() + exe.train_from_dataset(dataset) + dataset.postprocess_instance() + + """ + self.dataset.postprocess_instance() diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py index 29a1bda92f17443e6c38b070379481aaa419b1d4..d63c9f9184c0eb9aafec73df09b225d598f3413f 100644 --- a/python/paddle/distributed/fleet/launch.py +++ b/python/paddle/distributed/fleet/launch.py @@ -55,7 +55,10 @@ launch a process on each of the given gpu card or cpu machine. 
""" from __future__ import print_function + +import shutil import sys +import tempfile from sys import version import subprocess import os @@ -87,7 +90,7 @@ def _parse_args(): see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/training/cluster_howto.html#permalink-8--nccl2- ''') - #Optional arguments for the launch helper + # Optional arguments for the launch helper parser.add_argument( "--ips", type=str, @@ -115,7 +118,7 @@ see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/tra default="log", help="The path for each process's log.If it's not set, the log will printed to default pipe." ) - #positional + # positional parser.add_argument( "training_script", type=str, @@ -124,7 +127,7 @@ see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/tra "followed by all the arguments for the " "training script") - #rest from the training program + # rest from the training program parser.add_argument('training_script_args', nargs=REMAINDER) return parser.parse_args() @@ -138,7 +141,7 @@ def get_cluster_from_args(args, gpus): # node_ip = args.node_ip assert node_ip in node_ips, "Can't find your local ip {%s} in node_ips: {%s}" \ - % (node_ip, node_ips) + % (node_ip, node_ips) node_rank = node_ips.index(node_ip) logger.debug("parsed from args: node_ips:{} node_ip:{} node_rank:{}".format( @@ -153,21 +156,24 @@ def get_cluster_from_args(args, gpus): else: start_port = 6070 if os.environ.get('FLAGS_START_PORT') is not None: - start_port = os.environ.get('FLAGS_START_PORT') + start_port = int(os.environ.get('FLAGS_START_PORT')) free_ports = [x for x in range(start_port, start_port + len(gpus))] - return get_cluster(node_ips, node_ip, free_ports, gpus) + trainer_endpoints = [] + for ip in node_ips: + trainer_endpoints.append(["%s:%d" % (ip, port) for port in free_ports]) + return get_cluster(node_ips, node_ip, trainer_endpoints, gpus) def get_gpus(gpus): if gpus is None: gpus_num = 
fluid.core.get_cuda_device_count() - gpus = [str(x) for x in range(0, gpus_num)] + res_gpus = [str(x) for x in range(0, gpus_num)] else: cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") if cuda_visible_devices is None or cuda_visible_devices == "": - gpus = [x.strip() for x in gpus.split(',')] + res_gpus = [x.strip() for x in gpus.split(',')] else: # change gpus into relative values # e.g. CUDA_VISIBLE_DEVICES=4,5,6,7; args.gpus=4,5,6,7; @@ -177,12 +183,16 @@ def get_gpus(gpus): assert x in cuda_visible_devices_list, "Can't find "\ "your gpus %s in CUDA_VISIBLE_DEVICES[%s]."\ % (x, cuda_visible_devices) - gpus = [ + res_gpus = [ cuda_visible_devices_list.index(x.strip()) for x in gpus.split(',') ] + logger.info("Change selected_gpus into reletive values. --ips:{} " + "will change into relative_ips:{} according to your " + "CUDA_VISIBLE_DEVICES:{}".format( + gpus, res_gpus, cuda_visible_devices_list)) - return gpus + return res_gpus def launch_collective(args): @@ -200,28 +210,40 @@ def launch_collective(args): start_port = os.environ.get('FLAGS_START_PORT') if cloud_utils.use_paddlecloud() and trainers_num != 1: cluster, pod = cloud_utils.get_cloud_cluster(args.ips, gpus, start_port) - logger.info("get cluster from cloud:{}".format(cluster)) + logger.debug("get cluster from cloud:{}".format(cluster)) else: # trainers_num = 1 or not use paddlecloud ips="a,b" cluster, pod = get_cluster_from_args(args, gpus) - logger.info("get cluster from args:{}".format(cluster)) + logger.debug("get cluster from args:{}".format(cluster)) + + global_envs = copy.copy(os.environ.copy()) + gloo_rendezvous_dir = tempfile.mkdtemp() + # add gloo env + global_envs["PADDLE_WITH_GLOO"] = "1" + global_envs["PADDLE_GLOO_RENDEZVOUS"] = "2" + global_envs["PADDLE_GLOO_FS_PATH"] = gloo_rendezvous_dir procs = start_local_trainers( cluster, pod, training_script=args.training_script, training_script_args=args.training_script_args, - log_dir=args.log_dir) + log_dir=args.log_dir, + 
envs=global_envs) while True: alive = watch_local_trainers(procs, cluster.trainers_nranks()) if not alive: - logger.info("Local procs complete, POD info:{}".format(pod)) + logger.info("Local processes completed.") + logger.debug("POD info:{}".format(pod)) break time.sleep(3) + if os.path.exists(gloo_rendezvous_dir): + shutil.rmtree(gloo_rendezvous_dir) + def launch_ps(args): ports = None @@ -272,7 +294,7 @@ def launch_ps(args): _, current_node_ip = get_host_name_ip() assert current_node_ip in node_ips, "Can't find your local ip {%s} in args.servers and args.workers ips: {%s}" \ - % (current_node_ip, node_ips) + % (current_node_ip, node_ips) node_rank = node_ips.index(current_node_ip) logger.debug( "parsed from args: node_ips:{} current_node_ip:{} node_rank:{}, server_ports:{}". @@ -307,24 +329,41 @@ def launch_ps(args): default_env = os.environ.copy() current_env = copy.copy(default_env) + + gloo_rendezvous_dir = tempfile.mkdtemp() + # add gloo env + current_env["PADDLE_WITH_GLOO"] = "1" + current_env["PADDLE_GLOO_RENDEZVOUS"] = "2" + current_env["PADDLE_GLOO_FS_PATH"] = gloo_rendezvous_dir + current_env.pop("http_proxy", None) current_env.pop("https_proxy", None) procs = [] cmds = [] log_fns = [] for idx, cur_server in enumerate(pod.servers): - current_env.update({ + proc_env = { "PADDLE_PSERVERS_IP_PORT_LIST": server_endpoints, + "PADDLE_TRAINER_ENDPOINTS": worker_endpoints, "PADDLE_PORT": cur_server.endpoint.split(":")[1], "TRAINING_ROLE": "PSERVER", "PADDLE_TRAINERS_NUM": str(worker_num), - "POD_IP": cur_server.endpoint.split(":")[0] - }) + "POD_IP": cur_server.endpoint.split(":")[0], + "PADDLE_WITH_GLOO": "1" + } + current_env.update(proc_env) cmd = [sys.executable, "-u", args.training_script ] + args.training_script_args cmds.append(cmd) + if idx == 0: + logger.info( + "Local server start {} processes. 
First process distributed " + "environment info (Only For Debug): {}".format( + len(pod.servers), + pretty_print_envs(proc_env, ("Distributed Envs", "Value")))) + if args.log_dir is not None: os.system("mkdir -p {}".format(args.log_dir)) fn = open("%s/serverlog.%d" % (args.log_dir, idx), "w") @@ -338,21 +377,33 @@ def launch_ps(args): tp.rank = cur_server.rank tp.local_rank = idx tp.log_fn = fn - tp.log_offset = 0 if fn else None + tp.log_offset = fn.tell() if fn else None tp.cmd = cmd procs.append(tp) for idx, cur_worker in enumerate(pod.workers): - current_env.update({ + proc_env = { "PADDLE_PSERVERS_IP_PORT_LIST": server_endpoints, + "PADDLE_TRAINER_ENDPOINTS": worker_endpoints, "PADDLE_TRAINERS_NUM": str(worker_num), "TRAINING_ROLE": "TRAINER", - "PADDLE_TRAINER_ID": str(cur_worker.rank) - }) + "PADDLE_TRAINER_ID": str(cur_worker.rank), + "PADDLE_WITH_GLOO": "1" + } + current_env.update(proc_env) + cmd = [sys.executable, "-u", args.training_script ] + args.training_script_args cmds.append(cmd) + + if idx == 0: + logger.info( + "Local worker start {} processes. First process distributed " + "environment info (Only For Debug): {}".format( + len(pod.workers), + pretty_print_envs(proc_env, ("Distributed Envs", "Value")))) + if args.log_dir is not None: os.system("mkdir -p {}".format(args.log_dir)) fn = open("%s/workerlog.%d" % (args.log_dir, idx), "w") @@ -366,11 +417,14 @@ def launch_ps(args): tp.rank = cur_worker.rank tp.local_rank = idx tp.log_fn = fn - tp.log_offset = 0 if fn else None + tp.log_offset = fn.tell() if fn else None tp.cmd = cmd procs.append(tp) + logger.info( + "Please check servers and workers logs in {}/workerlog.* and {}/serverlog.*". 
+ format(args.log_dir, args.log_dir)) # only wait worker to finish here for i, proc in enumerate(procs): if i < len(pod.servers): @@ -386,6 +440,9 @@ def launch_ps(args): procs[i].proc.terminate() print("all parameter server are killed", file=sys.stderr) + if os.path.exists(gloo_rendezvous_dir): + shutil.rmtree(gloo_rendezvous_dir) + def launch(): args = _parse_args() @@ -400,19 +457,22 @@ def launch(): co_arg for co_arg in collective_args if co_arg in " ".join(sys.argv[1:-1]) ] - cuda_device_num = fluid.core.get_cuda_device_count() + if fluid.core.is_compiled_with_cuda(): + cuda_device_num = fluid.core.get_cuda_device_count() + else: + cuda_device_num = 0 + if len(has_ps_args) > 0 or cuda_device_num == 0: - logger.info( - "Run parameter-sever cpu mode. pserver args:{}, cuda count:{}". - format(has_ps_args, cuda_device_num)) + logger.info("Run parameter-sever cpu mode. pserver arguments:{}".format( + has_ps_args)) launch_ps(args) elif len(has_collective_args) > 0: - logger.info("Run collective gpu mode. gpu args:{}, cuda count:{}". + logger.info("Run collective gpu mode. gpu arguments:{}, cuda count:{}". format(has_collective_args, cuda_device_num)) launch_collective(args) else: logger.warning( - "Not found distinct args. Default use gpu collective mode") + "Not found distinct arguments. 
Default use gpu collective mode") launch_collective(args) diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py index 350d8ae2b44db3e8f8e6b00d95c2b7a9ca91f88b..7540cd9f4c1f352804550561c6f75b63104f9381 100644 --- a/python/paddle/distributed/fleet/launch_utils.py +++ b/python/paddle/distributed/fleet/launch_utils.py @@ -227,18 +227,23 @@ def get_logger(log_level=20, name="root"): return logger -def get_cluster(node_ips, node_ip, paddle_ports, selected_gpus): - assert type(paddle_ports) is list, "paddle_ports must be list" +def get_cluster(node_ips, node_ip, trainer_endpoints, selected_gpus): + assert type(trainer_endpoints) is list, "trainer_endpoints must be list" cluster = Cluster(hdfs=None) trainer_rank = 0 for node_rank, ip in enumerate(node_ips): pod = Pod() pod.rank = node_rank pod.addr = ip + cur_node_endpoints = trainer_endpoints[node_rank] + # when use paddlecloud, endpoints may > selected_gpus(user_defined) + assert len(cur_node_endpoints) >= len( + selected_gpus + ), "current trainer_endpoints size should be greater equal than selected_gpus size." 
for i in range(len(selected_gpus)): trainer = Trainer() trainer.gpus.append(selected_gpus[i]) - trainer.endpoint = "%s:%d" % (ip, paddle_ports[i]) + trainer.endpoint = "%s" % (cur_node_endpoints[i]) trainer.rank = trainer_rank trainer_rank += 1 @@ -253,7 +258,8 @@ def terminate_local_procs(procs): for p in procs: if p.proc.poll() is None: p.proc.terminate() - p.log_fn.close() + if p.log_fn: + p.log_fn.close() logger.debug("terminate process id:{}".format(p.proc.pid)) #wait all process terminiated @@ -338,6 +344,46 @@ def get_ports(num, offset): return ports +def pretty_print_envs(envs, header=None): + spacing = 2 + max_k = 40 + max_v = 45 + + for k, v in envs.items(): + max_k = max(max_k, len(k)) + + h_format = " " + "|{{:>{}s}}{}{{:^{}s}}|\n".format(max_k, " " * spacing, + max_v) + l_format = " " + "|{{:>{}s}}{{}}{{:^{}s}}|\n".format(max_k, max_v) + length = max_k + max_v + spacing + + border = " +" + "".join(["="] * length) + "+" + line = " +" + "".join(["-"] * length) + "+" + + draws = "" + draws += border + "\n" + + if header: + draws += h_format.format(header[0], header[1]) + else: + draws += h_format.format("fleetrun Distributed Envs", "Value") + + draws += line + "\n" + + for k, v in envs.items(): + if isinstance(v, str) and len(v) >= max_v: + str_v = "... " + v[-41:] + else: + str_v = v + + draws += l_format.format(k, " " * spacing, str(str_v)) + + draws += border + + _str = "\n{}\n".format(draws) + return _str + + class TrainerProc(object): def __init__(self): self.proc = None @@ -352,8 +398,14 @@ def start_local_trainers(cluster, pod, training_script, training_script_args, - log_dir=None): - current_env = copy.copy(os.environ.copy()) + log_dir=None, + envs=None): + + if envs is None: + current_env = copy.copy(os.environ.copy()) + else: + current_env = copy.copy(envs) + #paddle broadcast ncclUniqueId use socket, and #proxy maybe make trainers unreachable, so delete them. 
#if we set them to "", grpc will log error message "bad uri" @@ -373,15 +425,27 @@ def start_local_trainers(cluster, current_env.update(proc_env) - logger.debug("trainer proc env:{}".format(current_env)) - cmd = [sys.executable, "-u", training_script] + training_script_args - logger.info("start trainer proc:{} env:{}".format(cmd, proc_env)) - + logger.debug("start trainer proc{} env:{}".format(cmd, current_env)) + + if idx == 0: + logger.info("Local start {} processes. First process distributed " + "environment info (Only For Debug): {}".format( + len(pod.trainers), + pretty_print_envs(proc_env, ("Distributed Envs", + "Value")))) + logger.info( + "details abouts PADDLE_TRAINER_ENDPOINTS can be found in {}/endpoints.log.". + format(log_dir)) fn = None if log_dir is not None: os.system("mkdir -p {}".format(log_dir)) + if os.path.exists("%s/endpoints.log" % log_dir): + os.system("rm -f {}/endpoints.log".format(log_dir)) + with open("%s/endpoints.log" % log_dir, "w") as f: + f.write("PADDLE_TRAINER_ENDPOINTS: \n") + f.write("\n".join(cluster.trainers_endpoints())) fn = open("%s/workerlog.%d" % (log_dir, idx), "a") proc = subprocess.Popen(cmd, env=current_env, stdout=fn, stderr=fn) else: diff --git a/python/paddle/distributed/fleet/meta_optimizers/__init__.py b/python/paddle/distributed/fleet/meta_optimizers/__init__.py index d98b2ef3e2a083861647b2847bafad3b08c86cfd..a3a2dee70387d69b9e8e09cd86d69a76890d7a1f 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/__init__.py +++ b/python/paddle/distributed/fleet/meta_optimizers/__init__.py @@ -18,6 +18,7 @@ from .graph_execution_optimizer import GraphExecutionOptimizer from .parameter_server_optimizer import ParameterServerOptimizer from .pipeline_optimizer import PipelineOptimizer from .localsgd_optimizer import LocalSGDOptimizer +from .localsgd_optimizer import AdaptiveLocalSGDOptimizer from .lars_optimizer import LarsOptimizer from .parameter_server_graph_optimizer import ParameterServerGraphOptimizer from 
.dgc_optimizer import DGCOptimizer diff --git a/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py index b1952276e44cd1466bc443440505462924115ab7..ad96e1426694f090943bdd08902e5e2219d32eda 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py @@ -22,9 +22,13 @@ class AMPOptimizer(MetaOptimizerBase): self.amp_opt = None # we do not allow meta optimizer to be inner optimizer currently self.meta_optimizers_white_list = [ - "LarsOptimizer", "LambOptimizer", "RecomputeOptimizer", - "LocalSGDOptimizer", "GradientMergeOptimizer", - "GraphExecutionOptimizer" + "LarsOptimizer", + "LambOptimizer", + "RecomputeOptimizer", + "LocalSGDOptimizer", + "GradientMergeOptimizer", + "GraphExecutionOptimizer", + "AdaptiveLocalSGDOptimizer", ] self.meta_optimizers_black_list = ["DGCOptimizer"] @@ -34,6 +38,9 @@ class AMPOptimizer(MetaOptimizerBase): loss, role_maker, user_defined_optimizer, user_defined_strategy) def _can_apply(self): + if not self.role_maker._is_collective: + return False + if self.user_defined_strategy.amp: return True return False @@ -42,6 +49,17 @@ class AMPOptimizer(MetaOptimizerBase): dist_strategy.amp = False dist_strategy.amp_configs = {} + def _enable_strategy(self, dist_strategy, context): + dist_strategy.amp = True + dist_strategy.amp_configs = { + "init_loss_scaling": 32768.0, + "incr_every_n_steps": 1000, + "decr_every_n_nan_or_inf": 2, + "incr_ratio": 2.0, + "decr_ratio": 8.0, + "use_dynamic_loss_scaling": True + } + def minimize_impl(self, loss, startup_program=None, diff --git a/python/paddle/distributed/fleet/meta_optimizers/common.py b/python/paddle/distributed/fleet/meta_optimizers/common.py index 70b010978bb4d5be98310efa8ff04a3f853602ab..8ff4114bf8eda4080c252a736d7b6ee69990faa4 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/common.py +++ 
b/python/paddle/distributed/fleet/meta_optimizers/common.py @@ -57,12 +57,12 @@ class CollectiveHelper(object): if startup_program is None: self.startup_program = fluid.default_startup_program() - endpoints = self.role_maker.get_trainer_endpoints() - current_endpoint = endpoints[self.role_maker.worker_index()] + endpoints = self.role_maker._get_trainer_endpoints() + current_endpoint = endpoints[self.role_maker._worker_index()] for ring_id in range(self.nrings): self._init_communicator( self.startup_program, current_endpoint, endpoints, - self.role_maker.worker_index(), ring_id, self.wait_port) + self.role_maker._worker_index(), ring_id, self.wait_port) self._broadcast_params() def _init_communicator(self, program, current_endpoint, endpoints, rank, diff --git a/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py index f1c6defc5c982c7d56980642898aaa333c199bbe..6806a479d30f467bd8b6f6d5c6832dda63af4055 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py @@ -47,17 +47,20 @@ class DGCOptimizer(MetaOptimizerBase): sparsity=configs['sparsity'], parameter_list=opt._parameter_list, use_nesterov=opt._use_nesterov, - num_trainers=self.role_maker.worker_num(), + num_trainers=self.role_maker._worker_num(), regularization=opt.regularization, grad_clip=opt._grad_clip, name=opt._name) def _can_apply(self): + if not self.role_maker._is_collective: + return False + if self.user_defined_strategy.dgc: if not isinstance(self.inner_opt, Momentum): logging.warn("dgc only works on Momentum optimizer") return False - if self.role_maker.worker_num() <= 1: + if self.role_maker._worker_num() <= 1: logging.warn("dgc only works on multi cards") return False @@ -69,6 +72,10 @@ class DGCOptimizer(MetaOptimizerBase): dist_strategy.dgc = False dist_strategy.dgc_configs = {} + def _enable_strategy(self, dist_strategy, 
context): + dist_strategy.dgc = True + dist_strategy.dgc_configs = {"rampup_begin_step": 0, "rampup_step": 1} + def backward(self, loss, startup_program=None, @@ -85,5 +92,5 @@ class DGCOptimizer(MetaOptimizerBase): no_grad_set=None): optimize_ops, params_grads = \ self.dgc_opt.minimize(loss, startup_program, - parameter_list, no_grad_set) + parameter_list, no_grad_set) return optimize_ops, params_grads diff --git a/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py index 7db79ad7b5b7081172209faa2396d9f2a31bbdb3..f1b3680976541806d96ca815be64b03bcd499469 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py @@ -37,14 +37,21 @@ class GradientMergeOptimizer(MetaOptimizerBase): self.user_defined_strategy.gradient_merge_configs["avg"]) def _can_apply(self): + if not self.role_maker._is_collective: + return False + can_apply = (self.user_defined_strategy.gradient_merge == True) and \ - self.user_defined_strategy.gradient_merge_configs["k_steps"] > 1 + self.user_defined_strategy.gradient_merge_configs["k_steps"] > 1 return can_apply def _disable_strategy(self, dist_strategy): dist_strategy.gradient_merge = False dist_strategy.gradient_merge_configs = {} + def _enable_strategy(self, dist_strategy, context): + # we currently do not support auto-enable gradient merge + return + def minimize_impl(self, loss, startup_program=None, diff --git a/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py index ace31687338f918ef260b3134b0bd429795542d0..0ad9e5680eab4a1beb340359e1af44fce9217097 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py @@ 
-48,14 +48,14 @@ class GraphExecutionOptimizer(MetaOptimizerBase): callbacks=None): pass - # should fix the variable + # should fix the variable def _setup_nccl_op(self, startup_program, main_program, build_strategy): - trainer_endpoints = self.role_maker.get_trainer_endpoints() + trainer_endpoints = self.role_maker._get_trainer_endpoints() trainers = trainer_endpoints - trainer_id = self.role_maker.worker_index() - current_endpoint = self.role_maker.get_trainer_endpoints()[trainer_id] + trainer_id = self.role_maker._worker_index() + current_endpoint = self.role_maker._get_trainer_endpoints()[trainer_id] trainer_endpoints_env = ",".join(trainer_endpoints) - trainers_num = self.role_maker.worker_num() + trainers_num = self.role_maker._worker_num() nccl_id_var = startup_program.global_block().create_var( name="NCCLID", persistable=True, type=core.VarDesc.VarType.RAW) for i in range(1, build_strategy.nccl_comm_num): @@ -94,31 +94,31 @@ class GraphExecutionOptimizer(MetaOptimizerBase): dist_strategy = self.user_defined_strategy local_build_strategy = paddle.fluid.BuildStrategy() local_build_strategy.enable_sequential_execution = \ - dist_strategy.build_strategy.enable_sequential_execution + dist_strategy.build_strategy.enable_sequential_execution local_build_strategy.fuse_elewise_add_act_ops = \ - dist_strategy.build_strategy.fuse_elewise_add_act_ops + dist_strategy.build_strategy.fuse_elewise_add_act_ops local_build_strategy.fuse_bn_act_ops = \ - dist_strategy.build_strategy.fuse_bn_act_ops + dist_strategy.build_strategy.fuse_bn_act_ops local_build_strategy.enable_auto_fusion = \ - dist_strategy.build_strategy.enable_auto_fusion + dist_strategy.build_strategy.enable_auto_fusion local_build_strategy.fuse_relu_depthwise_conv = \ - dist_strategy.build_strategy.fuse_relu_depthwise_conv + dist_strategy.build_strategy.fuse_relu_depthwise_conv local_build_strategy.fuse_broadcast_ops = \ - dist_strategy.build_strategy.fuse_broadcast_ops + 
dist_strategy.build_strategy.fuse_broadcast_ops local_build_strategy.fuse_all_optimizer_ops = \ - dist_strategy.build_strategy.fuse_all_optimizer_ops + dist_strategy.build_strategy.fuse_all_optimizer_ops local_build_strategy.enable_inplace = \ - dist_strategy.build_strategy.enable_inplace + dist_strategy.build_strategy.enable_inplace local_build_strategy.use_hierarchical_allreduce = \ - dist_strategy.use_hierarchical_allreduce + dist_strategy.use_hierarchical_allreduce local_build_strategy.hierarchical_allreduce_inter_nranks = \ - dist_strategy.hierarchical_allreduce_inter_nranks + dist_strategy.hierarchical_allreduce_inter_nranks local_build_strategy.sync_batch_norm = \ - dist_strategy.sync_batch_norm + dist_strategy.sync_batch_norm local_build_strategy.fuse_all_reduce_ops = \ - dist_strategy.fuse_all_reduce_ops + dist_strategy.fuse_all_reduce_ops local_build_strategy.nccl_comm_num = \ - dist_strategy.nccl_comm_num + dist_strategy.nccl_comm_num if self.user_defined_strategy.recompute == True: logging.warn( @@ -127,8 +127,8 @@ class GraphExecutionOptimizer(MetaOptimizerBase): local_build_strategy.enable_sequential_execution = True exe_strategy = self.user_defined_strategy.execution_strategy - worker_num = self.role_maker.worker_num() - node_num = self.role_maker.node_num() + worker_num = self.role_maker._worker_num() + node_num = self.role_maker._node_num() if self.role_maker._is_collective: assert worker_num >= 1, "nccl2 worker_num must >= 1, now:{}" % worker_num @@ -148,9 +148,6 @@ class GraphExecutionOptimizer(MetaOptimizerBase): sync_allreduce = dist_strategy.sync_nccl_allreduce if sync_allreduce: - paddle.fluid.framework.set_flags({ - "FLAGS_sync_nccl_allreduce": True - }) exe_strategy.num_threads = local_build_strategy.nccl_comm_num + 1 if local_build_strategy.use_hierarchical_allreduce: exe_strategy.num_threads = 2 * local_build_strategy.nccl_comm_num + 1 @@ -173,9 +170,9 @@ class GraphExecutionOptimizer(MetaOptimizerBase): # TODO(guru4elephant): should be 
an independent optimizer self._setup_nccl_op(startup_program, main_program, local_build_strategy) - local_build_strategy.num_trainers = self.role_maker.worker_num() - local_build_strategy.trainer_id = self.role_maker.worker_index() - local_build_strategy.trainers_endpoints = self.role_maker.get_trainer_endpoints( + local_build_strategy.num_trainers = self.role_maker._worker_num() + local_build_strategy.trainer_id = self.role_maker._worker_index() + local_build_strategy.trainers_endpoints = self.role_maker._get_trainer_endpoints( ) local_build_strategy.enable_backward_optimizer_op_deps = True @@ -191,7 +188,11 @@ class GraphExecutionOptimizer(MetaOptimizerBase): def _disable_strategy(self, dist_strategy): # TODO(guru4elephant): should close all PE related flags here - pass + return + + def _enable_strategy(self, dist_strategy, context): + # by default, graph execution strategy is enabled + return def minimize(self, loss, diff --git a/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py index 9fa29c4078e9f579a740ef8c0591979e7fbb962d..df9887759e16fddb0579abdcdf3ef5f9024825e7 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py @@ -62,6 +62,9 @@ class LambOptimizer(MetaOptimizerBase): name=opt._name) def _can_apply(self): + if not self.role_maker._is_collective: + return False + if self.user_defined_strategy.lamb: if not isinstance(self.inner_opt, AdamOptimizer): logging.warn( @@ -75,6 +78,13 @@ class LambOptimizer(MetaOptimizerBase): dist_strategy.lamb = False dist_strategy.lamb_configs = {} + def _enable_strategy(self, dist_strategy, context): + dist_strategy.lamb = True + dist_strategy.lamb_configs = { + "lamb_weight_decay": 0.01, + "exclude_from_weight_decay": [] + } + def backward(self, loss, startup_program=None, @@ -84,6 +94,10 @@ class LambOptimizer(MetaOptimizerBase): return 
self.lamb_opt.backward(loss, startup_program, parameter_list, no_grad_set, callbacks) + # the following function will be used by AMP if both LARS and AMP are turn on together. + def apply_gradients(self, params_grads): + return self.lamb_opt.apply_gradients(params_grads=params_grads) + def minimize_impl(self, loss, startup_program=None, @@ -91,5 +105,5 @@ class LambOptimizer(MetaOptimizerBase): no_grad_set=None): optimize_ops, params_grads = \ self.lamb_opt.minimize(loss, startup_program, - parameter_list, no_grad_set) + parameter_list, no_grad_set) return optimize_ops, params_grads diff --git a/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py index a7b856ff5b0dcb1ab30de82a12c91a2e1c14fe76..609d8b85e714c1c7247898f8d506f9dadab9f499 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py @@ -44,13 +44,19 @@ class LarsOptimizer(MetaOptimizerBase): parameter_list=opt._parameter_list, regularization=opt.regularization, grad_clip=opt._grad_clip, - name=opt._name) + name=opt._name, + exclude_from_weight_decay=configs['exclude_from_weight_decay'], + epsilon=configs['epsilon']) def _can_apply(self): + if not self.role_maker._is_collective: + return False + if self.user_defined_strategy.lars: if not isinstance(self.inner_opt, Momentum): logging.warn( - "lars need the inner optimizer to be Momentum optimizer.") + "lars need the inner optimizer to be Momentum optimizer but got {}.". 
+ format(self.inner_opt.type)) return False return True return False @@ -59,6 +65,13 @@ class LarsOptimizer(MetaOptimizerBase): dist_strategy.lars = False dist_strategy.lars_configs = {} + def _enable_strategy(self, dist_strategy, context): + dist_strategy.lars = True + dist_strategy.lars_configs = { + "lars_coeff": 0.01, + "lars_weight_decay": 0.0005, + } + def backward(self, loss, startup_program=None, @@ -68,6 +81,10 @@ class LarsOptimizer(MetaOptimizerBase): return self.lars_opt.backward(loss, startup_program, parameter_list, no_grad_set, callbacks) + # the following function will be used by AMP if both LARS and AMP are turn on together. + def apply_gradients(self, params_grads): + return self.lars_opt.apply_gradients(params_grads=params_grads) + def minimize_impl(self, loss, startup_program=None, @@ -75,5 +92,5 @@ class LarsOptimizer(MetaOptimizerBase): no_grad_set=None): optimize_ops, params_grads = \ self.lars_opt.minimize(loss, startup_program, - parameter_list, no_grad_set) + parameter_list, no_grad_set) return optimize_ops, params_grads diff --git a/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py index e22127c13999bfde7aa753ad1a66536913ab04f9..9f094978d842a8ba194742b527dc6f3cd19234cd 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py @@ -14,8 +14,8 @@ from __future__ import print_function +import paddle from paddle.fluid import program_guard, layers, default_main_program -from paddle.fluid.optimizer import Momentum, SGD from .meta_optimizer_base import MetaOptimizerBase from .common import OpRole, OP_ROLE_KEY, CollectiveHelper, is_update_op @@ -25,23 +25,35 @@ class LocalSGDOptimizer(MetaOptimizerBase): super(LocalSGDOptimizer, self).__init__(optimizer) self.inner_opt = optimizer self.meta_optimizers_white_list = [] - self.meta_optimizers_black_list = 
["GraphExecutionOptimizer"] + self.meta_optimizers_black_list = [ + "GraphExecutionOptimizer", + "AdaptiveLocalSGDOptimizer", + ] self.snapshot_key = '@SNAPSHOT' def _can_apply(self): + if not self.role_maker._is_collective: + return False + if not self.user_defined_strategy.localsgd: return False - if self.role_maker.worker_num() <= 1: + if self.role_maker._worker_num() <= 1: return False - return isinstance(self.inner_opt, Momentum) \ - or isinstance(self.inner_opt, SGD) + return isinstance(self.inner_opt, paddle.optimizer.momentum.Momentum) \ + or isinstance(self.inner_opt, paddle.fluid.optimizer.Momentum) \ + or isinstance(self.inner_opt, paddle.optimizer.sgd.SGD) \ + or isinstance(self.inner_opt, paddle.fluid.optimizer.SGD) def _disable_strategy(self, dist_strategy): dist_strategy.localsgd = False dist_strategy.localsgd_configs = {} + def _enable_strategy(self, dist_strategy, context): + dist_strategy.localsgd = True + dist_strategy.localsgd_configs = {"k_steps": 1, "begin_step": 1} + def snapshot_name(self, param_name): return param_name + self.snapshot_key @@ -77,8 +89,9 @@ class LocalSGDOptimizer(MetaOptimizerBase): minimized = self.inner_opt.minimize( loss, startup_program=startup_program) - init_k_steps = self.user_defined_strategy.localsgd_configs['k_steps'] - auto_steps = self.user_defined_strategy.auto + k_steps_value = self.user_defined_strategy.localsgd_configs['k_steps'] + begin_step_value = self.user_defined_strategy.localsgd_configs[ + 'begin_step'] if startup_program is None: startup_program = default_startup_program() @@ -92,13 +105,221 @@ class LocalSGDOptimizer(MetaOptimizerBase): p2s = self.create_snapshot_vars(main_block.program) with program_guard(main_block.program, startup_program): - step = layers.autoincreased_step_counter(begin=0) + step = layers.autoincreased_step_counter(begin=1) k_steps = layers.create_global_var( name="k_steps", shape=[1], - value=init_k_steps, + value=k_steps_value, dtype='int64', persistable=True) + + begin_step 
= layers.create_global_var( + name="begin_step", + shape=[1], + value=begin_step_value, + dtype='int64', + persistable=True) + + last_step = layers.create_global_var( + name="last_step", + shape=[1], + value=begin_step_value, + dtype='int64', + persistable=True) + + def communicate(): + sub_block = default_main_program().current_block() + ring_id = -1 + for param, snapshot in p2s: + sub_block.append_op( + type='elementwise_sub', + inputs={'X': [snapshot], + 'Y': [param]}, + outputs={'Out': [param]}, + attrs={OP_ROLE_KEY: OpRole.Optimize}) + sub_block.append_op( + type='c_sync_calc_stream', + inputs={'X': param}, + outputs={'Out': param}, + attrs={OP_ROLE_KEY: OpRole.Optimize}) + ring_id = (ring_id + 1) % self.nrings + sub_block.append_op( + type='c_allreduce_sum', + inputs={'X': [param]}, + outputs={'Out': [param]}, + attrs={ + 'ring_id': ring_id, + OP_ROLE_KEY: OpRole.Optimize + }) + + for ring_id in range(self.nrings): + sub_block.append_op( + type='c_sync_comm_stream', + inputs={'X': param}, + outputs={'Out': param}, + attrs={ + 'ring_id': ring_id, + OP_ROLE_KEY: OpRole.Optimize + }) + + for param, snapshot in p2s: + sub_block.append_op( + type='scale', + inputs={'X': [param]}, + outputs={'Out': [param]}, + attrs={ + 'scale': 1.0 / self.role_maker._worker_num(), + OP_ROLE_KEY: OpRole.Optimize + }) + sub_block.append_op( + type='elementwise_sub', + inputs={'X': [snapshot], + 'Y': [param]}, + outputs={'Out': [param]}, + attrs={OP_ROLE_KEY: OpRole.Optimize}) + sub_block.append_op( + type='assign', + inputs={'X': [param]}, + outputs={'Out': [snapshot]}, + attrs={OP_ROLE_KEY: OpRole.Optimize}) + layers.assign(step, last_step) + + def begin_localsgd(): + layers.cond(step - last_step == k_steps, communicate) + + layers.cond(step > begin_step, begin_localsgd, communicate) + return minimized + + +class AdaptiveLocalSGDOptimizer(MetaOptimizerBase): + def __init__(self, optimizer): + super(AdaptiveLocalSGDOptimizer, self).__init__(optimizer) + self.inner_opt = optimizer + 
self.meta_optimizers_white_list = [] + self.meta_optimizers_black_list = [ + "GraphExecutionOptimizer", "LocalSGDOptimizer" + ] + self.snapshot_key = '@SNAPSHOT' + + def _can_apply(self): + if not self.role_maker._is_collective: + return False + + if not self.user_defined_strategy.adaptive_localsgd: + return False + + if self.role_maker._worker_num() <= 1: + return False + + return isinstance(self.inner_opt, paddle.optimizer.momentum.Momentum) \ + or isinstance(self.inner_opt, paddle.fluid.optimizer.Momentum) \ + or isinstance(self.inner_opt, paddle.optimizer.sgd.SGD) \ + or isinstance(self.inner_opt, paddle.fluid.optimizer.SGD) + + def _disable_strategy(self, dist_strategy): + dist_strategy.adaptive_localsgd = False + dist_strategy.adaptive_localsgd_configs = {} + + def _enable_strategy(self, dist_strategy, context): + dist_strategy.adaptive_localsgd = True + dist_strategy.adaptive_localsgd_configs = { + "init_k_steps": 1, + "begin_step": 1 + } + + def snapshot_name(self, param_name): + return param_name + self.snapshot_key + + def create_snapshot_vars(self, program): + block = program.global_block() + + non_dist_params = [] + for param in block.iter_parameters(): + if not param.is_distributed: + non_dist_params.append(param) + + p2s = [] + for param in non_dist_params: + snapshot = block.create_var( + name=self.snapshot_name(param.name), + shape=param.shape, + persistable=True, + stop_gradient=True, + dtype=param.dtype) + p2s.append([param, snapshot]) + return p2s + + def init_snapshot_vars(self, startup_program, param2snapshot): + with program_guard(startup_program): + for param, snapshot in param2snapshot: + layers.assign(param, snapshot) + + def _generate_avg_loss(self, program_block, loss, avg_loss): + program_block.append_op( + type='c_allreduce_sum', + inputs={'X': [loss]}, + outputs={'Out': [avg_loss]}, + attrs={ + 'ring_id': 0, + OP_ROLE_KEY: OpRole.Optimize, + 'use_calc_stream': True + }) + program_block.append_op( + type='c_sync_calc_stream', + 
inputs={'X': [avg_loss]}, + outputs={'Out': [avg_loss]}, + attrs={OP_ROLE_KEY: OpRole.Optimize}) + + program_block.append_op( + type='scale', + inputs={'X': [avg_loss]}, + outputs={'Out': [avg_loss]}, + attrs={ + 'scale': 1.0 / self.role_maker._worker_num(), + OP_ROLE_KEY: OpRole.Optimize + }) + + def minimize_impl(self, + loss, + startup_program=None, + parameter_list=None, + no_grad_set=None): + minimized = self.inner_opt.minimize( + loss, startup_program=startup_program) + + init_k_steps = self.user_defined_strategy.adaptive_localsgd_configs[ + 'init_k_steps'] + begin_step_value = self.user_defined_strategy.adaptive_localsgd_configs[ + 'begin_step'] + + if startup_program is None: + startup_program = default_startup_program() + main_block = loss.block + + self.nrings = 2 + collective_helper = CollectiveHelper(self.role_maker, self.nrings) + collective_helper.update_startup_program(startup_program) + p2s = self.create_snapshot_vars(startup_program) + self.init_snapshot_vars(startup_program, p2s) + + p2s = self.create_snapshot_vars(main_block.program) + with program_guard(main_block.program, startup_program): + step = layers.autoincreased_step_counter(begin=1) + + k_steps = layers.create_global_var( + name="k_steps", + shape=[1], + value=int(init_k_steps), + dtype='int64', + persistable=True) + + begin_step = layers.create_global_var( + name="begin_step", + shape=[1], + value=int(begin_step_value), + dtype='int64', + persistable=True) + last_step = layers.create_global_var( name="last_step", shape=[1], @@ -106,30 +327,35 @@ class LocalSGDOptimizer(MetaOptimizerBase): dtype='int64', persistable=True) - if auto_steps: - avg_loss = layers.collective._c_allreduce( - loss) / self.role_maker.worker_num() + avg_loss = layers.create_global_var( + name="avg_loss", + shape=[1], + value=float(0), + dtype=loss.dtype, + persistable=True) + + lr_0 = layers.create_global_var( + name="lr_0", + shape=[1], + value=float(0), + dtype='float32', + persistable=True) - lr_0 = 
layers.create_global_var( - name="lr_0", - shape=[1], - value=float(0), - dtype='float32', - persistable=True) - loss_0 = layers.create_global_var( - name="loss_0", - shape=[1], - value=float(0), - dtype='float32', - persistable=True) + loss_0 = layers.create_global_var( + name="loss_0", + shape=[1], + value=float(0), + dtype='float32', + persistable=True) - global_lr = self.inner_opt._global_learning_rate() + global_lr = self.inner_opt._global_learning_rate() - def initialize(): - layers.assign(loss, loss_0) - layers.assign(global_lr, lr_0) + def initialize(): + self._generate_avg_loss(main_block, loss, avg_loss) + layers.assign(avg_loss, loss_0) + layers.assign(global_lr, lr_0) - layers.cond(step == 0, initialize) + layers.cond(step == 1, initialize) def communicate(): sub_block = default_main_program().current_block() @@ -172,7 +398,7 @@ class LocalSGDOptimizer(MetaOptimizerBase): inputs={'X': [param]}, outputs={'Out': [param]}, attrs={ - 'scale': 1.0 / self.role_maker.worker_num(), + 'scale': 1.0 / self.role_maker._worker_num(), OP_ROLE_KEY: OpRole.Optimize }) sub_block.append_op( @@ -186,20 +412,29 @@ class LocalSGDOptimizer(MetaOptimizerBase): inputs={'X': [param]}, outputs={'Out': [snapshot]}, attrs={OP_ROLE_KEY: OpRole.Optimize}) - - if auto_steps: - next_local_steps = layers.cast( - layers.ceil( - layers.sqrt(lr_0 * loss / (global_lr * loss_0) * - float(init_k_steps))), - dtype='int64') - max_local_steps = layers.fill_constant( - shape=[1], dtype='int64', value=16) - next_local_steps = layers.elementwise_min(next_local_steps, - max_local_steps) - layers.assign(next_local_steps, k_steps) layers.assign(step, last_step) - layers.cond(step - last_step == k_steps, communicate) + def communicate_avg_loss(): + communicate() + self._generate_avg_loss(main_block, loss, avg_loss) + next_local_steps = layers.cast( + layers.ceil( + layers.sqrt(lr_0 * avg_loss / (global_lr * loss_0) * + float(init_k_steps))), + dtype='int64') + max_local_steps = layers.fill_constant( + 
shape=[1], dtype='int64', value=16) + min_local_steps = layers.fill_constant( + shape=[1], dtype='int64', value=1) + next_local_steps = layers.elementwise_min(next_local_steps, + max_local_steps) + next_local_steps = layers.elementwise_max(next_local_steps, + min_local_steps) + layers.assign(next_local_steps, k_steps) + + def begin_localsgd(): + layers.cond(step - last_step == k_steps, communicate_avg_loss) + + layers.cond(step > begin_step, begin_localsgd, communicate) return minimized diff --git a/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py b/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py index 073148e11a0a2b08253b89d36d7a014b830518f8..a12ca50442b1c3499d62216d1fecc709f3351382 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py +++ b/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py @@ -48,6 +48,10 @@ class MetaOptimizerBase(Optimizer): raise NotImplementedError("you should implement disable strategy in {}". format(type(self).__name__)) + def _enable_strategy(self, dist_strategy, context=None): + raise NotImplementedError("you should implement enable strategy in {}". 
+ format(type(self).__name__)) + def apply_gradients(self, params_grads): return self.inner_opt.apply_gradients(params_grads=params_grads) diff --git a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_graph_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_graph_optimizer.py index 878ed7422d733d3e2828e0395ec63ed16b4c489a..dfa765364f357b6e685c3983c73cfb4f1b2cce61 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_graph_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_graph_optimizer.py @@ -24,11 +24,14 @@ class ParameterServerGraphOptimizer(ParameterServerOptimizer): self.meta_optimizers_white_list = [] def _can_apply(self): + if self.role_maker._is_collective: + return False + k_steps = self.user_defined_strategy.a_sync_configs["k_steps"] if k_steps < 0: return False - if self.role_maker.is_server(): + if self.role_maker._is_server(): return False if self.role_maker._is_heter_parameter_server_mode: @@ -37,7 +40,11 @@ class ParameterServerGraphOptimizer(ParameterServerOptimizer): return True def _disable_strategy(self, dist_strategy): - dist_strategy.a_sync_configs = {} + return + + def _enable_strategy(self, dist_strategy, context): + # only open up the async mode for auto-parallel + return def _is_graph_out(self): return True diff --git a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py index ecb198bedf9041aa3ffc929a72cce3c209f03b61..38ad41f8836b4e8c3b304dbf539b47d5293a8221 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py @@ -13,6 +13,10 @@ from paddle import fluid from .meta_optimizer_base import MetaOptimizerBase +from paddle.fluid import core +import subprocess +import re +import platform class 
ParameterServerOptimizer(MetaOptimizerBase): @@ -28,6 +32,7 @@ class ParameterServerOptimizer(MetaOptimizerBase): def _can_apply(self): if self.role_maker._is_collective: return False + k_steps = self.user_defined_strategy.a_sync_configs["k_steps"] return True if k_steps >= 0 else False @@ -127,6 +132,95 @@ class ParameterServerOptimizer(MetaOptimizerBase): return _main, _startup + def _can_apply_geo(self, dist_strategy, program): + def get_sys_free_mem(): + plat = platform.system() + if platform.system() == "Darwin": + vm = subprocess.Popen( + ['vm_stat'], stdout=subprocess.PIPE).communicate()[0] + # Process vm_stat + vmLines = vm.split('\n') + sep = re.compile(':[\s]+') + vmStats = {} + for row in range(1, len(vmLines) - 2): + rowText = vmLines[row].strip() + rowElements = sep.split(rowText) + vmStats[(rowElements[0] + )] = int(rowElements[1].strip('\.')) * 4096 + return vmStats["Pages free"] + elif platform.system() == "Linux": + mems = {} + with open('/proc/meminfo', 'rb') as f: + for line in f: + fields = line.split() + mems[fields[0]] = int(fields[1]) * 1024 + free = mems[b'MemFree:'] + return free + else: + raise ValueError( + "%s platform is unsupported is parameter server optimizer" % + (platform.system())) + + if not isinstance(self.inner_opt, fluid.optimizer.SGDOptimizer): + return False + + free = get_sys_free_mem() + + from paddle.fluid.incubate.fleet.parameter_server.ir import vars_metatools + + processed_var_names = set(["@EMPTY@"]) + param_memory_size = 0 + for varname in program.global_block().vars: + var = program.global_block().vars[varname] + if not var.persistable or var.desc.type( + ) != core.VarDesc.VarType.LOD_TENSOR: + continue + param = vars_metatools.create_var_struct(var) + param_memory_size += param.m_size + processed_var_names.add(varname) + + upper_mem_use = param_memory_size * 5.0 + + program_tmp_vars = dict() + eval_batch_size = 1024 + for op in program.global_block().ops: + for var_name in op.output_arg_names: + if var_name in 
processed_var_names: + continue + processed_var_names.add(var_name) + var = program.global_block().vars[var_name] + + if var.desc.type() != core.VarDesc.VarType.LOD_TENSOR: + continue + + data_count = 1 + neg_dim_count = 0 + for x in var.shape: + if x < 0: + if neg_dim_count >= 1: + raise ValueError( + "Var %s has more than one negative dim." % + (var_name)) + neg_dim_count += 1 + data_count *= (-x) + else: + data_count *= x + program_tmp_vars[var_name] = ( + data_count, neg_dim_count, + vars_metatools.dtype_to_size[var.dtype]) + + for varname in program_tmp_vars: + data_count, neg_dim_count, type_size = program_tmp_vars[varname] + if neg_dim_count == 1: + data_count *= eval_batch_size + var_memory = data_count * type_size + upper_mem_use += var_memory + + if upper_mem_use < free: + return True + else: + return False + def minimize_impl(self, loss, startup_program=None, @@ -143,11 +237,12 @@ class ParameterServerOptimizer(MetaOptimizerBase): compiled_config = public.CompileTimeStrategy(_origin_main_program, _origin_startup_program, strategy, self.role_maker) + compiled_config.strategy = strategy - if self.role_maker.is_worker() or self.role_maker._is_heter_worker(): + if self.role_maker._is_worker() or self.role_maker._is_heter_worker(): main_program, startup_program = self._build_trainer_programs( compiled_config) - elif self.role_maker.is_server(): + elif self.role_maker._is_server(): main_program, startup_program = self._build_pserver_programs( compiled_config) @@ -157,4 +252,24 @@ class ParameterServerOptimizer(MetaOptimizerBase): return None, None def _disable_strategy(self, dist_strategy): - self.user_defined_strategy.a_sync_configs = {} + dist_strategy.a_sync = False + a_sync_configs = dist_strategy.a_sync_configs + a_sync_configs["k_steps"] = -1 + dist_strategy.a_sync_configs = a_sync_configs + + def _enable_strategy(self, dist_strategy, context): + a_sync_configs = dist_strategy.a_sync_configs + if a_sync_configs["k_steps"] >= 0: + return + + 
dist_strategy.a_sync = True + a_sync_configs = dist_strategy.a_sync_configs + + is_geo = self._can_apply_geo(dist_strategy, + context["origin_main_program"]) + + if is_geo: + a_sync_configs["k_steps"] = 800 + else: + a_sync_configs["k_steps"] = 0 + dist_strategy.a_sync_configs = a_sync_configs diff --git a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py index d5a45e2b4e1aeda2e1c66c0a5a36236622f093ec..889fec838ed3d6dc83d2c15e92138f49e62f01dd 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py @@ -103,6 +103,9 @@ class PipelineOptimizer(MetaOptimizerBase): self.wrapped_opt = PO(self.inner_opt, num_microbatches=num_microbatches) def _can_apply(self): + if not self.role_maker._is_collective: + return False + if self.user_defined_strategy.pipeline == True: return True return False @@ -111,6 +114,10 @@ class PipelineOptimizer(MetaOptimizerBase): dist_strategy.pipeline = False dist_strategy.pipeline_configs = {} + def _enable_strategy(self, dist_strategy, context): + # we do not support enable pipeline automatically right now + return + def minimize_impl(self, loss, startup_program=None, @@ -119,11 +126,11 @@ class PipelineOptimizer(MetaOptimizerBase): optimize_ops, params_grads, prog_list = \ self.wrapped_opt.minimize(loss, startup_program, parameter_list, no_grad_set) - if self.role_maker.worker_num() == 1: + if self.role_maker._worker_num() == 1: return optimize_ops, params_grads - endpoints = self.role_maker.get_trainer_endpoints() - current_endpoint = endpoints[self.role_maker.worker_index()] + endpoints = self.role_maker._get_trainer_endpoints() + current_endpoint = endpoints[self.role_maker._worker_index()] self.startup_program = startup_program if startup_program is None: self.startup_program = fluid.default_startup_program() @@ -135,7 +142,7 @@ class 
PipelineOptimizer(MetaOptimizerBase): self.nranks = nranks self.nrings = len(self.main_program_list) - self.rank = self.role_maker.worker_index() + self.rank = self.role_maker._worker_index() self.endpoints = endpoints self.current_endpoint = current_endpoint @@ -176,7 +183,7 @@ class PipelineOptimizer(MetaOptimizerBase): grad = None for idx, op in reversed(list(enumerate(block.ops))): if is_backward_op(op) and \ - OP_ROLE_VAR_KEY in op.attr_names: + OP_ROLE_VAR_KEY in op.attr_names: op_role_var = op.all_attrs()[OP_ROLE_VAR_KEY] if len(op_role_var) == 0: continue diff --git a/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py index 3eb3ca6127cfe0d0a7a458c6c44e09ce22e7b24a..59ca7e633099e8688a57fa9024575e29008c0341 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py @@ -38,6 +38,9 @@ class RecomputeOptimizer(MetaOptimizerBase): list(user_defined_strategy.recompute_configs["checkpoints"])) def _can_apply(self): + if not self.role_maker._is_collective: + return False + if self.user_defined_strategy.recompute == True: if len(self.user_defined_strategy.recompute_configs[ "checkpoints"]) == 0: @@ -49,6 +52,10 @@ class RecomputeOptimizer(MetaOptimizerBase): dist_strategy.recompute = False dist_strategy.recompute_configs = {} + def _enable_strategy(self, dist_strategy, context): + # we do not support automatically recompute checkpoints currently + return + def backward(self, loss, startup_program=None, diff --git a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py index 870c3fe8be4c87b8c7bde9b690c3ee1eded39393..ae5c53b8a37c4958e58ed5b09ce7cc8194f1ff52 100644 --- a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py +++ 
b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py @@ -104,9 +104,9 @@ class ParameterServerRuntime(RuntimeBase): def _init_worker(self): def sync_strategy_envs(): kwargs = {} - kwargs["pserver_endpoints"] = self.role_maker.get_pserver_endpoints( - ) - kwargs["trainer_id"] = self.role_maker.worker_index() + kwargs[ + "pserver_endpoints"] = self.role_maker._get_pserver_endpoints() + kwargs["trainer_id"] = self.role_maker._worker_index() return kwargs def geo_strategy_envs(): @@ -150,7 +150,7 @@ class ParameterServerRuntime(RuntimeBase): return "#".join(init_attrs) kwargs = {} - kwargs["trainers"] = self.role_maker.worker_num() + kwargs["trainers"] = self.role_maker._worker_num() kwargs["sparse_attrs"] = get_sparse_attrs() return kwargs @@ -202,6 +202,9 @@ class ParameterServerRuntime(RuntimeBase): if self.role_maker._get_heter_worker_device() == "GPU": gpu_id = int(os.getenv("FLAGS_selected_gpus", "0")) executor = Executor(fluid.CUDAPlace(gpu_id)) + elif self.role_maker._get_heter_worker_device() == "XPU": + xpu_id = int(os.getenv("FLAGS_selected_xpus", "0")) + executor = Executor(fluid.XPUPlace(xpu_id)) else: raise ValueError("Not Support Device {}".format( self.role_maker._get_heter_worker_device())) @@ -335,7 +338,7 @@ class ParameterServerRuntime(RuntimeBase): block.append_op( type='recv_save', attrs={ - "trainer_id": self.role_maker.worker_index(), + "trainer_id": self.role_maker._worker_index(), "shape": var.shape, "slice_shapes": [",".join([str(i) for i in var.shape])], @@ -375,14 +378,15 @@ class ParameterServerRuntime(RuntimeBase): block.append_op( type='recv_save', attrs={ - "trainer_id": self.role_maker.worker_index(), + "trainer_id": self.role_maker._worker_index(), "shape": var.shape, "slice_shapes": slice_shapes, "slice_varnames": var_ctx.split_varnames(), "remote_varnames": var_ctx.split_varnames(), "is_sparse": True, "endpoints": var_ctx.split_endpoints(), - "pserver_num": len(self.role_maker.get_pserver_endpoints()), + 
"pserver_num": + len(self.role_maker._get_pserver_endpoints()), "file_path": os.path.join(dirname, var.name) }) @@ -400,7 +404,7 @@ class ParameterServerRuntime(RuntimeBase): block.append_op( type='recv_save', attrs={ - "trainer_id": self.role_maker.worker_index(), + "trainer_id": self.role_maker._worker_index(), "shape": var.shape, "slice_shapes": slice_shapes, "slice_varnames": slice_varnames, @@ -408,7 +412,7 @@ class ParameterServerRuntime(RuntimeBase): "is_sparse": True, "endpoints": var_ctx.split_endpoints(), "pserver_num": - len(self.role_maker.get_pserver_endpoints()), + len(self.role_maker._get_pserver_endpoints()), "file_path": os.path.join(dirname, var.name) }) @@ -419,7 +423,7 @@ class ParameterServerRuntime(RuntimeBase): block.append_op( type='recv_save', attrs={ - "trainer_id": self.role_maker.worker_index(), + "trainer_id": self.role_maker._worker_index(), "shape": var.shape, "slice_shapes": [",".join([str(i) for i in var.shape])], diff --git a/python/paddle/distributed/fleet/utils/__init__.py b/python/paddle/distributed/fleet/utils/__init__.py index f1911408c84a9dde56a8674e88e0fb8ad575cae7..abf198b97e6e818e1fbe59006f98492640bcee54 100644 --- a/python/paddle/distributed/fleet/utils/__init__.py +++ b/python/paddle/distributed/fleet/utils/__init__.py @@ -11,8 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- -from .fs import * -from .http_server import KVHandler, KVHTTPServer, KVServer - -#__all__ = ['KVHandler', 'KVHTTPServer', 'KVServer'] + fs.__all__ diff --git a/python/paddle/distributed/fleet/utils/fs.py b/python/paddle/distributed/fleet/utils/fs.py index 2dbe5cefbb4944e219989358ebeb0c321f942551..b7c50bda3eae0a004bd3d0169fc62260393b28b8 100644 --- a/python/paddle/distributed/fleet/utils/fs.py +++ b/python/paddle/distributed/fleet/utils/fs.py @@ -32,10 +32,7 @@ import functools from pathlib import PurePosixPath, Path import shutil -__all__ = [ - 'FS', 'LocalFS', 'HDFSClient', 'ExecuteError', 'FSTimeOut', - 'FSFileExistsError', 'FSFileNotExistsError', 'FSShellCmdAborted' -] +__all__ = ['LocalFS', 'HDFSClient'] class ExecuteError(Exception): @@ -117,7 +114,37 @@ class FS(object): class LocalFS(FS): + """ + A tool of local file system. + + Examples: + .. code-block:: python + + from paddle.distributed.fleet.utils.fs import LocalFS + + client = LocalFS() + subdirs, files = client.ls_dir("./") + """ + def ls_dir(self, fs_path): + """ + List directorys and files under `fs_path` . + + Args: + fs_path(str): The local file path. + + Returns: + Tuple: Return a 2-tuple, the first is a list of all its subdirectories, + and the second is a list of all its subfiles, e.g. ([subdirname1, subdirname1, ...], [filename1, filename2, ...]). + + Examples: + .. code-block:: python + + from paddle.distributed.fleet.utils.fs import LocalFS + + client = LocalFS() + subdirs, files = client.ls_dir("./") + """ if not self.is_exist(fs_path): return [], [] @@ -132,11 +159,46 @@ class LocalFS(FS): return dirs, files def mkdirs(self, fs_path): + """ + Create a remote HDFS directory. + + Args: + fs_path(str): The local directory path. + + Examples: + .. 
code-block:: python + + from paddle.distributed.fleet.utils.fs import LocalFS + + client = LocalFS() + client.mkdirs("test_mkdirs") + client.delete("test_mkdirs") + """ assert not os.path.isfile(fs_path), "{} is already a file".format( fs_path) os.system("mkdir -p {}".format(fs_path)) def rename(self, fs_src_path, fs_dst_path): + """ + Rename the file. + + Args: + fs_src_path(str): The actual name of the file or directory + fs_dst_path(str): The new name of the file or directory. + + Examples: + .. code-block:: python + + from paddle.distributed.fleet.utils.fs import LocalFS + + client = LocalFS() + client.touch("test_rename_src") + print(client.is_exists("test_rename_src")) # True + client.rename("test_rename_src", "test_rename_dst") + print(client.is_exists("test_rename_src")) # False + print(client.is_exists("test_rename_dst")) # True + client.delete("test_rename_dst") + """ os.rename(fs_src_path, fs_dst_path) def _rmr(self, fs_path): @@ -146,6 +208,21 @@ class LocalFS(FS): os.remove(fs_path) def delete(self, fs_path): + """ + Delete the local file path, whether it's a file or directory. + + Args: + fs_path(str): The local file path. + + Examples: + .. code-block:: python + + from paddle.distributed.fleet.utils.fs import LocalFS + + client = LocalFS() + client.mkdirs("test_localFS_mkdirs") + client.delete("test_localFS_mkdirs") + """ if not self.is_exist(fs_path): return @@ -158,15 +235,88 @@ class LocalFS(FS): return False def is_file(self, fs_path): + """ + Whether the local file path is a file. + + Args: + fs_path(str): The local file path. + + Returns: + Bool: Return true if the path exists and it's a file, otherwise return false. + + Examples: + .. 
code-block:: python + + from paddle.distributed.fleet.utils.fs import LocalFS + + client = LocalFS() + client.touch("test_is_file") + print(client.is_file("test_is_file")) # True + client.delete("test_is_file") + """ return os.path.isfile(fs_path) def is_dir(self, fs_path): + """ + Whether the local file path is a directory. + + Args: + fs_path(str): The local file path. + + Returns: + Bool: Return true if the path exists and it's a directory, otherwise return false. + + Examples: + .. code-block:: python + + from paddle.distributed.fleet.utils.fs import LocalFS + + client = LocalFS() + client.mkdirs("test_is_dir") + print(client.is_dir("test_is_file")) # True + client.delete("test_is_dir") + """ return os.path.isdir(fs_path) def is_exist(self, fs_path): + """ + Whether the local file path exists. + + Args: + fs_path(str): The local file path. + + Returns: + Bool: Wheter it's a file or directory, return true if the path exists, + otherwise return false. + + Examples: + .. code-block:: python + + from paddle.distributed.fleet.utils.fs import LocalFS + + client = LocalFS() + ret = local_fs.is_exist("test_is_exist") + """ return os.path.exists(fs_path) def touch(self, fs_path, exist_ok=True): + """ + Create a local file. + + Args: + fs_path(str): The local file path. + exist_ok(bool): When `fs_path` exists, if `exist_ok` is set false, + program will throw an Exception. Default is true. + + Examples: + .. code-block:: python + + from paddle.distributed.fleet.utils.fs import LocalFS + + client = LocalFS() + client.touch("test_touch") + client.delete("test_touch") + """ if self.is_exist(fs_path): if exist_ok: return @@ -175,6 +325,26 @@ class LocalFS(FS): return Path(fs_path).touch(exist_ok=True) def mv(self, src_path, dst_path, overwrite=False, test_exists=False): + """ + Move a local file or directory from `src_path` to `dst_path` . + + Args: + src_path(str): Name of the file or directory, that's needed to be moved. 
+ dst_path(str): Name of the file or directory to which to move to. + overwrite(bool): Whether to re-write `dst_path` if that exists. Default is False. + test_exists(bool): Check the existence of `src_path` and `dst_path` . + When `test_exists` is set true, if `src_path` doesn't exist or `dst_path` exists, program will throw an Excetption. + + Examples: + .. code-block:: python + + from paddle.distributed.fleet.utils.fs import LocalFS + + client = LocalFS() + client.touch("test_mv_src") + client.mv("test_mv_src", "test_mv_dst") + client.delete("test_mv_dst") + """ if not self.is_exist(src_path): raise FSFileNotExistsError @@ -188,7 +358,21 @@ class LocalFS(FS): def list_dirs(self, fs_path): """ - list directory under fs_path, and only give the pure name, not include the fs_path + Only list directorys under `fs_path` . + + Args: + fs_path(str): The local file path. + + Returns: + List: A list of all its subdirectories, e.g. [subdirname1, subdirname1, ...]. + + Examples: + .. code-block:: python + + from paddle.distributed.fleet.utils.fs import LocalFS + + client = LocalFS() + subdirs = client.list_dirs("./") """ if not self.is_exist(fs_path): return [] @@ -200,26 +384,6 @@ class LocalFS(FS): return dirs -"""HDFS Utils.""" - - -def _handle_errors(f): - def handler(*args, **kwargs): - start = time.time() - while True: - try: - return f(*args, **kwargs) - except ExecuteError as e: - o = args[0] - time_out = float(o._time_out) / 1000.0 - inter = float(o._sleep_inter) / 1000.0 - if time.time() - start >= time_out: - raise FSTimeOut - time.sleep(inter) - - return functools.wraps(f)(handler) - - def _handle_errors(max_time_out=None): def decorator(f): @functools.wraps(f) @@ -237,7 +401,7 @@ def _handle_errors(max_time_out=None): while True: try: return f(*args, **kwargs) - #important: only ExecuteError need to retry + # important: only ExecuteError need to retry except ExecuteError as e: if time.time() - start >= time_out: raise FSTimeOut("args:{} timeout:{}".format( @@ 
-256,12 +420,36 @@ def _handle_errors(max_time_out=None): class HDFSClient(FS): + """ + A tool of HDFS. + + Args: + hadoop_home(str): Hadoop home. + configs(dict): Hadoop config. It is a dictionary and needs to contain the + keys: "fs.default.name" and "hadoop.job.ugi". + + Examples: + + .. code-block:: text + + from paddle.distributed.fleet.utils.fs import HDFSClient + hadoop_home = "/home/client/hadoop-client/hadoop/" + + configs = { + "fs.default.name": "hdfs://xxx.hadoop.com:54310", + "hadoop.job.ugi": "hello,hello123" + } + + client = HDFSClient(hadoop_home, configs) + client.ls_dir("hdfs:/test_hdfs_client") + """ + def __init__( self, hadoop_home, configs, - time_out=5 * 60 * 1000, #ms - sleep_inter=1000): #ms + time_out=5 * 60 * 1000, # ms + sleep_inter=1000): # ms # Raise exception if JAVA_HOME not exists. java_home = os.environ["JAVA_HOME"] @@ -292,6 +480,30 @@ class HDFSClient(FS): @_handle_errors() def list_dirs(self, fs_path): + """ + Only list directorys under `fs_path` . + + Args: + fs_path(str): The HDFS file path. + + Returns: + List: A list of all its subdirectories, e.g. [subdirname1, subdirname1, ...]. + + Examples: + + .. code-block:: text + + from paddle.distributed.fleet.utils.fs import HDFSClient + + hadoop_home = "/home/client/hadoop-client/hadoop/" + configs = { + "fs.default.name": "hdfs://xxx.hadoop.com:54310", + "hadoop.job.ugi": "hello,hello123" + } + + client = HDFSClient(hadoop_home, configs) + subdirs = client.list_dirs("hdfs:/test_hdfs_client") + """ if not self.is_exist(fs_path): return [] @@ -301,7 +513,29 @@ class HDFSClient(FS): @_handle_errors() def ls_dir(self, fs_path): """ - list directory under fs_path, and only give the pure name, not include the fs_path + List directorys and files under `fs_path` . + + Args: + fs_path(str): The HDFS file path. + + Returns: + Tuple: Return a 2-tuple, the first element is the list of all its subdirectories, + and the second one is the list of all its subfiles, e.g. 
([subdirname1, subdirname1, ...], [filename1, filename2, ...]). + + Examples: + + .. code-block:: text + + from paddle.distributed.fleet.utils.fs import HDFSClient + + hadoop_home = "/home/client/hadoop-client/hadoop/" + configs = { + "fs.default.name": "hdfs://xxx.hadoop.com:54310", + "hadoop.job.ugi": "hello,hello123" + } + + client = HDFSClient(hadoop_home, configs) + subdirs, files = client.ls_dir("hdfs:/test_hdfs_client") """ if not self.is_exist(fs_path): return [], [] @@ -340,6 +574,30 @@ class HDFSClient(FS): @_handle_errors() def is_dir(self, fs_path): + """ + Whether the remote HDFS path is a directory. + + Args: + fs_path(str): The HDFS file path. + + Returns: + Bool: Return true if the path exists and it's a directory, otherwise return false. + + Examples: + + .. code-block:: text + + from paddle.distributed.fleet.utils.fs import HDFSClient + + hadoop_home = "/home/client/hadoop-client/hadoop/" + configs = { + "fs.default.name": "hdfs://xxx.hadoop.com:54310", + "hadoop.job.ugi": "hello,hello123" + } + + client = HDFSClient(hadoop_home, configs) + ret = client.is_file("hdfs:/test_hdfs_client") + """ if not self.is_exist(fs_path): return False @@ -358,6 +616,30 @@ class HDFSClient(FS): return True def is_file(self, fs_path): + """ + Whether the remote HDFS path is a file. + + Args: + fs_path(str): The HDFS file path. + + Returns: + Bool: Return true if the path exists and it's a file, otherwise return false. + + Examples: + + .. code-block:: text + + from paddle.distributed.fleet.utils.fs import HDFSClient + + hadoop_home = "/home/client/hadoop-client/hadoop/" + configs = { + "fs.default.name": "hdfs://xxx.hadoop.com:54310", + "hadoop.job.ugi": "hello,hello123" + } + + client = HDFSClient(hadoop_home, configs) + ret = client.is_file("hdfs:/test_hdfs_client") + """ if not self.is_exist(fs_path): return False @@ -365,6 +647,31 @@ class HDFSClient(FS): @_handle_errors() def is_exist(self, fs_path): + """ + Whether the remote HDFS path exists. 
+ + Args: + fs_path(str): The hdfs file path. + + Returns: + Bool: Whether it's is file or directory, return true if the path exists, + otherwise return false. + + Examples: + + .. code-block:: text + + from paddle.distributed.fleet.utils.fs import HDFSClient + + hadoop_home = "/home/client/hadoop-client/hadoop/" + configs = { + "fs.default.name": "hdfs://xxx.hadoop.com:54310", + "hadoop.job.ugi": "hello,hello123" + } + + client = HDFSClient(hadoop_home, configs) + ret = client.is_exist("hdfs:/test_hdfs_client") + """ cmd = "ls {} ".format(fs_path) ret, out = self._run_cmd(cmd, redirect_stderr=True) if ret != 0: @@ -377,6 +684,28 @@ class HDFSClient(FS): # can't retry def upload(self, local_path, fs_path): + """ + Upload the local path to remote HDFS. + + Args: + local_path(str): The local path. + fs_path(str): The HDFS path. + + Examples: + + .. code-block:: text + + from paddle.distributed.fleet.utils.fs import HDFSClient + + hadoop_home = "/home/client/hadoop-client/hadoop/" + configs = { + "fs.default.name": "hdfs://xxx.hadoop.com:54310", + "hadoop.job.ugi": "hello,hello123" + } + + client = HDFSClient(hadoop_home, configs) + client.upload("test_hdfs_client", "hdfs:/test_hdfs_client") + """ if self.is_exist(fs_path): raise FSFileExistsError("{} exists".format(fs_path)) @@ -400,6 +729,28 @@ class HDFSClient(FS): # can't retry def download(self, fs_path, local_path): + """ + Download remote HDFS path to the local. + + Args: + fs_path(str): The HDFS path. + local_path(str): The local path. + + Examples: + + .. 
code-block:: text + + from paddle.distributed.fleet.utils.fs import HDFSClient + + hadoop_home = "/home/client/hadoop-client/hadoop/" + configs = { + "fs.default.name": "hdfs://xxx.hadoop.com:54310", + "hadoop.job.ugi": "hello,hello123" + } + + client = HDFSClient(hadoop_home, configs) + client.download("hdfs:/test_hdfs_client", "./") + """ if self.is_exist(local_path): raise FSFileExistsError("{} exists".format(local_path)) @@ -423,6 +774,27 @@ class HDFSClient(FS): @_handle_errors() def mkdirs(self, fs_path): + """ + Create a remote HDFS directory. + + Args: + fs_path(str): The HDFS directory path. + + Examples: + + .. code-block:: text + + from paddle.distributed.fleet.utils.fs import HDFSClient + + hadoop_home = "/home/client/hadoop-client/hadoop/" + configs = { + "fs.default.name": "hdfs://xxx.hadoop.com:54310", + "hadoop.job.ugi": "hello,hello123" + } + + client = HDFSClient(hadoop_home, configs) + client.mkdirs("hdfs:/test_hdfs_client") + """ if self.is_exist(fs_path): return @@ -445,6 +817,30 @@ class HDFSClient(FS): raise ExecuteError(cmd) def mv(self, fs_src_path, fs_dst_path, overwrite=False, test_exists=True): + """ + Move a remote HDFS file or directory from `fs_src_path` to `fs_dst_path` . + + Args: + fs_src_path(str): Name of the file or directory, that's needed to be moved. + fs_dst_path(str): Name of the file or directory to which to move to. + overwrite(bool): Whether to re-write `fs_dst_path` if that exists. Default is False. + test_exists(bool): Check the existence of `fs_src_path` and `fs_dst_path` . When `test_exists` is set true, if `fs_src_path` doesn't exist or `fs_dst_path` exists, program will throw an Excetption. + + Examples: + + .. 
code-block:: text + + from paddle.distributed.fleet.utils.fs import HDFSClient + + hadoop_home = "/home/client/hadoop-client/hadoop/" + configs = { + "fs.default.name": "hdfs://xxx.hadoop.com:54310", + "hadoop.job.ugi": "hello,hello123" + } + + client = HDFSClient(hadoop_home, configs) + client.mv("hdfs:/test_hdfs_client", "hdfs:/test_hdfs_client2") + """ if overwrite and self.is_exist(fs_dst_path): self.delete(fs_dst_path) @@ -487,6 +883,27 @@ class HDFSClient(FS): @_handle_errors() def delete(self, fs_path): + """ + Delete a remote HDFS path, whether it's a file or directory. + + Args: + fs_path(str): The HDFS file path. + + Examples: + + .. code-block:: text + + from paddle.distributed.fleet.utils.fs import HDFSClient + + hadoop_home = "/home/client/hadoop-client/hadoop/" + configs = { + "fs.default.name": "hdfs://xxx.hadoop.com:54310", + "hadoop.job.ugi": "hello,hello123" + } + + client = HDFSClient(hadoop_home, configs) + client.delete("hdfs:/test_hdfs_client") + """ if not self.is_exist(fs_path): return @@ -497,6 +914,27 @@ class HDFSClient(FS): return self._rm(fs_path) def touch(self, fs_path, exist_ok=True): + """ + Create a remote HDFS file. + + Args: + fs_path(str): The HDFS file path. + + Examples: + + .. 
code-block:: text + + from paddle.distributed.fleet.utils.fs import HDFSClient + + hadoop_home = "/home/client/hadoop-client/hadoop/" + configs = { + "fs.default.name": "hdfs://xxx.hadoop.com:54310", + "hadoop.job.ugi": "hello,hello123" + } + + client = HDFSClient(hadoop_home, configs) + client.touch("hdfs:/test_hdfs_client") + """ if self.is_exist(fs_path): if exist_ok: return diff --git a/python/paddle/distributed/launch.py b/python/paddle/distributed/launch.py index e2ab321f9aebddd437c92ded9e6005495f760096..9b969cf3002379058b9cff0d604d2db750573028 100644 --- a/python/paddle/distributed/launch.py +++ b/python/paddle/distributed/launch.py @@ -160,18 +160,21 @@ def get_cluster_from_args(args, selected_gpus): x for x in range(started_port, started_port + len(selected_gpus)) ] - return get_cluster(node_ips, node_ip, free_ports, selected_gpus) + trainer_endpoints = [] + for ip in node_ips: + trainer_endpoints.append(["%s:%d" % (ip, port) for port in free_ports]) + return get_cluster(node_ips, node_ip, trainer_endpoints, selected_gpus) def get_gpus(selected_gpus): if selected_gpus is None: from paddle.fluid import core gpus_num = core.get_cuda_device_count() - selected_gpus = [str(x) for x in range(0, gpus_num)] + gpus = [str(x) for x in range(0, gpus_num)] else: cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") if cuda_visible_devices is None or cuda_visible_devices == "": - selected_gpus = [x.strip() for x in selected_gpus.split(',')] + gpus = [x.strip() for x in selected_gpus.split(',')] else: # change selected_gpus into relative values # e.g. CUDA_VISIBLE_DEVICES=4,5,6,7; args.selected_gpus=4,5,6,7; @@ -181,12 +184,16 @@ def get_gpus(selected_gpus): assert x in cuda_visible_devices_list, "Can't find "\ "your selected_gpus %s in CUDA_VISIBLE_DEVICES[%s]."\ % (x, cuda_visible_devices) - selected_gpus = [ + gpus = [ cuda_visible_devices_list.index(x.strip()) for x in selected_gpus.split(',') ] + logger.info("Change selected_gpus into reletive values. 
--ips:{} " + "will change into relative_ips:{} according to your " + "CUDA_VISIBLE_DEVICES:{}".format( + selected_gpus, gpus, cuda_visible_devices_list)) - return selected_gpus + return gpus def get_cluster_and_pod(args): diff --git a/python/paddle/distributed/utils.py b/python/paddle/distributed/utils.py index 1fa307c4d1b89d4033a8f8346b254177053e9dc0..be144a55b86200042f4d03b112071a374612b3a5 100644 --- a/python/paddle/distributed/utils.py +++ b/python/paddle/distributed/utils.py @@ -227,18 +227,23 @@ def get_logger(log_level, name="root"): return logger -def get_cluster(node_ips, node_ip, paddle_ports, selected_gpus): - assert type(paddle_ports) is list, "paddle_ports must be list" +def get_cluster(node_ips, node_ip, trainer_endpoints, selected_gpus): + assert type(trainer_endpoints) is list, "trainer_endpoints must be list" cluster = Cluster(hdfs=None) trainer_rank = 0 for node_rank, ip in enumerate(node_ips): pod = Pod() pod.rank = node_rank pod.addr = ip + cur_node_endpoints = trainer_endpoints[node_rank] + # when use paddlecloud, endpoints may > selected_gpus(user_defined) + assert len(cur_node_endpoints) >= len( + selected_gpus + ), "current trainer_endpoints size should be greater equal than selected_gpus size." 
for i in range(len(selected_gpus)): trainer = Trainer() trainer.gpus.append(selected_gpus[i]) - trainer.endpoint = "%s:%d" % (ip, paddle_ports[i]) + trainer.endpoint = "%s" % (cur_node_endpoints[i]) trainer.rank = trainer_rank trainer_rank += 1 @@ -253,7 +258,8 @@ def terminate_local_procs(procs): for p in procs: if p.proc.poll() is None: p.proc.terminate() - p.log_fn.close() + if p.log_fn: + p.log_fn.close() logger.debug("terminate process id:{}".format(p.proc.pid)) #wait all process terminiated diff --git a/python/paddle/distribution.py b/python/paddle/distribution.py index 49e98805d24f3f8f5dc1cfcbf3ddc8d9fb835fde..35204affb3fd168b8bd137d78c3413a08885e2bb 100644 --- a/python/paddle/distribution.py +++ b/python/paddle/distribution.py @@ -102,21 +102,24 @@ class Distribution(object): tmp = 0. for arg in args: - valid_arg = False - for cls in [float, list, np.ndarray, tensor.Variable]: - if isinstance(arg, cls): - valid_arg = True - break - assert valid_arg, "type of input args must be float, list, numpy.ndarray or Tensor." if isinstance(arg, float): - arg = np.zeros(1) + arg + arg = [arg] + if not isinstance(arg, (list, np.ndarray, tensor.Variable)): + raise TypeError( + "Type of input args must be float, list, numpy.ndarray or Tensor, but received type {}". + format(type(arg))) + arg_np = np.array(arg) arg_dtype = arg_np.dtype - if str(arg_dtype) not in ['float32']: - warnings.warn( - "data type of argument only support float32, your argument will be convert to float32." - ) + if str(arg_dtype) != 'float32': + if str(arg_dtype) != 'float64': + # "assign" op doesn't support float64. if dtype is float64, float32 variable will be generated + # and converted to float64 later using "cast". + warnings.warn( + "data type of argument only support float32 and float64, your argument will be convert to float32." + ) arg_np = arg_np.astype('float32') + # tmp is used to support broadcast, it summarizes shapes of all the args and get the mixed shape. 
tmp = tmp + arg_np numpy_args.append(arg_np) @@ -129,6 +132,37 @@ class Distribution(object): return tuple(variable_args) + def _check_values_dtype_in_probs(self, param, value): + """ + Log_prob and probs methods have input ``value``, if value's dtype is different from param, + convert value's dtype to be consistent with param's dtype. + + Args: + param (Tensor): low and high in Uniform class, loc and scale in Normal class. + value (Tensor): The input tensor. + + Returns: + value (Tensor): Change value's dtype if value's dtype is different from param. + """ + if in_dygraph_mode(): + if value.dtype != param.dtype and convert_dtype( + value.dtype) in ['float32', 'float64']: + warnings.warn( + "dtype of input 'value' needs to be the same as parameters of distribution class. dtype of 'value' will be converted." + ) + return core.ops.cast(value, 'in_dtype', value.dtype, + 'out_dtype', param.dtype) + return value + + check_variable_and_dtype(value, 'value', ['float32', 'float64'], + 'log_prob') + if value.dtype != param.dtype: + warnings.warn( + "dtype of input 'value' needs to be the same as parameters of distribution class. dtype of 'value' will be converted." + ) + return tensor.cast(value, dtype=param.dtype) + return value + class Uniform(Distribution): """Uniform distribution with `low` and `high` parameters. @@ -155,8 +189,8 @@ class Uniform(Distribution): [broadcasting](https://www.paddlepaddle.org.cn/documentation/docs/en/develop/beginners_guide/basic_concept/broadcasting_en.html) (e.g., `high - low` is a valid operation). 
Args: - low(int|float|list|numpy.ndarray|Tensor): The lower boundary of uniform distribution.The data type is int, float32, list, numpy.ndarray or Tensor - high(int|float|list|numpy.ndarray|Tensor): The higher boundary of uniform distribution.The data type is int, float32, list, numpy.ndarray or Tensor + low(int|float|list|numpy.ndarray|Tensor): The lower boundary of uniform distribution.The data type is int, float, list, numpy.ndarray or Tensor + high(int|float|list|numpy.ndarray|Tensor): The higher boundary of uniform distribution.The data type is int, float, list, numpy.ndarray or Tensor name(str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Examples: @@ -206,6 +240,7 @@ class Uniform(Distribution): self.all_arg_is_float = False self.batch_size_unknown = False self.name = name if name is not None else 'Uniform' + self.dtype = 'float32' if isinstance(low, int): low = float(low) @@ -216,10 +251,22 @@ class Uniform(Distribution): self.batch_size_unknown = True self.low = low self.high = high + self.dtype = convert_dtype(low.dtype) else: if isinstance(low, float) and isinstance(high, float): self.all_arg_is_float = True + if isinstance( + low, + np.ndarray) and str(low.dtype) in ['float32', 'float64']: + self.dtype = low.dtype + elif isinstance( + high, + np.ndarray) and str(high.dtype) in ['float32', 'float64']: + self.dtype = high.dtype self.low, self.high = self._to_tensor(low, high) + if self.dtype != convert_dtype(self.low.dtype): + self.low = tensor.cast(self.low, dtype=self.dtype) + self.high = tensor.cast(self.high, dtype=self.dtype) def sample(self, shape, seed=0): """Generate samples of the specified shape. @@ -241,11 +288,11 @@ class Uniform(Distribution): if self.batch_size_unknown: output_shape = shape + batch_shape zero_tmp = tensor.fill_constant_batch_size_like( - self.low + self.high, batch_shape + shape, self.low.dtype, 0.) 
+ self.low + self.high, batch_shape + shape, self.dtype, 0.) uniform_random_tmp = nn.uniform_random_batch_size_like( zero_tmp, zero_tmp.shape, - dtype=convert_dtype(zero_tmp.dtype), + dtype=self.dtype, min=0., max=1., seed=seed) @@ -259,9 +306,8 @@ class Uniform(Distribution): else: output_shape = shape + batch_shape output = nn.uniform_random( - output_shape, seed=seed) * (tensor.zeros( - output_shape, dtype=self.low.dtype) + - (self.high - self.low)) + output_shape, seed=seed, dtype=self.dtype) * (tensor.zeros( + output_shape, dtype=self.dtype) + (self.high - self.low)) output = elementwise_add(output, self.low, name=name) if self.all_arg_is_float: return nn.reshape(output, shape, name=name) @@ -279,22 +325,20 @@ class Uniform(Distribution): """ name = self.name + '_log_prob' + value = self._check_values_dtype_in_probs(self.low, value) if in_dygraph_mode(): + # ensure value in [low, high] lb_bool = self.low < value ub_bool = value < self.high - dtype = value.dtype lb = core.ops.cast(lb_bool, 'in_dtype', lb_bool.dtype, 'out_dtype', - dtype) + value.dtype) ub = core.ops.cast(ub_bool, 'in_dtype', ub_bool.dtype, 'out_dtype', - dtype) + value.dtype) return nn.log(lb * ub) - nn.log(self.high - self.low) - check_variable_and_dtype(value, 'value', ['float32', 'float64'], - 'log_prob') - - lb_bool = control_flow.less_than(self.low, value) - ub_bool = control_flow.less_than(value, self.high) + lb_bool = self.low < value + ub_bool = value < self.high lb = tensor.cast(lb_bool, dtype=value.dtype) ub = tensor.cast(ub_bool, dtype=value.dtype) return elementwise_sub( @@ -311,22 +355,19 @@ class Uniform(Distribution): """ name = self.name + '_probs' + value = self._check_values_dtype_in_probs(self.low, value) if in_dygraph_mode(): lb_bool = self.low < value ub_bool = value < self.high - dtype = value.dtype lb = core.ops.cast(lb_bool, 'in_dtype', lb_bool.dtype, 'out_dtype', - dtype) + value.dtype) ub = core.ops.cast(ub_bool, 'in_dtype', ub_bool.dtype, 'out_dtype', - dtype) + 
value.dtype) return (lb * ub) / (self.high - self.low) - check_variable_and_dtype(value, 'value', ['float32', 'float64'], - 'log_prob') - - lb_bool = control_flow.less_than(self.low, value) - ub_bool = control_flow.less_than(value, self.high) + lb_bool = self.low < value + ub_bool = value < self.high lb = tensor.cast(lb_bool, dtype=value.dtype) ub = tensor.cast(ub_bool, dtype=value.dtype) return elementwise_div((lb * ub), (self.high - self.low), name=name) @@ -334,6 +375,12 @@ class Uniform(Distribution): def entropy(self): """Shannon entropy in nats. + The entropy is + + .. math:: + + entropy(low, high) = \\log (high - low) + Returns: Tensor: Shannon entropy of uniform distribution.The data type is float32. @@ -364,8 +411,8 @@ class Normal(Distribution): * :math:`Z`: is the normalization constant. Args: - loc(int|float|list|numpy.ndarray|Tensor): The mean of normal distribution.The data type is int, float32, list, numpy.ndarray or Tensor. - scale(int|float|list|numpy.ndarray|Tensor): The std of normal distribution.The data type is int, float32, list, numpy.ndarray or Tensor. + loc(int|float|list|numpy.ndarray|Tensor): The mean of normal distribution.The data type is int, float, list, numpy.ndarray or Tensor. + scale(int|float|list|numpy.ndarray|Tensor): The std of normal distribution.The data type is int, float, list, numpy.ndarray or Tensor. name(str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. 
Examples: @@ -418,6 +465,7 @@ class Normal(Distribution): self.batch_size_unknown = False self.all_arg_is_float = False self.name = name if name is not None else 'Normal' + self.dtype = 'float32' if isinstance(loc, int): loc = float(loc) @@ -428,10 +476,22 @@ class Normal(Distribution): self.batch_size_unknown = True self.loc = loc self.scale = scale + self.dtype = convert_dtype(loc.dtype) else: if isinstance(loc, float) and isinstance(scale, float): self.all_arg_is_float = True + if isinstance( + loc, + np.ndarray) and str(loc.dtype) in ['float32', 'float64']: + self.dtype = loc.dtype + elif isinstance( + scale, + np.ndarray) and str(scale.dtype) in ['float32', 'float64']: + self.dtype = scale.dtype self.loc, self.scale = self._to_tensor(loc, scale) + if self.dtype != convert_dtype(self.loc.dtype): + self.loc = tensor.cast(self.loc, dtype=self.dtype) + self.scale = tensor.cast(self.scale, dtype=self.dtype) def sample(self, shape, seed=0): """Generate samples of the specified shape. @@ -454,22 +514,18 @@ class Normal(Distribution): if self.batch_size_unknown: output_shape = shape + batch_shape zero_tmp = tensor.fill_constant_batch_size_like( - self.loc + self.scale, batch_shape + shape, self.loc.dtype, 0.) + self.loc + self.scale, batch_shape + shape, self.dtype, 0.) 
zero_tmp_reshape = nn.reshape(zero_tmp, output_shape) zero_tmp_shape = nn.shape(zero_tmp_reshape) normal_random_tmp = nn.gaussian_random( - zero_tmp_shape, - mean=0., - std=1., - seed=seed, - dtype=convert_dtype(self.loc.dtype)) + zero_tmp_shape, mean=0., std=1., seed=seed, dtype=self.dtype) output = normal_random_tmp * (zero_tmp_reshape + self.scale) output = elementwise_add(output, self.loc, name=name) return output else: output_shape = shape + batch_shape - output = nn.gaussian_random(output_shape, mean=0., std=1., seed=seed) * \ - (tensor.zeros(output_shape, dtype=self.loc.dtype) + self.scale) + output = nn.gaussian_random(output_shape, mean=0., std=1., seed=seed, dtype=self.dtype) * \ + (tensor.zeros(output_shape, dtype=self.dtype) + self.scale) output = elementwise_add(output, self.loc, name=name) if self.all_arg_is_float: return nn.reshape(output, shape, name=name) @@ -479,6 +535,16 @@ class Normal(Distribution): def entropy(self): """Shannon entropy in nats. + The entropy is + + .. math:: + + entropy(\sigma) = 0.5 \\log (2 \pi e \sigma^2) + + In the above equation: + + * :math:`scale = \sigma`: is the std. + Returns: Tensor: Shannon entropy of normal distribution.The data type is float32. @@ -486,7 +552,7 @@ class Normal(Distribution): name = self.name + '_entropy' batch_shape = list((self.loc + self.scale).shape) zero_tmp = tensor.fill_constant_batch_size_like( - self.loc + self.scale, batch_shape, self.loc.dtype, 0.) + self.loc + self.scale, batch_shape, self.dtype, 0.) return elementwise_add( 0.5 + zero_tmp, 0.5 * math.log(2 * math.pi) + nn.log((self.scale + zero_tmp)), @@ -502,11 +568,9 @@ class Normal(Distribution): Tensor: log probability.The data type is same with value. 
""" - if not in_dygraph_mode(): - check_variable_and_dtype(value, 'value', ['float32', 'float64'], - 'log_prob') - name = self.name + '_log_prob' + value = self._check_values_dtype_in_probs(self.loc, value) + var = self.scale * self.scale log_scale = nn.log(self.scale) return elementwise_sub( @@ -524,11 +588,9 @@ class Normal(Distribution): Tensor: probability.The data type is same with value. """ - if not in_dygraph_mode(): - check_variable_and_dtype(value, 'value', ['float32', 'float64'], - 'log_prob') - name = self.name + '_probs' + value = self._check_values_dtype_in_probs(self.loc, value) + var = self.scale * self.scale return elementwise_div( ops.exp(-1. * ((value - self.loc) * (value - self.loc)) / @@ -538,6 +600,29 @@ class Normal(Distribution): def kl_divergence(self, other): """The KL-divergence between two normal distributions. + The probability density function (pdf) is + + .. math:: + + KL\_divergence(\mu_0, \sigma_0; \mu_1, \sigma_1) = 0.5 (ratio^2 + (\\frac{diff}{\sigma_1})^2 - 1 - 2 \\ln {ratio}) + + .. math:: + + ratio = \\frac{\sigma_0}{\sigma_1} + + .. math:: + + diff = \mu_1 - \mu_0 + + In the above equation: + + * :math:`loc = \mu_0`: is the mean of current Normal distribution. + * :math:`scale = \sigma_0`: is the std of current Normal distribution. + * :math:`loc = \mu_1`: is the mean of other Normal distribution. + * :math:`scale = \sigma_1`: is the std of other Normal distribution. + * :math:`ratio`: is the ratio of scales. + * :math:`diff`: is the difference between means. + Args: other (Normal): instance of Normal. 
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 9f748b7956f9faa6b1c948d87f0ef4659057a421..e8cc6ce99016075a950f13d9e23f2957c9686471 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -197,6 +197,7 @@ def __bootstrap__(): 'free_when_no_cache_hit', 'call_stack_level', 'sort_sum_gradient', + 'max_inplace_grad_add', ] if 'Darwin' not in sysstr: read_env_flags.append('use_pinned_memory') diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index 898c7d295641863740288e3f4e1da39266bce183..478fecf74e4013e0d695c68af86a0e39a4a4e845 100644 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -251,12 +251,19 @@ def _rename_arg_(op_descs, old_name, new_name, begin_idx=None, end_idx=None): begin_idx = 0 if end_idx is None: end_idx = len(op_descs) - for i in range(begin_idx, end_idx): - op_desc = op_descs[i] - if isinstance(op_desc, tuple): - op_desc = op_desc[0] - op_desc._rename_input(old_name, new_name) - op_desc._rename_output(old_name, new_name) + if isinstance(op_descs, (list, tuple)): + for i in range(begin_idx, end_idx): + op_desc = op_descs[i] + if isinstance(op_desc, tuple): + op_desc = op_desc[0] + op_desc._rename_input(old_name, new_name) + op_desc._rename_output(old_name, new_name) + if isinstance(op_descs, collections.OrderedDict): + for key, value in op_descs.items(): + if isinstance(value, (list, tuple)): + for op_desc in value: + op_desc._rename_input(old_name, new_name) + op_desc._rename_output(old_name, new_name) def _create_op_desc_(op_type, inputs, outputs, attrs): @@ -369,6 +376,41 @@ def _append_grad_suffix_(name): return cpt.to_text(name) + core.grad_var_suffix() +def _accumulate_gradients_by_sum_op_(var_name, renamed_vars, pending_sum_ops, + op_idx): + """ + Use sum op to accumulate_gradients, the gradients are stored in renamed_vars. 
+ """ + if op_idx not in pending_sum_ops.keys(): + pending_sum_ops[op_idx] = [] + pending_sum_ops[op_idx].append( + _create_op_desc_("sum", {"X": renamed_vars[var_name]}, + {"Out": [var_name]}, {"use_mkldnn": False})) + renamed_vars[var_name] = [var_name] + + +def _accumulate_gradients_by_add_ops_(var_name, renamed_vars, pending_sum_ops, + op_idx): + """ + Use several inplace add op to accumulate_gradients, the gradients are stored in renamed_vars. + """ + if op_idx not in pending_sum_ops.keys(): + pending_sum_ops[op_idx] = [] + out_name = renamed_vars[var_name][0] + for i in range(1, len(renamed_vars[var_name])): + x_name = out_name + y_name = renamed_vars[var_name][i] + if i != len(renamed_vars[var_name]) - 1: + out_name = var_name + '@ADD@' + str(i) + else: + out_name = var_name + pending_sum_ops[op_idx].append( + _create_op_desc_("grad_add", {"X": [x_name], + "Y": [y_name]}, {"Out": [out_name]}, + {"use_mkldnn": False})) + renamed_vars[var_name] = [var_name] + + def _addup_repetitive_outputs_(op_descs, block_idx): """ In backward part, an variable may be the output of more than one ops. @@ -376,7 +418,9 @@ def _addup_repetitive_outputs_(op_descs, block_idx): In these cases, the variable should be the accumulation of all the outputs. `sum_op`s are added to implement the accumulate. 
""" - pending_sum_ops = [] + _MAX_ADD_NUM_ = core.globals()['FLAGS_max_inplace_grad_add'] + #pending_sum_ops = [] + pending_sum_ops = collections.OrderedDict() var_rename_count = collections.defaultdict(int) renamed_vars = collections.defaultdict(list) renamed_var_start_idx = collections.defaultdict(list) @@ -385,10 +429,13 @@ def _addup_repetitive_outputs_(op_descs, block_idx): if "@GRAD" not in var_name: continue if len(renamed_vars[var_name]) > 1: - pending_sum_ops.append((_create_op_desc_( - "sum", {"X": renamed_vars[var_name]}, {"Out": [var_name]}, - {"use_mkldnn": False}), idx)) - renamed_vars[var_name] = [var_name] + if len(renamed_vars[var_name]) > _MAX_ADD_NUM_: + _accumulate_gradients_by_sum_op_(var_name, renamed_vars, + pending_sum_ops, idx) + else: + _accumulate_gradients_by_add_ops_(var_name, renamed_vars, + pending_sum_ops, idx) + for param_idx, param_name in enumerate(op_desc.output_names()): arg_names = op_desc.output(param_name) for arg_idx, var_name in enumerate(arg_names): @@ -440,13 +487,26 @@ def _addup_repetitive_outputs_(op_descs, block_idx): renamed_vars[var_name].append(new_name) for var_name, inputs in six.iteritems(renamed_vars): - if len(inputs) > 1: - pending_sum_ops.append( - (_create_op_desc_("sum", {"X": inputs}, {"Out": [var_name]}, - {"use_mkldnn": False}), len(op_descs))) + if len(renamed_vars[var_name]) > 1: + if len(renamed_vars[var_name]) > _MAX_ADD_NUM_: + _accumulate_gradients_by_sum_op_(var_name, renamed_vars, + pending_sum_ops, len(op_descs)) + else: + _accumulate_gradients_by_add_ops_(var_name, renamed_vars, + pending_sum_ops, + len(op_descs)) + # sum_op descs are sorted according to their insert position - for p in reversed(pending_sum_ops): - op_descs.insert(p[1], p[0]) + for key, value in collections.OrderedDict( + reversed(list(pending_sum_ops.items()))).items(): + + # NOTE(zhiqiu): Since reversed, the idx of op_descs to be inserted will remains correct. 
+ # For example, [0, 1, 2], and we want to insert 'a' at idx 1, 'b' at idx 2, and the expected result is [0, 1, 'a', 2, 'b']. + # If reversed, we first insert 'b' at idx 2, it becomes [0, 1, 2, 'b'], and then insert 'a' at idx 1, it becomes [0, 1, 'a', 2, 'b']. + # If not reverse, we first insert 'a' at idx 1, it becomes [0, 1, 'a', 2], and then insert 'b' at idx 2, it becomes [0, 1, 'a', 'b', 2]. + idx = key + for i, op in enumerate(value): + op_descs.insert(idx + i, op) return op_descs @@ -1756,6 +1816,12 @@ def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None): op_path_dict = dict() op_path = _find_op_path_(block, targets, inputs, block_no_grad_set, op_path_dict) + + # find no grad var by op_path + no_grad_vars = _find_no_grad_vars(block, op_path, targets, + block_no_grad_set) + block_no_grad_set.update(no_grad_vars) + no_grad_dict[0].update(list(map(_append_grad_suffix_, block_no_grad_set))) grad_to_var = dict() grad_info_map = dict() diff --git a/python/paddle/fluid/contrib/layers/nn.py b/python/paddle/fluid/contrib/layers/nn.py index 0e187d4174cd5cca65f79e4ab84b4cc32ecefd21..ac6493b1c2969a8c3319bc8d29983b0ccc3a67d9 100644 --- a/python/paddle/fluid/contrib/layers/nn.py +++ b/python/paddle/fluid/contrib/layers/nn.py @@ -37,7 +37,7 @@ import warnings import inspect import numpy as np - +import paddle from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.layers import utils from ... 
import unique_name @@ -45,6 +45,7 @@ from paddle.fluid.initializer import Normal, Constant, NumpyArrayInitializer from paddle.fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype, convert_dtype from paddle.fluid import core +from paddle.fluid.param_attr import ParamAttr from paddle.fluid.entry_attr import ProbabilityEntry, CountFilterEntry from paddle.fluid.framework import Variable, convert_np_dtype_to_dtype_ @@ -56,7 +57,8 @@ __all__ = [ 'match_matrix_tensor', 'tree_conv', 'fused_embedding_seq_pool', 'multiclass_nms2', 'search_pyramid_hash', 'shuffle_batch', 'partial_concat', 'sparse_embedding', 'partial_sum', 'tdm_child', 'rank_attention', - 'tdm_sampler', 'batch_fc', '_pull_box_extended_sparse', 'bilateral_slice' + 'tdm_sampler', 'batch_fc', '_pull_box_extended_sparse', 'bilateral_slice', + 'correlation', 'fused_bn_add_act' ] @@ -1546,3 +1548,269 @@ def bilateral_slice(x, guide, grid, has_offset, name=None): attrs={'has_offset': has_offset}, outputs={'Out': out}) return out + + +def correlation(x, + y, + pad_size, + kernel_size, + max_displacement, + stride1, + stride2, + corr_type_multiply=1): + """ + + This operation compute correlation of two tensor. + For more information of correlation, please refer to PWC-Net: + CNNs for Optical Flow Using Pyramid, Warping, and Cost Volume + _ + + Args: + x(Tensor): The input x is 4-D Tensor with shape [N, C, H, W]. The data type is float32 and float64. + y(Tensor): The input y is 4-D Tensor with shape [N, C, H, W]. The data type is float32 and float64. + pad_size(int): Pad size. The data type is int. + max_displacement(int): Max displacement. The data type is int. + stride1(int): stride size of x. The data type is int. + stride2(int): stride size of y. The data type is int. + corr_type_multiply(int, optional): The type of multiply. The data type is int. Default: 1. + + Returns: + Tensor: The data type is same as input tensor. + + Examples: + + .. 
code-block:: python + + import paddle.fluid as fluid + + x1 = fluid.layers.data(name='x1', + shape=x_shape, + dtype=x_type, + append_batch_size=False) + x2 = fluid.layers.data(name='x2', + shape=x_shape, + dtype=x_type, + append_batch_size=False) + + + out = fluid.contrib.correlation( + x1, + x2, + pad_size=4, + kernel_size=1, + max_displacement=4, + stride1=1, + stride2=1) + + """ + + helper = LayerHelper("correlation", **locals()) + output = helper.create_variable_for_type_inference(dtype=x.dtype) + if paddle.fluid.in_dygraph_mode(): + attrs = ("pad_size", pad_size, "kernel_size", kernel_size, + "max_displacement", max_displacement, "stride1", stride1, + "stride2", stride2, "corr_type_multiply", corr_type_multiply) + output = getattr(core.ops, "correlation")(x, y, *attrs) + else: + helper.append_op( + type="correlation", + inputs={"Input1": x, + "Input2": y}, + attrs={ + "pad_size": pad_size, + "kernel_size": kernel_size, + "max_displacement": max_displacement, + "stride1": stride1, + "stride2": stride2, + "corr_type_multiply": corr_type_multiply + }, + outputs={"Output": output}) + return output + + +def fused_bn_add_act(x, + y, + momentum=0.9, + epsilon=1e-05, + param_attr=None, + bias_attr=None, + moving_mean_name=None, + moving_variance_name=None, + act=None, + name=None): + """ + This Op performs batch norm on input x, and adds the result to input y. Then + it performs activation on the sum. The data format of inputs must be NHWC + `[batch, in_height, in_width, in_channels]`. + + Args: + x(Tensor): The rank of input tensor can be 2, 3, 4, 5. The data type + is float16. + y(Tensor): The rank of input tensor can be 2, 3, 4, 5. The data type + is float16. + momentum(float|Tensor, optional): The value used for the moving_mean and + moving_var computation. This should be a float number or a tensor with + shape [1] and data type as float32. The updated formula is: + :math:`moving\_mean = moving\_mean * momentum + new\_mean * (1. 
- momentum)` + :math:`moving\_var = moving\_var * momentum + new\_var * (1. - momentum)` + Default is 0.9. + epsilon(float, optional): A value added to the denominator for + numerical stability. Default is 1e-5. + param_attr(ParamAttr, optional): The parameter attribute for Parameter `scale` + of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm + will create ParamAttr as param_attr, the name of scale can be set in ParamAttr. + If the Initializer of the param_attr is not set, the parameter is initialized + with Xavier. Default: None. + bias_attr(ParamAttr, optional): The parameter attribute for the bias of batch_norm. + If it is set to None or one attribute of ParamAttr, batch_norm + will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr. + If the Initializer of the bias_attr is not set, the bias is initialized zero. + Default: None. + moving_mean_name(str, optional): The name of moving_mean which store the global Mean. If it + is set to None, batch_norm will save global mean with a random name, otherwise, batch_norm + will save global mean with the string. + moving_variance_name(str, optional): The name of the moving_variance which store the global Variance. + If it is set to None, batch_norm will save global variance with a random name, otherwise, batch_norm + will save global variance with the string. + act(string, optional): Activation type, linear|relu|prelu|... + name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. + Usually name is no need to set and None by default. + + Examples: + .. 
code-block:: python + + import paddle.fluid as fluid + + def build_program(main_program, startup_program): + with fluid.program_guard(main_program, startup_program): + x = fluid.layers.data(name='x', shape=[1, 28, 28], dtype='float32') + y = fluid.layers.data(name="y", shape=[1], dtype='int64') + conv1_1 = fluid.layers.conv2d( + input=x, + filter_size=3, + num_filters=32, + stride=1, + padding=1, + act=None, + bias_attr=False, + data_format='NHWC') + conv1_2 = fluid.layers.conv2d( + input=x, + filter_size=3, + num_filters=32, + stride=1, + padding=1, + act=None, + bias_attr=False, + data_format='NHWC') + bn = fluid.layers.batch_norm( + input=conv1_1, + act=None, + data_layout='NHWC') + fused_bn_add_act = fluid.contrib.layers.fused_bn_add_act(conv1_2, bn) + prediction = fluid.layers.fc(input=fused_bn_add_act, size=10, act='softmax') + loss = fluid.layers.cross_entropy(input=prediction, label=y) + loss = fluid.layers.mean(loss) + sgd = fluid.optimizer.SGD(learning_rate=0.001) + sgd = fluid.contrib.mixed_precision.decorate( + sgd, use_dynamic_loss_scaling=True, init_loss_scaling=128.0) + sgd.minimize(loss) + + return x, y, loss + + iters = 5 + batch_size = 16 + support_gpu = fluid.is_compiled_with_cuda() + if support_gpu: + main_program = fluid.Program() + startup_program = fluid.Program() + place = fluid.CUDAPlace(0) + x, y, loss = build_program(main_program, startup_program) + + feeder = fluid.DataFeeder(feed_list=[x, y], place=place) + train_reader = paddle.batch( + paddle.dataset.mnist.train(), batch_size=batch_size) + exe = fluid.Executor(place) + scope = fluid.Scope() + with fluid.scope_guard(scope): + exe.run(startup_program) + for _ in range(iters): + data = next(train_reader()) + loss_v = exe.run(main_program, feed=feeder.feed(data), fetch_list=[loss]) + """ + helper = LayerHelper('fused_bn_add_act', **locals()) + + check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'], + 'fused_bn_add_act') + check_variable_and_dtype(y, 'input', ['float16', 
'float32', 'float64'], + 'fused_bn_add_act') + bn_param_dtype = core.VarDesc.VarType.FP32 + + x_shape = x.shape + channel_num = x_shape[-1] + param_shape = [channel_num] + + # create parameter + scale = helper.create_parameter( + attr=helper.param_attr, + shape=param_shape, + dtype=bn_param_dtype, + default_initializer=Constant(1.0)) + bias = helper.create_parameter( + attr=helper.bias_attr, + shape=param_shape, + dtype=bn_param_dtype, + is_bias=True) + mean = helper.create_parameter( + attr=ParamAttr( + name=moving_mean_name, initializer=Constant(0.0), trainable=False), + shape=param_shape, + dtype=bn_param_dtype) + mean.stop_gradient = True + variance = helper.create_parameter( + attr=ParamAttr( + name=moving_variance_name, + initializer=Constant(1.0), + trainable=False), + shape=param_shape, + dtype=bn_param_dtype) + variance.stop_gradient = True + + # create output + # mean and mean_out share the same memory + mean_out = mean + # variance and variance out share the same memory + variance_out = variance + saved_mean = helper.create_variable_for_type_inference( + dtype=bn_param_dtype, stop_gradient=True) + saved_variance = helper.create_variable_for_type_inference( + dtype=bn_param_dtype, stop_gradient=True) + reserve_space = helper.create_variable_for_type_inference( + dtype=core.VarDesc.VarType.FP16, stop_gradient=True) + batch_norm_out = helper.create_variable_for_type_inference( + core.VarDesc.VarType.FP16) + + inputs = { + "X": x, + "Z": y, + "Scale": scale, + "Bias": bias, + } + attrs = {"epsilon": epsilon, 'momentum': momentum} + + outputs = { + "Y": batch_norm_out, + "MeanOut": mean_out, + "VarianceOut": variance_out, + "SavedMean": saved_mean, + "SavedVariance": saved_variance, + "ReserveSpace": reserve_space + } + + helper.append_op( + type="fused_bn_add_activation", + inputs=inputs, + outputs=outputs, + attrs=attrs) + + return batch_norm_out diff --git a/python/paddle/fluid/contrib/mixed_precision/amp_nn.py 
b/python/paddle/fluid/contrib/mixed_precision/amp_nn.py new file mode 100644 index 0000000000000000000000000000000000000000..d4dc968ca0de44b01741bf1f1fbaac7a9a65287e --- /dev/null +++ b/python/paddle/fluid/contrib/mixed_precision/amp_nn.py @@ -0,0 +1,124 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.fluid.data_feeder import check_variable_and_dtype, check_type +from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid.framework import Variable + +__all__ = ['check_finite_and_unscale', 'update_loss_scaling'] + + +def check_finite_and_unscale(x, scale, name=None): + """ + Check if input X contains all finite data, if yes, scale it by input Scale. + + $$Out = X / scale$$ + + If any tensor in X contains Inf or Nan, the Out will generate a indicator. + FoundInfinite will be 1 (True), and Out will not be scaled. In this case, the data of + Out should not be used, and its data may not be deterministic. + Otherwise, FoundInfinite will be 0 (False). + Args: + x(list|tuple): The input tensors of check_finite_and_unscale operator. + scale: The scale of check_finite_and_unscale operator. 
+ """ + check_type(x, 'x', (tuple, list), 'check_finite_and_unscale') + for e in x: + check_variable_and_dtype(e, "x", ['float32', 'float64'], + 'check_finite_and_unscale') + + helper = LayerHelper("check_finite_and_unscale", **locals()) + found_inf = helper.create_variable_for_type_inference(dtype='bool') + + inputs = {'X': x, 'Scale': scale} + outputs = {'Out': x, 'FoundInfinite': found_inf} + helper.append_op( + type='check_finite_and_unscale', inputs=inputs, outputs=outputs) + + return x, found_inf + + +def update_loss_scaling(x, + found_inf, + prev_loss_scaling, + num_good_steps, + num_bad_steps, + incr_every_n_steps, + decr_every_n_nan_or_inf, + incr_ratio, + decr_ratio, + name=None): + """ + Update loss scaling according to overall gradients. If all gradients is + finite after incr_every_n_steps, loss scaling will increase by incr_ratio. + Otherwise, loss scaling will decrease by decr_ratio after + decr_every_n_nan_or_inf steps and each step some gradients are infinite. + + Args: + x(list|tuple): The input tensors of update_loss_scaling operator. + found_inf (Variable): A boolean variable indicates whether + there is any infinite gradient. + prev_loss_scaling (Variable): Previous loss scaling. + num_good_steps (Variable): A variable accumulates good steps in which + all gradients are finite. + num_bad_steps (Variable): A variable accumulates bad steps in which + some gradients are infinite. + incr_every_n_steps (int): A variable represents increasing loss + scaling every n consecutive steps with + finite gradients. + decr_every_n_nan_or_inf (int): A variable represents decreasing + loss scaling every n accumulated + steps with nan or inf gradients. + incr_ratio(float): The multiplier to use when increasing the loss + scaling. + decr_ratio(float): The less-than-one-multiplier to use when decreasing + loss scaling. 
+ """ + + check_variable_and_dtype(prev_loss_scaling, "prev_loss_scaling", + ['float32', 'float64'], "update_loss_scaling") + check_type(x, 'x', (tuple, list), 'update_loss_scaling') + for e in x: + check_variable_and_dtype(e, "x", ['float32', 'float64'], + 'update_loss_scaling') + assert prev_loss_scaling.dtype == e.dtype, "The dtype of prev_loss_scaling should be equal to the dtype of x." + + helper = LayerHelper("update_loss_scaling", **locals()) + + inputs = { + 'X': x, + 'FoundInfinite': found_inf, + 'PrevLossScaling': prev_loss_scaling, + 'InGoodSteps': num_good_steps, + 'InBadSteps': num_bad_steps + } + + outputs = { + 'Out': x, + 'LossScaling': prev_loss_scaling, + 'OutGoodSteps': num_good_steps, + 'OutBadSteps': num_bad_steps + } + + attrs = { + 'incr_every_n_steps': incr_every_n_steps, + 'decr_every_n_nan_or_inf': decr_every_n_nan_or_inf, + 'incr_ratio': incr_ratio, + 'decr_ratio': decr_ratio, + } + + helper.append_op( + type='update_loss_scaling', inputs=inputs, outputs=outputs, attrs=attrs) + + return x diff --git a/python/paddle/fluid/contrib/mixed_precision/decorator.py b/python/paddle/fluid/contrib/mixed_precision/decorator.py index bfbd2700ae10bac4ad37462b5d7844b90dd05bbe..c9112ac849ce0506b7afd941b2213710e06bd1c6 100644 --- a/python/paddle/fluid/contrib/mixed_precision/decorator.py +++ b/python/paddle/fluid/contrib/mixed_precision/decorator.py @@ -17,9 +17,11 @@ from ... import default_startup_program from ... import layers from ... import unique_name from . 
import fp16_utils -from .fp16_utils import update_loss_scaling, rewrite_program +from .fp16_utils import rewrite_program from .fp16_utils import update_role_var_grad from .fp16_lists import AutoMixedPrecisionLists +from .amp_nn import check_finite_and_unscale +from .amp_nn import update_loss_scaling __all__ = ["decorate"] @@ -67,10 +69,8 @@ class OptimizerWithMixedPrecision(object): persistable=True) self._use_dynamic_loss_scaling = use_dynamic_loss_scaling if self._use_dynamic_loss_scaling: - self._incr_every_n_steps = layers.fill_constant( - shape=[1], dtype='int32', value=incr_every_n_steps) - self._decr_every_n_nan_or_inf = layers.fill_constant( - shape=[1], dtype='int32', value=decr_every_n_nan_or_inf) + self._incr_every_n_steps = incr_every_n_steps + self._decr_every_n_nan_or_inf = decr_every_n_nan_or_inf self._incr_ratio = incr_ratio self._decr_ratio = decr_ratio self._num_good_steps = layers.create_global_var( @@ -139,49 +139,46 @@ class OptimizerWithMixedPrecision(object): # Change the op_role_var attr for some ops, so that gradients # transferred across GPUs can be FP16. update_role_var_grad(self._train_program, self._params_grads) - scaled_params_grads = [] - for p, g in self._params_grads: - with self._train_program._optimized_guard([p, g]): - scaled_g = g / self._loss_scaling - scaled_params_grads.append([p, scaled_g]) - return scaled_params_grads + return self._params_grads - def apply_gradients(self, scaled_params_grads): + def apply_gradients(self, params_grads): """ Check scaled gradients to determine whether to update loss scaling and update parameters by their scaled gradients, Args: - scaled_params_grads (list): A list of params and scaled grads. + params_grads (list): A list of params and scaled grads. Returns: A list of optimize operators. 
""" - if self._use_dynamic_loss_scaling: + grads = [g for _, g in params_grads] + with self._train_program._optimized_guard(grads): + grads, found_inf = check_finite_and_unscale( + grads, self._loss_scaling, name="find_infinite_scale") - grads = [layers.reduce_sum(g) for [_, g] in scaled_params_grads] - all_grads = layers.concat(grads) - all_grads_sum = layers.reduce_sum(all_grads) - is_overall_finite = layers.isfinite(all_grads_sum) - - update_loss_scaling(is_overall_finite, self._loss_scaling, - self._num_good_steps, self._num_bad_steps, - self._incr_every_n_steps, - self._decr_every_n_nan_or_inf, self._incr_ratio, - self._decr_ratio) - - # apply_gradient append all ops in global block, thus we shouldn't - # apply gradient in the switch branch. - with layers.Switch() as switch: - with switch.case(is_overall_finite): - pass - with switch.default(): - for _, g in scaled_params_grads: - layers.assign(layers.zeros_like(g), g) - - optimize_ops = self._optimizer.apply_gradients(scaled_params_grads) + if self._use_dynamic_loss_scaling: + with self._train_program._optimized_guard(grads): + grads = update_loss_scaling( + grads, + found_inf, + self._loss_scaling, + self._num_good_steps, + self._num_bad_steps, + self._incr_every_n_steps, + self._decr_every_n_nan_or_inf, + self._incr_ratio, + self._decr_ratio, + name="update_loss_scaling") + + params_unscaled_grads = [] + for pg, new_g in zip(params_grads, grads): + params_unscaled_grads.append((pg[0], new_g)) + # apply_gradient append all ops in global block, thus we shouldn't + # apply gradient in the switch branch. 
+ optimize_ops = self._optimizer.apply_gradients(params_unscaled_grads) return optimize_ops diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py index 1f301b7148d005d4e3d5d272fd78f78af6dc1e6a..a9f080c514dff078b0068bce262fa177fd0b0db2 100644 --- a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py +++ b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py @@ -135,6 +135,7 @@ gray_list = { 'get_tensor_from_selected_rows', 'sign', 'cast', + 'fused_bn_add_activation', } ''' # The set of ops that don't support fp16 calculation diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py index 93013ef8bf8442311621202e0a86dd65e7c38b30..0ff166d8dc89ac79c36343df9bc379cb171c36fd 100644 --- a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py +++ b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py @@ -69,12 +69,14 @@ def _insert_cast_op(block, op, idx, src_dtype, dest_dtype): ] for in_name in op.input_names: - if src_dtype == core.VarDesc.VarType.FP32 and op.type == 'batch_norm': - if in_name != 'X': + if src_dtype == core.VarDesc.VarType.FP32 and op.type in [ + 'batch_norm', 'fused_bn_add_activation' + ]: + if in_name not in {'X', 'Z'}: continue for in_var_name in op.input(in_name): in_var = block.var(in_var_name) - if in_var.type not in valid_types: + if in_var.type not in valid_types or in_var.dtype == dest_dtype: continue if in_var.dtype == src_dtype: cast_name = in_var.name + '.cast_' + _dtype_to_str(dest_dtype) @@ -84,7 +86,7 @@ def _insert_cast_op(block, op, idx, src_dtype, dest_dtype): name=cast_name, dtype=dest_dtype, persistable=False, - stop_gradient=False) + stop_gradient=in_var.stop_gradient) block._insert_op( idx, @@ -100,9 +102,10 @@ def _insert_cast_op(block, op, idx, src_dtype, dest_dtype): else: if op.has_attr('in_dtype'): op._set_attr('in_dtype', dest_dtype) - if src_dtype == 
core.VarDesc.VarType.FP32: + if src_dtype == core.VarDesc.VarType.FP32 and dest_dtype == core.VarDesc.VarType.FP16: for out_name in op.output_names: - if op.type == 'batch_norm' and out_name != 'Y': + if op.type in ['batch_norm', 'fused_bn_add_activation' + ] and out_name != 'Y': continue for out_var_name in op.output(out_name): out_var = block.var(out_var_name) @@ -328,77 +331,3 @@ def update_role_var_grad(main_prog, params_grads): raise ValueError("The op {0} is not in program".format(op)) block.desc._remove_op(op_idx, op_idx + 1) block._sync_with_cpp() - - -def update_loss_scaling(is_overall_finite, prev_loss_scaling, num_good_steps, - num_bad_steps, incr_every_n_steps, - decr_every_n_nan_or_inf, incr_ratio, decr_ratio): - """ - Update loss scaling according to overall gradients. If all gradients is - finite after incr_every_n_steps, loss scaling will increase by incr_ratio. - Otherwise, loss scaling will decrease by decr_ratio after - decr_every_n_nan_or_inf steps and each step some gradients are infinite. - - Args: - is_overall_finite (Variable): A boolean variable indicates whether - all gradients are finite. - prev_loss_scaling (Variable): Previous loss scaling. - num_good_steps (Variable): A variable accumulates good steps in which - all gradients are finite. - num_bad_steps (Variable): A variable accumulates bad steps in which - some gradients are infinite. - incr_every_n_steps (Variable): A variable represents increasing loss - scaling every n consecutive steps with - finite gradients. - decr_every_n_nan_or_inf (Variable): A variable represents decreasing - loss scaling every n accumulated - steps with nan or inf gradients. - incr_ratio(float): The multiplier to use when increasing the loss - scaling. - decr_ratio(float): The less-than-one-multiplier to use when decreasing - loss scaling. 
- """ - zero_steps = layers.fill_constant(shape=[1], dtype='int32', value=0) - with layers.Switch() as switch: - with switch.case(is_overall_finite): - should_incr_loss_scaling = layers.less_than(incr_every_n_steps, - num_good_steps + 1) - with layers.Switch() as switch1: - with switch1.case(should_incr_loss_scaling): - new_loss_scaling = prev_loss_scaling * incr_ratio - loss_scaling_is_finite = layers.isfinite(new_loss_scaling) - with layers.Switch() as switch2: - with switch2.case(loss_scaling_is_finite): - layers.assign(new_loss_scaling, prev_loss_scaling) - with switch2.default(): - pass - layers.assign(zero_steps, num_good_steps) - layers.assign(zero_steps, num_bad_steps) - - with switch1.default(): - layers.increment(num_good_steps) - layers.assign(zero_steps, num_bad_steps) - - with switch.default(): - should_decr_loss_scaling = layers.less_than(decr_every_n_nan_or_inf, - num_bad_steps + 1) - with layers.Switch() as switch3: - with switch3.case(should_decr_loss_scaling): - new_loss_scaling = prev_loss_scaling * decr_ratio - static_loss_scaling = \ - layers.fill_constant(shape=[1], - dtype='float32', - value=1.0) - less_than_one = layers.less_than(new_loss_scaling, - static_loss_scaling) - with layers.Switch() as switch4: - with switch4.case(less_than_one): - layers.assign(static_loss_scaling, - prev_loss_scaling) - with switch4.default(): - layers.assign(new_loss_scaling, prev_loss_scaling) - layers.assign(zero_steps, num_good_steps) - layers.assign(zero_steps, num_bad_steps) - with switch3.default(): - layers.assign(zero_steps, num_good_steps) - layers.increment(num_bad_steps) diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py index 5662284483bf529034e42178c8a431f6286e31b8..8d7ebcf4caa53929c5dd97159e63cf3cd02f5636 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py @@ -67,6 
+67,7 @@ class ImperativeQuantAware(object): Examples: .. code-block:: python + import paddle from paddle.fluid.contrib.slim.quantization \ import ImperativeQuantAware from paddle.vision.models \ @@ -86,20 +87,24 @@ class ImperativeQuantAware(object): # ... # Save quant model for the inference. - imperative_qat.save_quantized_model( - dirname="./resnet50_qat", - model=model, - input_shape=[(3, 224, 224)], - input_dtype=['float32'], - feed=[0], - fetch=[0]) + paddle.jit.save( + layer=model, + model_path="./resnet50_qat", + input_spec=[ + paddle.static.InputSpec( + shape=[None, 3, 224, 224], dtype='float32')]) """ super(ImperativeQuantAware, self).__init__() self._weight_bits = weight_bits self._activation_bits = activation_bits self._moving_rate = moving_rate - quant_type = {'abs_max', 'moving_average_abs_max'} + quant_type = { + 'abs_max', 'moving_average_abs_max', 'channel_wise_abs_max' + } + + assert activation_quantize_type != 'channel_wise_abs_max', \ + "The activation quantization type does not support 'channel_wise_abs_max'." if activation_quantize_type not in quant_type: raise ValueError( "Unknown activation_quantize_type : '%s'. It can only be " @@ -108,8 +113,8 @@ class ImperativeQuantAware(object): if weight_quantize_type not in quant_type: raise ValueError( "Unknown weight_quantize_type: '%s'. It can only be " - "'abs_max' or 'moving_average_abs_max' now." % - (str(weight_quantize_type))) + "'abs_max' or 'moving_average_abs_max' or 'channel_wise_abs_max' now." + % (str(weight_quantize_type))) self._activation_quantize_type = activation_quantize_type self._weight_quantize_type = weight_quantize_type @@ -148,76 +153,6 @@ class ImperativeQuantAware(object): quant_layer = self._get_quantized_counterpart(layer) setattr(obj, target, quant_layer) - def save_quantized_model(self, - dirname, - model, - input_shape, - input_dtype, - feed, - fetch, - append_batch_size=True): - """ - Save the quantized model for the inference. 
- - Args: - dirname (str): the directory to save the quantized model. - model(fluid.dygraph.Layer): the quantized model to be saved. - input_shape(list[tuple(int)]): The shape value for each input, - e.g. [(3, 224, 224)]. - input_dtype(list[str]): The dtype value for each input, - e.g. ['float32']. - feed(list[int]): the indices of the input variables of the - imperative functions which will be saved as input variables in - inference model. - fetch(list[int]): the indices of the returned variable of the - imperative functions which will be saved as output variables in - inference model. - append_batch_size(bool, optional): - If true, it prepends an extra axis to the input_shape, meanwhile, - the input_shape shouldn't contain the batch size dimension. - Otherwise, it just uses the input_shape. Default True. - Returns: - None - """ - assert isinstance( - input_shape, list), "The parameter `input_shape` shoubld be a list." - assert isinstance( - input_dtype, list), "The parameter `input_dtype` shoubld be a list." - assert isinstance(feed, list), "The parameter `feed` shoubld be a list." - assert isinstance(fetch, - list), "The parameter `fetch` shoubld be a list." - assert len(input_shape) == len( - input_dtype - ), "The length of input_shape should be equal to input_dtype's." - assert len(input_dtype) == len( - feed), "The length of input_shape should be equal to feed's." - - prog_trans = dygraph.ProgramTranslator() - with dygraph.guard(): - model.eval() - input_vars = [] - for i, (shape, dtype) in enumerate(zip(input_shape, input_dtype)): - if append_batch_size: - shape = [None] + list(shape) - # Note(Aurelius84): need a elegant way to name this. 
- in_spec = paddle.static.InputSpec(shape, dtype, 'feed_%d' % i) - input_vars.append(in_spec) - # use `declarative` to convert dygraph into static program - model.forward = dygraph.jit.declarative( - model.forward, input_spec=input_vars) - outputs = model.forward.concrete_program.outputs - input_spec = [input_vars[i] for i in feed] - configs = dygraph.jit.SaveLoadConfig() - configs.separate_params = True - if not isinstance(outputs, (tuple, list)): - outputs = [outputs] - configs.output_spec = [outputs[i] for i in fetch] - dygraph.jit.save( - layer=model, - model_path=dirname, - input_spec=input_spec, - configs=configs) - def _get_quantized_counterpart(self, layer): quant_layers = tuple(self._quant_layers_map.values()) quantized_counterpart = tuple('Quantized' + k diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py b/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py index 59dd9867abb95dea74e1cdc362b671e7d4120d70..2e35ac288c7158a220e3b96babb146e28d50a5ee 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py @@ -24,7 +24,7 @@ from paddle.fluid.data_feeder import check_variable_and_dtype __all__ = [ 'FakeQuantMovingAverage', 'FakeQuantAbsMax', 'QuantizedConv2D', - 'QuantizedLinear' + 'QuantizedLinear', 'FakeChannelWiseQuantDequantAbsMax' ] @@ -209,15 +209,114 @@ class FakeQuantAbsMax(layers.Layer): return quant_out -def _get_fake_quant_type(quant_type, name, moving_rate, quant_bits, dtype, - quant_on_weight): +class FakeChannelWiseQuantDequantAbsMax(layers.Layer): + def __init__(self, + name=None, + channel_num=None, + quant_bits=8, + quant_axis=0, + dtype='float32', + quant_on_weight=False): + assert quant_on_weight == True, "Channel_wise only can be used on weight quantization." 
+ super(FakeChannelWiseQuantDequantAbsMax, self).__init__() + self._quant_bits = quant_bits + self._quant_axis = quant_axis + self._dtype = dtype + self._name = name + self._channel_num = channel_num + scale_prefix = "{}.scale".format( + name) if name else 'quant_dequant.scale' + self._scale_name = unique_name.generate(scale_prefix) + if quant_on_weight: + scale_attr = ParamAttr( + name=self._scale_name, + initializer=Constant(0.0), + trainable=False) + self._scale = self.create_parameter( + shape=[self._channel_num], attr=scale_attr, dtype=self._dtype) + self._scale.stop_gradient = True + else: + self._scale = None + + def forward(self, input): + if in_dygraph_mode(): + attrs = ('bit_length', self._quant_bits, 'quant_axis', + self._quant_axis) + quant_out = _varbase_creator( + type=input.type, + name="{}.quantized.dequantized".format(input.name), + shape=input.shape, + dtype=input.dtype, + persistable=False) + + out_scale = self._scale + if out_scale is None: + out_scale = _varbase_creator( + type=core.VarDesc.VarType.LOD_TENSOR, + name=self._scale_name, + shape=[self._channel_num], + dtype=self._dtype, + persistable=False) + out_scale.stop_gradient = True + + out, _, = core.ops.fake_channel_wise_quantize_dequantize_abs_max( + input, quant_out, out_scale, *attrs) + return out + + check_variable_and_dtype(input, 'input', ['float32'], + "FakeChannelWiseQuantDequantAbsMax") + attrs = {'bit_length': self._quant_bits, 'quant_axis': self._quant_axis} + inputs = {"X": [input]} + quant_out = self._helper.create_variable( + name="{}.quantized.dequantized".format(input.name), + dtype=input.dtype, + type=core.VarDesc.VarType.LOD_TENSOR, + persistable=False, + stop_gradient=False) + out_scale = self._scale + if not out_scale: + out_scale = self._helper.create_variable( + name=self._scale_name, + dtype=self._dtype, + type=core.VarDesc.VarType.LOD_TENSOR, + persistable=False, + stop_gradient=True) + outputs = {"Out": [quant_out], "OutScale": [out_scale]} + + 
self._helper.append_op( + type="fake_channel_wise_quantize_dequantize_abs_max", + inputs=inputs, + outputs=outputs, + attrs=attrs) + + return quant_out + + +def _get_fake_quant_type(quant_type, **kwargs): + call_args = { + "name": kwargs.get("name", None), + "quant_bits": kwargs.get("quant_bits", 8), + "dtype": kwargs.get("dtype", "float32") + } + + if quant_type == 'abs_max': + call_args["quant_on_weight"] = kwargs.get("quant_on_weight", False) + elif quant_type == 'moving_average_abs_max': + call_args["moving_rate"] = kwargs.get("moving_rate", 0.9) + elif quant_type == 'channel_wise_abs_max': + call_args["quant_on_weight"] = kwargs.get("quant_on_weight", False) + call_args["channel_num"] = kwargs.get("channel_num", None) + call_args["quant_axis"] = kwargs.get("quant_axis", 0) + assert call_args["channel_num"] is not None, ( + "You need to input channel_num" + "when you use channel_wise_abs_max strategy.") fake_quant_map = { - 'abs_max': - lambda: FakeQuantAbsMax(name, quant_bits, dtype, quant_on_weight), - 'moving_average_abs_max': - lambda: FakeQuantMovingAverage(name, moving_rate, quant_bits, dtype) + 'abs_max': FakeQuantAbsMax, + 'moving_average_abs_max': FakeQuantMovingAverage, + 'channel_wise_abs_max': FakeChannelWiseQuantDequantAbsMax } - return fake_quant_map[quant_type]() + + return fake_quant_map[quant_type](**call_args) class QuantizedConv2D(layers.Layer): @@ -246,12 +345,23 @@ class QuantizedConv2D(layers.Layer): self.weight = getattr(layer, 'weight') self.bias = getattr(layer, 'bias') # For FakeQuant + self._conv2d_quant_axis = 0 self._fake_quant_weight = _get_fake_quant_type( - weight_quantize_type, self.weight.name, moving_rate, weight_bits, - self._dtype, True) + weight_quantize_type, + name=self.weight.name, + moving_rate=moving_rate, + quant_bits=weight_bits, + dtype=self._dtype, + quant_on_weight=True, + channel_num=self.weight.shape[self._conv2d_quant_axis], + quant_axis=self._conv2d_quant_axis) self._fake_quant_input = _get_fake_quant_type( 
activation_quantize_type, - layer.full_name(), moving_rate, activation_bits, self._dtype, False) + name=layer.full_name(), + moving_rate=moving_rate, + quant_bits=activation_bits, + dtype=self._dtype, + quant_on_weight=False) def forward(self, input): quant_input = self._fake_quant_input(input) @@ -325,12 +435,23 @@ class QuantizedLinear(layers.Layer): self.weight = getattr(layer, 'weight') self.bias = getattr(layer, 'bias') # For FakeQuant + self._linear_quant_axis = 1 self._fake_quant_weight = _get_fake_quant_type( - weight_quantize_type, self.weight.name, moving_rate, weight_bits, - self._dtype, True) + weight_quantize_type, + name=self.weight.name, + moving_rate=moving_rate, + quant_bits=weight_bits, + dtype=self._dtype, + quant_on_weight=True, + channel_num=self.weight.shape[self._linear_quant_axis], + quant_axis=self._linear_quant_axis) self._fake_quant_input = _get_fake_quant_type( activation_quantize_type, - layer.full_name(), moving_rate, activation_bits, self._dtype, False) + name=layer.full_name(), + moving_rate=moving_rate, + quant_bits=activation_bits, + dtype=self._dtype, + quant_on_weight=False) def forward(self, input): quant_input = self._fake_quant_input(input) diff --git a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py index 244a621611060b87805846f1ea748615bcdde19a..ddbd99e16cebdfc839a8e96e44d4f96f02e70c55 100644 --- a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py +++ b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py @@ -143,7 +143,7 @@ class PostTrainingQuantization(object): weight_quantize_type='channel_wise_abs_max', optimize_model=False, is_use_cache_file=False, - cache_dir="./temp_post_training"): + cache_dir=None): ''' Constructor. @@ -206,13 +206,8 @@ class PostTrainingQuantization(object): `conv2d/depthwise_conv2d + bn`, the weights scale for all channel will be different. 
In address this problem, fuse the pattern before quantization. Default False. - is_use_cache_file(bool, optional): If set is_use_cache_file as False, - all temp data will be saved in memory. If set is_use_cache_file as True, - it will save temp data to disk. When the fp32 model is complex or - the number of calibrate data is large, we should set is_use_cache_file - as True. Defalut is False. - cache_dir(str, optional): When is_use_cache_file is True, set cache_dir as - the directory for saving temp data. Default is ./temp_post_training. + is_use_cache_file(bool, optional): This param is deprecated. + cache_dir(str, optional): This param is deprecated. Returns: None @@ -302,10 +297,6 @@ class PostTrainingQuantization(object): assert op_type in self._support_quantize_op_type, \ op_type + " is not supported for quantization." self._optimize_model = optimize_model - self._is_use_cache_file = is_use_cache_file - self._cache_dir = cache_dir - if self._is_use_cache_file and not os.path.exists(self._cache_dir): - os.mkdir(self._cache_dir) # Define variables self._place = self._executor.place @@ -317,11 +308,17 @@ class PostTrainingQuantization(object): self._out_scale_op_list = _out_scale_op_list self._quantized_weight_var_name = set() self._quantized_act_var_name = set() - self.weight_op_pairs = {} + self._weight_op_pairs = {} + # The vars for alog = KL + self._sampling_act_abs_min_max = {} + self._sampling_act_histogram = {} self._sampling_data = {} self._quantized_var_kl_threshold = {} + self._histogram_bins = 2048 + # The vars for algo = min_max self._quantized_var_min = {} self._quantized_var_max = {} + # The vars for algo = abs_max self._quantized_var_abs_max = {} def quantize(self): @@ -339,6 +336,25 @@ class PostTrainingQuantization(object): self._collect_target_varnames() self._set_activation_persistable() + if self._algo == "KL": + _logger.info("Preparation stage ...") + batch_id = 0 + for data in self._data_loader(): + self._executor.run(program=self._program, + 
feed=data, + fetch_list=self._fetch_list, + return_numpy=False, + scope=self._scope) + self._collect_activation_abs_min_max() + if batch_id % 5 == 0: + _logger.info("Run batch: " + str(batch_id)) + batch_id += 1 + if self._batch_nums and batch_id >= self._batch_nums: + break + _logger.info("Finish preparation stage, all batch:" + str(batch_id)) + self._init_sampling_act_histogram() + + _logger.info("Sampling stage ...") batch_id = 0 for data in self._data_loader(): self._executor.run(program=self._program, @@ -346,17 +362,13 @@ class PostTrainingQuantization(object): fetch_list=self._fetch_list, return_numpy=False, scope=self._scope) - if self._algo == "KL": - self._sample_data(batch_id) - else: - self._sample_threshold() - + self._sampling() if batch_id % 5 == 0: _logger.info("Run batch: " + str(batch_id)) batch_id += 1 if self._batch_nums and batch_id >= self._batch_nums: break - _logger.info("Finish all batch: " + str(batch_id)) + _logger.info("Finish sampling stage, all batch: " + str(batch_id)) self._reset_activation_persistable() @@ -397,6 +409,7 @@ class PostTrainingQuantization(object): target_vars=self._fetch_list, executor=self._executor, main_program=self._program) + _logger.info("The quantized model is saved in " + save_model_path) def _load_model_data(self): ''' @@ -454,7 +467,7 @@ class PostTrainingQuantization(object): for var_name in var_name_list: if var_name in persistable_var_names: self._quantized_weight_var_name.add(var_name) - self.weight_op_pairs[var_name] = op_type + self._weight_op_pairs[var_name] = op_type else: self._quantized_act_var_name.add(var_name) @@ -494,20 +507,18 @@ class PostTrainingQuantization(object): if var.name in self._quantized_act_var_name: var.persistable = False - def _sample_threshold(self): + def _sampling(self): ''' - Sample the input threshold(min, max, or abs_max) in every iterations. + Sample the min/max, abs_max or histogram in every iterations. 
''' - assert self._algo in ["abs_max", "min_max"], \ - "The algo should be abs_max or min_max for _sample_threshold." if self._algo == "abs_max": - self._sample_threshold_abs_max() + self._sample_abs_max() elif self._algo == "min_max": - self._sample_threshold_min_max() + self._sample_min_max() + elif self._algo == "KL": + self._sample_histogram() - def _sample_threshold_abs_max(self): - assert self._algo == "abs_max", \ - "The algo should be abs_max for _sample_threshold_abs_max." + def _sample_abs_max(self): # Only calculate abs_max value for weight for once if self._quantized_var_abs_max == {}: for var_name in self._quantized_weight_var_name: @@ -516,7 +527,7 @@ class PostTrainingQuantization(object): abs_max_value = float(np.max(np.abs(var_tensor))) elif self._weight_quantize_type == "channel_wise_abs_max": abs_max_value = [] - if self.weight_op_pairs[ + if self._weight_op_pairs[ var_name] in _channelwise_quant_axis1_ops: for i in range(var_tensor.shape[1]): abs_max_value.append( @@ -534,9 +545,7 @@ class PostTrainingQuantization(object): (abs_max_value > self._quantized_var_abs_max[var_name]): self._quantized_var_abs_max[var_name] = abs_max_value - def _sample_threshold_min_max(self): - assert self._algo == "min_max", \ - "The algo should be min_max for _sample_threshold_min_max." 
+ def _sample_min_max(self): if self._quantized_var_min == {} and self._quantized_var_max == {}: for var_name in self._quantized_weight_var_name: var_tensor = _load_variable_data(self._scope, var_name) @@ -546,7 +555,7 @@ class PostTrainingQuantization(object): elif self._weight_quantize_type == "channel_wise_abs_max": min_value = [] max_value = [] - if self.weight_op_pairs[ + if self._weight_op_pairs[ var_name] in _channelwise_quant_axis1_ops: for i in range(var_tensor.shape[1]): min_value.append(float(np.min(var_tensor[:, i]))) @@ -569,6 +578,14 @@ class PostTrainingQuantization(object): (max_value > self._quantized_var_max[var_name]): self._quantized_var_max[var_name] = max_value + def _sample_histogram(self): + for var_name in self._quantized_act_var_name: + var_tensor = _load_variable_data(self._scope, var_name) + var_tensor_abs = np.abs(var_tensor) + bins = self._sampling_act_histogram[var_name][1] + hist, _ = np.histogram(var_tensor_abs, bins=bins) + self._sampling_act_histogram[var_name][0] += hist + def _save_input_threhold(self): ''' Save input threshold to the quantized op. @@ -585,27 +602,36 @@ class PostTrainingQuantization(object): op._set_attr(var_name + ".max", self._quantized_var_max[var_name]) - def _sample_data(self, iter): + def _collect_activation_abs_min_max(self): ''' - Sample the tensor data of quantized variables, - applied in every iteration. + Collect the abs_min and abs_max for all activation. When algo = KL, + get the min and max value, and then calculate the threshold. ''' - assert self._algo == "KL", "The algo should be KL to sample data." 
- if self._is_use_cache_file: - for var_name in self._quantized_act_var_name: - var_tensor = _load_variable_data(self._scope, var_name) - var_tensor = var_tensor.ravel() - save_path = os.path.join( - self._cache_dir, - var_name.replace("/", ".") + "_" + str(iter) + ".npy") - np.save(save_path, var_tensor) - else: - for var_name in self._quantized_act_var_name: - if var_name not in self._sampling_data: - self._sampling_data[var_name] = [] - var_tensor = _load_variable_data(self._scope, var_name) - var_tensor = var_tensor.ravel() - self._sampling_data[var_name].append(var_tensor) + for var_name in self._quantized_act_var_name: + var_tensor = _load_variable_data(self._scope, var_name) + var_tensor = np.abs(var_tensor) + min_value = float(np.min(var_tensor)) + max_value = float(np.max(var_tensor)) + if var_name not in self._sampling_act_abs_min_max: + self._sampling_act_abs_min_max[ + var_name] = [min_value, max_value] + else: + if min_value < self._sampling_act_abs_min_max[var_name][0]: + self._sampling_act_abs_min_max[var_name][0] = min_value + if max_value > self._sampling_act_abs_min_max[var_name][1]: + self._sampling_act_abs_min_max[var_name][1] = max_value + + def _init_sampling_act_histogram(self): + ''' + Based on the min/max value, init the sampling_act_histogram. 
+ ''' + for var_name in self._quantized_act_var_name: + if var_name not in self._sampling_act_histogram: + min_val = self._sampling_act_abs_min_max[var_name][0] + max_val = self._sampling_act_abs_min_max[var_name][1] + hist, hist_edeges = np.histogram( + [], bins=self._histogram_bins, range=(min_val, max_val)) + self._sampling_act_histogram[var_name] = [hist, hist_edeges] def _calculate_kl_threshold(self): ''' @@ -621,7 +647,7 @@ class PostTrainingQuantization(object): weight_threshold = float(np.max(np.abs(weight_data))) elif self._weight_quantize_type == "channel_wise_abs_max": weight_threshold = [] - if self.weight_op_pairs[ + if self._weight_op_pairs[ var_name] in _channelwise_quant_axis1_ops: for i in range(weight_data.shape[1]): weight_threshold.append( @@ -632,25 +658,10 @@ class PostTrainingQuantization(object): float(np.max(np.abs(weight_data[i])))) self._quantized_var_kl_threshold[var_name] = weight_threshold - # KL threshold for activations - if self._is_use_cache_file: - for var_name in self._quantized_act_var_name: - sampling_data = [] - filenames = [f for f in os.listdir(self._cache_dir) \ - if re.match(var_name.replace("/", ".") + '_[0-9]+.npy', f)] - for filename in filenames: - file_path = os.path.join(self._cache_dir, filename) - sampling_data.append(np.load(file_path)) - os.remove(file_path) - sampling_data = np.concatenate(sampling_data) - self._quantized_var_kl_threshold[var_name] = \ - self._get_kl_scaling_factor(np.abs(sampling_data)) - else: - for var_name in self._quantized_act_var_name: - self._sampling_data[var_name] = np.concatenate( - self._sampling_data[var_name]) - self._quantized_var_kl_threshold[var_name] = \ - self._get_kl_scaling_factor(np.abs(self._sampling_data[var_name])) + for var_name in self._quantized_act_var_name: + hist, hist_edeges = self._sampling_act_histogram[var_name] + self._quantized_var_kl_threshold[var_name] = \ + self._get_kl_scaling_factor(hist, hist_edeges) def _update_program(self): ''' @@ -765,22 +776,15 @@ 
class PostTrainingQuantization(object): for var_name in out_var_names: analysis_and_save_info(op, var_name) - def _get_kl_scaling_factor(self, activation_blob, num_quantized_bins=255): + def _get_kl_scaling_factor(self, hist, hist_edeges, num_quantized_bins=255): ''' Using the KL-divergenc method to get the more precise scaling factor. ''' - max_val = np.max(activation_blob) - min_val = np.min(activation_blob) - if min_val >= 0: - hist, hist_edeges = np.histogram( - activation_blob, bins=2048, range=(min_val, max_val)) - ending_iter = 2047 - starting_iter = int(ending_iter * 0.7) - else: - _logger.error("Please first apply abs to activation_blob.") + ending_iter = self._histogram_bins - 1 + starting_iter = int(ending_iter * 0.7) bin_width = hist_edeges[1] - hist_edeges[0] - P_sum = len(np.array(activation_blob).ravel()) + P_sum = np.sum(np.array(hist).ravel()) min_kl_divergence = 0 min_kl_index = 0 kl_inited = False diff --git a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py index 75e1ea43d15e432d2f6cbec271acd67624de1e01..dadc756c43ecc782a72c1c7d6626e00bc182f2c6 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py @@ -299,11 +299,14 @@ class Quant2Int8MkldnnPass(object): # Convert int8 range weights to fp32 range weights scales = self._weight_scales[output_var_name] weight = self._load_param(self._scope, weight_var_name) - assert scales.size == 1 or scales.size == len( - weight - ), "The size of weight scales vector ({}) does not match the number of output channels ({}) in the weights tensor {}.".format( - scales.size, len(weight), weight_var_name) - w_fp32 = np.divide(np.multiply(weight, self._s8_max).T, scales.T).T + if scales.size == 1 or scales.size == weight.shape[0]: + w_fp32 = np.divide(np.multiply(weight, self._s8_max).T, scales.T).T + elif 
len(weight.shape) > 1 and scales.size == weight.shape[1]: + w_fp32 = np.divide(np.multiply(weight, self._s8_max), scales) + else: + raise ValueError( + "The size of weight scales vector ({}) does not match the dimensions ({}) of the weights tensor {}." + .format(scales.size, weight.shape, weight_var_name)) w_fp32 = w_fp32.reshape(weight.shape).astype(np.float32) self._restore_var(weight_var_name, w_fp32) diff --git a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt index 007d701284dfc7ff2cafb128984414517579fce3..6ac005060e0b21d88f17619bbe88b7a56c23fdb8 100644 --- a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt +++ b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt @@ -270,7 +270,7 @@ foreach(src ${TEST_OPS}) endforeach() # setting timeout value for old unittests -if(NOT WIN32) +if(NOT WIN32 AND NOT APPLE) set_tests_properties(test_post_training_quantization_mobilenetv1 PROPERTIES TIMEOUT 250 LABELS "RUN_TYPE=NIGHTLY") - set_tests_properties(test_post_training_quantization_resnet50 PROPERTIES TIMEOUT 200 LABELS "RUN_TYPE=NIGHTLY") + set_tests_properties(test_post_training_quantization_resnet50 PROPERTIES TIMEOUT 200 LABELS "RUN_TYPE=NIGHTLY") endif() diff --git a/python/paddle/fluid/contrib/slim/tests/convert_model2dot.py b/python/paddle/fluid/contrib/slim/tests/convert_model2dot.py index 877897c0a0e7282546727d56b54c0af506e18bc0..0018d81dbf248726186cf3170fa9f5d32fa785fd 100644 --- a/python/paddle/fluid/contrib/slim/tests/convert_model2dot.py +++ b/python/paddle/fluid/contrib/slim/tests/convert_model2dot.py @@ -19,6 +19,9 @@ import argparse import paddle.fluid as fluid from paddle.fluid.framework import IrGraph from paddle.fluid import core +import paddle + +paddle.enable_static() def parse_args(): diff --git a/python/paddle/fluid/contrib/slim/tests/quant2_int8_image_classification_comparison.py b/python/paddle/fluid/contrib/slim/tests/quant2_int8_image_classification_comparison.py index 
17e0f452e98220b2de97e9567311efeffdee27b4..3fba0e892184953b300a54dd8590e07e81bc5f2d 100644 --- a/python/paddle/fluid/contrib/slim/tests/quant2_int8_image_classification_comparison.py +++ b/python/paddle/fluid/contrib/slim/tests/quant2_int8_image_classification_comparison.py @@ -27,6 +27,8 @@ from paddle.fluid.framework import IrGraph from paddle.fluid.contrib.slim.quantization import Quant2Int8MkldnnPass from paddle.fluid import core +paddle.enable_static() + logging.basicConfig(format='%(asctime)s-%(levelname)s: %(message)s') _logger = logging.getLogger(__name__) _logger.setLevel(logging.INFO) diff --git a/python/paddle/fluid/contrib/slim/tests/quant2_int8_nlp_comparison.py b/python/paddle/fluid/contrib/slim/tests/quant2_int8_nlp_comparison.py index a534edb7efd51f5eb7fd0c20540d531a44a84f53..12d1cfcc41d53f1a4e979128631559f89c6c299b 100644 --- a/python/paddle/fluid/contrib/slim/tests/quant2_int8_nlp_comparison.py +++ b/python/paddle/fluid/contrib/slim/tests/quant2_int8_nlp_comparison.py @@ -25,6 +25,8 @@ from paddle.fluid.framework import IrGraph from paddle.fluid.contrib.slim.quantization import Quant2Int8MkldnnPass from paddle.fluid import core +paddle.enable_static() + logging.basicConfig(format='%(asctime)s-%(levelname)s: %(message)s') _logger = logging.getLogger(__name__) _logger.setLevel(logging.INFO) diff --git a/python/paddle/fluid/contrib/slim/tests/quant_int8_image_classification_comparison.py b/python/paddle/fluid/contrib/slim/tests/quant_int8_image_classification_comparison.py index 5f0a8f2d6fa9818481096249aaf74da27a852531..b81ef7b30ed4783133e46f7b895569db68438912 100644 --- a/python/paddle/fluid/contrib/slim/tests/quant_int8_image_classification_comparison.py +++ b/python/paddle/fluid/contrib/slim/tests/quant_int8_image_classification_comparison.py @@ -27,6 +27,8 @@ from paddle.fluid.framework import IrGraph from paddle.fluid.contrib.slim.quantization import QuantInt8MkldnnPass from paddle.fluid import core +paddle.enable_static() + 
logging.basicConfig(format='%(asctime)s-%(levelname)s: %(message)s') _logger = logging.getLogger(__name__) _logger.setLevel(logging.INFO) diff --git a/python/paddle/fluid/contrib/slim/tests/save_quant_model.py b/python/paddle/fluid/contrib/slim/tests/save_quant_model.py index dab4b63cda4cca8036b4236d44cb54660258c0d4..e38148250af2177801995d263dc6d3c9502bc501 100644 --- a/python/paddle/fluid/contrib/slim/tests/save_quant_model.py +++ b/python/paddle/fluid/contrib/slim/tests/save_quant_model.py @@ -27,6 +27,8 @@ from paddle.fluid.framework import IrGraph from paddle.fluid.contrib.slim.quantization import Quant2Int8MkldnnPass from paddle.fluid import core +paddle.enable_static() + def parse_args(): parser = argparse.ArgumentParser() diff --git a/python/paddle/fluid/contrib/slim/tests/test_graph.py b/python/paddle/fluid/contrib/slim/tests/test_graph.py index 2cf897ec418fa75a70cfa7fa3fe0a4b9e79d3c65..435cefd73e733379eb96821519a5687dfba50046 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_graph.py +++ b/python/paddle/fluid/contrib/slim/tests/test_graph.py @@ -22,6 +22,8 @@ import paddle.fluid as fluid from paddle.fluid.framework import IrGraph from paddle.fluid import core +paddle.enable_static() + os.environ["CUDA_VISIBLE_DEVICES"] = "0" os.environ["CPU_NUM"] = "1" diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py index 79b0bbd6a4dd3850f49aa0b5124e9be86d4e6ee3..df505cf2435e73d4c30f641451fb1225a21816c6 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py @@ -32,6 +32,8 @@ from paddle.fluid.dygraph.nn import Pool2D from paddle.fluid.dygraph.nn import Linear from paddle.fluid.log_helper import get_logger +paddle.enable_static() + os.environ["CPU_NUM"] = "1" if core.is_compiled_with_cuda(): fluid.set_flags({"FLAGS_cudnn_deterministic": True}) @@ -181,7 +183,6 @@ class 
TestImperativeQat(unittest.TestCase): img = fluid.dygraph.to_variable(x_data) label = fluid.dygraph.to_variable(y_data) - out = lenet(img) acc = fluid.layers.accuracy(out, label) loss = fluid.layers.cross_entropy(out, label) @@ -221,7 +222,7 @@ class TestImperativeQat(unittest.TestCase): model_dict = lenet.state_dict() fluid.save_dygraph(model_dict, "save_temp") - # test the correctness of `save_quantized_model` + # test the correctness of `paddle.jit.save` data = next(test_reader()) test_data = np.array([x[0].reshape(1, 28, 28) for x in data]).astype('float32') @@ -231,13 +232,14 @@ class TestImperativeQat(unittest.TestCase): # save inference quantized model path = "./mnist_infer_model" - imperative_qat.save_quantized_model( - dirname=path, - model=lenet, - input_shape=[(1, 28, 28)], - input_dtype=['float32'], - feed=[0], - fetch=[0]) + paddle.jit.save( + layer=lenet, + model_path=path, + input_spec=[ + paddle.static.InputSpec( + shape=[None, 1, 28, 28], dtype='float32') + ]) + if core.is_compiled_with_cuda(): place = core.CUDAPlace(0) else: @@ -245,7 +247,10 @@ class TestImperativeQat(unittest.TestCase): exe = fluid.Executor(place) [inference_program, feed_target_names, fetch_targets] = ( fluid.io.load_inference_model( - dirname=path, executor=exe)) + dirname=path, + executor=exe, + model_filename="__model__", + params_filename="__variables__")) after_save, = exe.run(inference_program, feed={feed_target_names[0]: test_data}, fetch_list=fetch_targets) @@ -332,13 +337,13 @@ class TestImperativeQat(unittest.TestCase): if batch_id % 100 == 0: _logger.info('{}: {}'.format('loss', avg_loss.numpy())) - imperative_qat.save_quantized_model( - dirname="./dynamic_mnist", - model=lenet, - input_shape=[(1, 28, 28)], - input_dtype=['float32'], - feed=[0], - fetch=[0]) + paddle.jit.save( + layer=lenet, + model_path="./dynamic_mnist", + input_spec=[ + paddle.static.InputSpec( + shape=[None, 1, 28, 28], dtype='float32') + ]) # static graph train _logger.info( diff --git 
a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py new file mode 100644 index 0000000000000000000000000000000000000000..80d388ac0da6219bda8e485aabaaf7fea44f6cd0 --- /dev/null +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py @@ -0,0 +1,430 @@ +# copyright (c) 2018 paddlepaddle authors. all rights reserved. +# +# licensed under the apache license, version 2.0 (the "license"); +# you may not use this file except in compliance with the license. +# you may obtain a copy of the license at +# +# http://www.apache.org/licenses/license-2.0 +# +# unless required by applicable law or agreed to in writing, software +# distributed under the license is distributed on an "as is" basis, +# without warranties or conditions of any kind, either express or implied. +# see the license for the specific language governing permissions and +# limitations under the license. + +from __future__ import print_function + +import os +import numpy as np +import random +import unittest +import logging +import paddle +import paddle.fluid as fluid +from paddle.fluid import core +from paddle.fluid.optimizer import AdamOptimizer +from paddle.fluid.framework import IrGraph +from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware +from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass +from paddle.fluid.dygraph.container import Sequential +from paddle.fluid.dygraph.nn import Conv2D +from paddle.fluid.dygraph.nn import Pool2D +from paddle.fluid.dygraph.nn import Linear +from paddle.fluid.log_helper import get_logger + +paddle.enable_static() + +os.environ["CPU_NUM"] = "1" +if core.is_compiled_with_cuda(): + fluid.set_flags({"FLAGS_cudnn_deterministic": True}) + +_logger = get_logger( + __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s') + + +def StaticLenet(data, num_classes=10, classifier_activation='softmax'): + 
conv2d_w1_attr = fluid.ParamAttr(name="conv2d_w_1") + conv2d_w2_attr = fluid.ParamAttr(name="conv2d_w_2") + fc_w1_attr = fluid.ParamAttr(name="fc_w_1") + fc_w2_attr = fluid.ParamAttr(name="fc_w_2") + fc_w3_attr = fluid.ParamAttr(name="fc_w_3") + conv2d_b1_attr = fluid.ParamAttr(name="conv2d_b_1") + conv2d_b2_attr = fluid.ParamAttr(name="conv2d_b_2") + fc_b1_attr = fluid.ParamAttr(name="fc_b_1") + fc_b2_attr = fluid.ParamAttr(name="fc_b_2") + fc_b3_attr = fluid.ParamAttr(name="fc_b_3") + conv1 = fluid.layers.conv2d( + data, + num_filters=6, + filter_size=3, + stride=1, + padding=1, + param_attr=conv2d_w1_attr, + bias_attr=conv2d_b1_attr) + pool1 = fluid.layers.pool2d( + conv1, pool_size=2, pool_type='max', pool_stride=2) + conv2 = fluid.layers.conv2d( + pool1, + num_filters=16, + filter_size=5, + stride=1, + padding=0, + param_attr=conv2d_w2_attr, + bias_attr=conv2d_b2_attr) + pool2 = fluid.layers.pool2d( + conv2, pool_size=2, pool_type='max', pool_stride=2) + + fc1 = fluid.layers.fc(input=pool2, + size=120, + param_attr=fc_w1_attr, + bias_attr=fc_b1_attr) + fc2 = fluid.layers.fc(input=fc1, + size=84, + param_attr=fc_w2_attr, + bias_attr=fc_b2_attr) + fc3 = fluid.layers.fc(input=fc2, + size=num_classes, + act=classifier_activation, + param_attr=fc_w3_attr, + bias_attr=fc_b3_attr) + + return fc3 + + +class ImperativeLenet(fluid.dygraph.Layer): + def __init__(self, num_classes=10, classifier_activation='softmax'): + super(ImperativeLenet, self).__init__() + conv2d_w1_attr = fluid.ParamAttr(name="conv2d_w_1") + conv2d_w2_attr = fluid.ParamAttr(name="conv2d_w_2") + fc_w1_attr = fluid.ParamAttr(name="fc_w_1") + fc_w2_attr = fluid.ParamAttr(name="fc_w_2") + fc_w3_attr = fluid.ParamAttr(name="fc_w_3") + conv2d_b1_attr = fluid.ParamAttr(name="conv2d_b_1") + conv2d_b2_attr = fluid.ParamAttr(name="conv2d_b_2") + fc_b1_attr = fluid.ParamAttr(name="fc_b_1") + fc_b2_attr = fluid.ParamAttr(name="fc_b_2") + fc_b3_attr = fluid.ParamAttr(name="fc_b_3") + self.features = Sequential( 
+ Conv2D( + num_channels=1, + num_filters=6, + filter_size=3, + stride=1, + padding=1, + param_attr=conv2d_w1_attr, + bias_attr=conv2d_b1_attr), + Pool2D( + pool_size=2, pool_type='max', pool_stride=2), + Conv2D( + num_channels=6, + num_filters=16, + filter_size=5, + stride=1, + padding=0, + param_attr=conv2d_w2_attr, + bias_attr=conv2d_b2_attr), + Pool2D( + pool_size=2, pool_type='max', pool_stride=2)) + + self.fc = Sequential( + Linear( + input_dim=400, + output_dim=120, + param_attr=fc_w1_attr, + bias_attr=fc_b1_attr), + Linear( + input_dim=120, + output_dim=84, + param_attr=fc_w2_attr, + bias_attr=fc_b2_attr), + Linear( + input_dim=84, + output_dim=num_classes, + act=classifier_activation, + param_attr=fc_w3_attr, + bias_attr=fc_b3_attr)) + + def forward(self, inputs): + x = self.features(inputs) + + x = fluid.layers.flatten(x, 1) + x = self.fc(x) + return x + + +class TestImperativeQat(unittest.TestCase): + """ + QAT = quantization-aware training + """ + + def test_qat_save(self): + imperative_qat = ImperativeQuantAware( + weight_quantize_type='channel_wise_abs_max', + activation_quantize_type='moving_average_abs_max') + + with fluid.dygraph.guard(): + lenet = ImperativeLenet() + imperative_qat.quantize(lenet) + adam = AdamOptimizer( + learning_rate=0.001, parameter_list=lenet.parameters()) + train_reader = paddle.batch( + paddle.dataset.mnist.train(), batch_size=32, drop_last=True) + test_reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=32) + + epoch_num = 1 + for epoch in range(epoch_num): + lenet.train() + for batch_id, data in enumerate(train_reader()): + x_data = np.array([x[0].reshape(1, 28, 28) + for x in data]).astype('float32') + y_data = np.array( + [x[1] for x in data]).astype('int64').reshape(-1, 1) + + img = fluid.dygraph.to_variable(x_data) + label = fluid.dygraph.to_variable(y_data) + out = lenet(img) + acc = fluid.layers.accuracy(out, label) + loss = fluid.layers.cross_entropy(out, label) + avg_loss = fluid.layers.mean(loss) + 
avg_loss.backward() + adam.minimize(avg_loss) + lenet.clear_gradients() + if batch_id % 100 == 0: + _logger.info( + "Train | At epoch {} step {}: loss = {:}, acc= {:}". + format(epoch, batch_id, + avg_loss.numpy(), acc.numpy())) + + lenet.eval() + for batch_id, data in enumerate(test_reader()): + x_data = np.array([x[0].reshape(1, 28, 28) + for x in data]).astype('float32') + y_data = np.array( + [x[1] for x in data]).astype('int64').reshape(-1, 1) + + img = fluid.dygraph.to_variable(x_data) + label = fluid.dygraph.to_variable(y_data) + + out = lenet(img) + acc_top1 = fluid.layers.accuracy( + input=out, label=label, k=1) + acc_top5 = fluid.layers.accuracy( + input=out, label=label, k=5) + + if batch_id % 100 == 0: + _logger.info( + "Test | At epoch {} step {}: acc1 = {:}, acc5 = {:}". + format(epoch, batch_id, + acc_top1.numpy(), acc_top5.numpy())) + + # save weights + model_dict = lenet.state_dict() + fluid.save_dygraph(model_dict, "save_temp") + + # test the correctness of `paddle.jit.save` + data = next(test_reader()) + test_data = np.array([x[0].reshape(1, 28, 28) + for x in data]).astype('float32') + test_img = fluid.dygraph.to_variable(test_data) + lenet.eval() + before_save = lenet(test_img) + + # save inference quantized model + path = "./mnist_infer_model" + paddle.jit.save( + layer=lenet, + model_path=path, + input_spec=[ + paddle.static.InputSpec( + shape=[None, 1, 28, 28], dtype='float32') + ]) + + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + else: + place = core.CPUPlace() + exe = fluid.Executor(place) + [inference_program, feed_target_names, fetch_targets] = ( + fluid.io.load_inference_model( + dirname=path, + executor=exe, + model_filename="__model__", + params_filename="__variables__")) + after_save, = exe.run(inference_program, + feed={feed_target_names[0]: test_data}, + fetch_list=fetch_targets) + + self.assertTrue( + np.allclose(after_save, before_save.numpy()), + msg='Failed to save the inference quantized model.') + + def 
test_qat_acc(self): + def _build_static_lenet(main, startup, is_test=False, seed=1000): + with fluid.unique_name.guard(): + with fluid.program_guard(main, startup): + main.random_seed = seed + startup.random_seed = seed + img = fluid.layers.data( + name='image', shape=[1, 28, 28], dtype='float32') + label = fluid.layers.data( + name='label', shape=[1], dtype='int64') + prediction = StaticLenet(img) + if not is_test: + loss = fluid.layers.cross_entropy( + input=prediction, label=label) + avg_loss = fluid.layers.mean(loss) + else: + avg_loss = prediction + return img, label, avg_loss + + reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=32, drop_last=True) + weight_quantize_type = 'channel_wise_abs_max' + activation_quant_type = 'moving_average_abs_max' + param_init_map = {} + seed = 1000 + lr = 0.1 + + # imperative train + _logger.info( + "--------------------------dynamic graph qat--------------------------" + ) + imperative_qat = ImperativeQuantAware( + weight_quantize_type=weight_quantize_type, + activation_quantize_type=activation_quant_type) + + with fluid.dygraph.guard(): + np.random.seed(seed) + fluid.default_main_program().random_seed = seed + fluid.default_startup_program().random_seed = seed + lenet = ImperativeLenet() + fixed_state = {} + for name, param in lenet.named_parameters(): + p_shape = param.numpy().shape + p_value = param.numpy() + if name.endswith("bias"): + value = np.zeros_like(p_value).astype('float32') + else: + value = np.random.normal( + loc=0.0, scale=0.01, size=np.product(p_shape)).reshape( + p_shape).astype('float32') + fixed_state[name] = value + param_init_map[param.name] = value + lenet.set_dict(fixed_state) + + imperative_qat.quantize(lenet) + adam = AdamOptimizer( + learning_rate=lr, parameter_list=lenet.parameters()) + dynamic_loss_rec = [] + lenet.train() + for batch_id, data in enumerate(reader()): + x_data = np.array([x[0].reshape(1, 28, 28) + for x in data]).astype('float32') + y_data = np.array( + [x[1] for x 
in data]).astype('int64').reshape(-1, 1) + + img = fluid.dygraph.to_variable(x_data) + label = fluid.dygraph.to_variable(y_data) + + out = lenet(img) + loss = fluid.layers.cross_entropy(out, label) + avg_loss = fluid.layers.mean(loss) + avg_loss.backward() + adam.minimize(avg_loss) + lenet.clear_gradients() + dynamic_loss_rec.append(avg_loss.numpy()[0]) + if batch_id % 100 == 0: + _logger.info('{}: {}'.format('loss', avg_loss.numpy())) + + paddle.jit.save( + layer=lenet, + model_path="./dynamic_mnist", + input_spec=[ + paddle.static.InputSpec( + shape=[None, 1, 28, 28], dtype='float32') + ]) + + # static graph train + _logger.info( + "--------------------------static graph qat--------------------------" + ) + static_loss_rec = [] + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + else: + place = core.CPUPlace() + exe = fluid.Executor(place) + + main = fluid.Program() + infer = fluid.Program() + startup = fluid.Program() + static_img, static_label, static_loss = _build_static_lenet( + main, startup, False, seed) + infer_img, _, infer_pre = _build_static_lenet(infer, startup, True, + seed) + with fluid.unique_name.guard(): + with fluid.program_guard(main, startup): + opt = AdamOptimizer(learning_rate=lr) + opt.minimize(static_loss) + + scope = core.Scope() + with fluid.scope_guard(scope): + exe.run(startup) + for param in main.all_parameters(): + param_tensor = scope.var(param.name).get_tensor() + param_tensor.set(param_init_map[param.name], place) + + main_graph = IrGraph(core.Graph(main.desc), for_test=False) + infer_graph = IrGraph(core.Graph(infer.desc), for_test=True) + transform_pass = QuantizationTransformPass( + scope=scope, + place=place, + activation_quantize_type=activation_quant_type, + weight_quantize_type=weight_quantize_type, + quantizable_op_type=['conv2d', 'depthwise_conv2d', 'mul']) + transform_pass.apply(main_graph) + transform_pass.apply(infer_graph) + build_strategy = fluid.BuildStrategy() + build_strategy.fuse_all_reduce_ops = 
False + binary = fluid.CompiledProgram(main_graph.graph).with_data_parallel( + loss_name=static_loss.name, build_strategy=build_strategy) + + feeder = fluid.DataFeeder( + feed_list=[static_img, static_label], place=place) + with fluid.scope_guard(scope): + for batch_id, data in enumerate(reader()): + loss_v, = exe.run(binary, + feed=feeder.feed(data), + fetch_list=[static_loss]) + static_loss_rec.append(loss_v[0]) + if batch_id % 100 == 0: + _logger.info('{}: {}'.format('loss', loss_v)) + + save_program = infer_graph.to_program() + with fluid.scope_guard(scope): + fluid.io.save_inference_model("./static_mnist", [infer_img.name], + [infer_pre], exe, save_program) + rtol = 1e-05 + atol = 1e-08 + for i, (loss_d, + loss_s) in enumerate(zip(dynamic_loss_rec, static_loss_rec)): + diff = np.abs(loss_d - loss_s) + if diff > (atol + rtol * np.abs(loss_s)): + _logger.info( + "diff({}) at {}, dynamic loss = {}, static loss = {}". + format(diff, i, loss_d, loss_s)) + break + + self.assertTrue( + np.allclose( + np.array(dynamic_loss_rec), + np.array(static_loss_rec), + rtol=rtol, + atol=atol, + equal_nan=True), + msg='Failed to do the imperative qat.') + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mnist.py b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mnist.py index 3ac1590b8aa6eaefbccd3907b314fb438386ffc6..3ea1c84f976a85850a2496218a248eb09ae20022 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mnist.py +++ b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mnist.py @@ -25,6 +25,8 @@ import paddle.fluid as fluid from paddle.dataset.common import download from paddle.fluid.contrib.slim.quantization import PostTrainingQuantization +paddle.enable_static() + random.seed(0) np.random.seed(0) diff --git a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py 
b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py index 864631ec27829e29aabb1a00a858cd0ce85e8389..18389d9433b9a5dd81e2f7e1725ce484a26d7a4a 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py +++ b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py @@ -26,6 +26,8 @@ import paddle.fluid as fluid from paddle.dataset.common import download from paddle.fluid.contrib.slim.quantization import PostTrainingQuantization +paddle.enable_static() + random.seed(0) np.random.seed(0) diff --git a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_resnet50.py b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_resnet50.py index a6c19b5e45a41ba8f30648befb44de5ad30d6fe8..12b5a2458a4da055710d4af08b97cdfff052ed8d 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_resnet50.py +++ b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_resnet50.py @@ -15,6 +15,9 @@ import sys import unittest from test_post_training_quantization_mobilenetv1 import TestPostTrainingQuantization +import paddle + +paddle.enable_static() class TestPostTrainingForResnet50(TestPostTrainingQuantization): diff --git a/python/paddle/fluid/contrib/slim/tests/test_quant2_int8_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quant2_int8_mkldnn_pass.py index fcbb1b66ad1fd73a152b9128fa75a152baecd223..7f9209c8b3ff8c20040bdd80bb4302f39c621546 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_quant2_int8_mkldnn_pass.py +++ b/python/paddle/fluid/contrib/slim/tests/test_quant2_int8_mkldnn_pass.py @@ -18,6 +18,9 @@ import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid.framework import IrGraph from paddle.fluid.contrib.slim.quantization import Quant2Int8MkldnnPass +import paddle + +paddle.enable_static() class TestQuant2Int8MkldnnPass(unittest.TestCase): @@ -43,7 +46,7 @@ 
class TestQuant2Int8MkldnnPass(unittest.TestCase): self.conv_output = np.ndarray(self.conv_output_size).astype(self.dtype) self.conv_output2 = np.ndarray(self.conv_output2_size).astype( self.dtype) - self.quantized_ops = 'conv2d' + self.quantized_ops = 'conv2d,mul' self.variables = { "input": self.input, "filter": self.filter, @@ -51,6 +54,22 @@ class TestQuant2Int8MkldnnPass(unittest.TestCase): "conv_output": self.conv_output, "conv_output2": self.conv_output2, } + self.mul_input_size = [1, 3] + self.mul_weights_size = [3, 5] + self.mul_output_size = [1, 5] + self.mul_input = np.random.random(self.mul_input_size).astype( + self.dtype) + self.mul_weights = np.ones(self.mul_weights_size, self.dtype) + self.mul_weights_bad = np.ones([1, 1], self.dtype) + self.mul_output = np.ndarray(self.mul_output_size).astype(self.dtype) + self.mul_output_scale = np.linspace(1, 5, num=5).astype(self.dtype) + + self.variables_mul = { + "mul_input": self.mul_input, + "mul_weights": self.mul_weights, + "mul_output": self.mul_output, + "mul_weights_bad": self.mul_weights_bad + } def prepare_program(self, program): block = program.global_block() @@ -92,6 +111,23 @@ class TestQuant2Int8MkldnnPass(unittest.TestCase): 'fuse_brelu': True }) + def prepare_program_mul(self, program): + block = program.global_block() + for name in self.variables_mul: + block.create_var( + name=name, + dtype="float32", + shape=self.variables_mul[name].shape) + + mul_op1 = block.append_op( + type="mul", + inputs={ + "X": block.var('mul_input'), + "Y": block.var('mul_weights') + }, + outputs={"Out": block.var('mul_output')}, + attrs={'use_mkldnn': self.use_mkldnn}) + def remove_fuse_activation_attribute(self, graph): for op in graph.all_op_nodes(): op.op().remove_attr("fuse_activation") @@ -103,11 +139,13 @@ class TestQuant2Int8MkldnnPass(unittest.TestCase): def check_graph_after_pass(self, graph): for op in graph.all_op_nodes(): - self.assertTrue(op.op().has_attr("fuse_activation")) - if 
op.op().has_attr("fuse_relu") and op.op().attr("fuse_relu"): - self.assertTrue(op.op().attr("fuse_activation") == "relu") - if op.op().has_attr("fuse_brelu") and op.op().attr("fuse_brelu"): - self.assertTrue(op.op().attr("fuse_activation") == "relu6") + if op.op().type() == "conv2d": + self.assertTrue(op.op().has_attr("fuse_activation")) + if op.op().has_attr("fuse_relu") and op.op().attr("fuse_relu"): + self.assertTrue(op.op().attr("fuse_activation") == "relu") + if op.op().has_attr("fuse_brelu") and op.op().attr( + "fuse_brelu"): + self.assertTrue(op.op().attr("fuse_activation") == "relu6") def test_quant_update_activation(self): program = fluid.Program() @@ -125,6 +163,39 @@ class TestQuant2Int8MkldnnPass(unittest.TestCase): graph = quant2_int8_mkldnn_pass._update_activations(graph) self.check_graph_after_pass(graph) + def test_dequantize_op_weights(self): + program = fluid.Program() + with fluid.program_guard(program): + self.prepare_program_mul(program) + graph = IrGraph(core.Graph(program.desc), for_test=True) + + for op in graph.all_op_nodes(): + if op.op().type() == "mul": + op_node = op + break + + qpass = Quant2Int8MkldnnPass( + self.quantized_ops, + _scope=self.scope, + _place=self.place, + _core=core, + _debug=False) + qpass._weight_scales["mul_output"] = self.mul_output_scale + param = self.scope.var("mul_weights").get_tensor() + param.set(self.variables_mul["mul_weights"], self.place) + qpass._dequantize_op_weights(graph, op_node, "Y", "Out") + + assert np.allclose( + self.scope.find_var("mul_weights").get_tensor(), + [[127, 63.5, 42.3333, 31.75, 25.4], + [127, 63.5, 42.3333, 31.75, 25.4], + [127, 63.5, 42.3333, 31.75, 25.4]]) + + param = self.scope.var("mul_weights").get_tensor() + param.set(self.variables_mul["mul_weights_bad"], self.place) + with self.assertRaises(ValueError): + qpass._dequantize_op_weights(graph, op_node, "Y", "Out") + if __name__ == '__main__': unittest.main() diff --git 
a/python/paddle/fluid/contrib/slim/tests/test_quantization_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quantization_mkldnn_pass.py index 3acbd8974195854da014990b13f3b1ba38e4c2c1..7ee0fd1d3e28f206b3c3a33fc0a2ceb25b0b4ab3 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_quantization_mkldnn_pass.py +++ b/python/paddle/fluid/contrib/slim/tests/test_quantization_mkldnn_pass.py @@ -25,6 +25,7 @@ from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass from paddle.fluid.contrib.slim.quantization import QuantInt8MkldnnPass from paddle.fluid import core +paddle.enable_static() os.environ["CPU_NUM"] = "1" diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py index dc9b83e44355342dde132f498354394fc9390af1..768a9ba7cfc3e769fe66c1deaffb1e60fc1a5689 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py +++ b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py @@ -27,6 +27,8 @@ from paddle.fluid.contrib.slim.quantization import TransformForMobilePass from paddle.fluid.contrib.slim.quantization import AddQuantDequantPass from paddle.fluid import core +paddle.enable_static() + os.environ["CUDA_VISIBLE_DEVICES"] = "0" os.environ["CPU_NUM"] = "1" diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantization_scale_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quantization_scale_pass.py index 9e8c5027ebbf9b365b2a8f7e80f56fb2d202fe97..b03281546a59b4118a5a32b131ea7f66b208e6f0 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_quantization_scale_pass.py +++ b/python/paddle/fluid/contrib/slim/tests/test_quantization_scale_pass.py @@ -27,6 +27,8 @@ from paddle.fluid.contrib.slim.quantization import OutScaleForInferencePass from paddle.fluid.contrib.slim.quantization import AddQuantDequantPass from paddle.fluid import core +paddle.enable_static() + os.environ["CUDA_VISIBLE_DEVICES"] = "0" 
os.environ["CPU_NUM"] = "1" diff --git a/python/paddle/fluid/contrib/slim/tests/test_user_defined_quantization.py b/python/paddle/fluid/contrib/slim/tests/test_user_defined_quantization.py index 32292c8a47b50bc5e7eb2d7833823e586eea8909..f03d0faa3981b5767eef1c5fde0f583f08686c13 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_user_defined_quantization.py +++ b/python/paddle/fluid/contrib/slim/tests/test_user_defined_quantization.py @@ -29,6 +29,8 @@ from paddle.fluid.contrib.slim.quantization import AddQuantDequantPass from paddle.fluid import core from paddle.fluid.layer_helper import LayerHelper +paddle.enable_static() + os.environ["CUDA_VISIBLE_DEVICES"] = "0" os.environ["CPU_NUM"] = "1" diff --git a/python/paddle/fluid/contrib/slim/tests/test_weight_quantization_mobilenetv1.py b/python/paddle/fluid/contrib/slim/tests/test_weight_quantization_mobilenetv1.py index ff22b1b61e68f9c7d364b34a3b6b185a766f8c64..1e8fa51d635e32d5d0169cf23ca0681051028ae9 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_weight_quantization_mobilenetv1.py +++ b/python/paddle/fluid/contrib/slim/tests/test_weight_quantization_mobilenetv1.py @@ -17,6 +17,9 @@ import os import time from paddle.dataset.common import download, DATA_HOME from paddle.fluid.contrib.slim.quantization import WeightQuantization +import paddle + +paddle.enable_static() class TestWeightQuantization(unittest.TestCase): diff --git a/python/paddle/fluid/contrib/tests/test_correlation.py b/python/paddle/fluid/contrib/tests/test_correlation.py new file mode 100644 index 0000000000000000000000000000000000000000..50b091415a52a2b2c09907e45435361cbc79795c --- /dev/null +++ b/python/paddle/fluid/contrib/tests/test_correlation.py @@ -0,0 +1,166 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +import paddle.fluid as fluid +from paddle.fluid.dygraph.base import to_variable +import paddle + +paddle.enable_static() + + +def corr(x_1, + x_2, + pad_size=4, + kernel_size=1, + max_displacement=4, + stride1=1, + stride2=1, + corr_multiply=1): + K = kernel_size + + rinput1 = np.pad(x_1, ((0, 0), (0, 0), (pad_size, pad_size), + (pad_size, pad_size)), + mode='constant') + rinput2 = np.pad(x_2, ((0, 0), (0, 0), (pad_size, pad_size), + (pad_size, pad_size)), + mode='constant') + rinput1 = np.transpose(rinput1, (0, 2, 3, 1)) + rinput2 = np.transpose(rinput2, (0, 2, 3, 1)) + B = int(rinput1.shape[0]) + H = int(x_1.shape[2]) + W = int(x_2.shape[3]) + d = max_displacement + D = 2 * d + 1 + output = np.zeros((B, D * D, H, W), dtype=np.float32) + + for b in range(B): + for i in range(H): + for j in range(W): + for k in range(-d, d + 1): + for l in range(-d, d + 1): + x1_index = i + pad_size + y1_index = j + pad_size + x2_index = x1_index + k + y2_index = y1_index + l + output[b, l + d + D * (k + d), i, j] = np.mean( + rinput1[b, x1_index:x1_index + K, y1_index:y1_index + + K] * rinput2[b, x2_index:x2_index + K, + y2_index:y2_index + K]) + + return output + + +class TestCorrelationOp(unittest.TestCase): + def test_check_output(self): + if not fluid.core.is_compiled_with_cuda(): + return + np.random.seed(13) + np.set_printoptions(threshold=np.inf) + x_shape = (2, 10, 3, 3) + x_type = 'float32' + x1 = fluid.layers.data( + name='x1', + shape=x_shape, + dtype=x_type, + append_batch_size=False, + stop_gradient=False) + x2 = 
fluid.layers.data( + name='x2', + shape=x_shape, + dtype=x_type, + append_batch_size=False, + stop_gradient=False) + + x1_np = np.random.randn(2, 3, 4, 5).astype(x_type) + x2_np = np.random.randn(2, 3, 4, 5).astype(x_type) + out_np = corr( + x1_np, + x2_np, + pad_size=4, + kernel_size=1, + max_displacement=4, + stride1=1, + stride2=1) + + out = fluid.contrib.correlation( + x1, + x2, + pad_size=4, + kernel_size=1, + max_displacement=4, + stride1=1, + stride2=1) + + loss = fluid.layers.reduce_mean(out) + optimizer = fluid.optimizer.Momentum(0.0001, 0.9) + optimizer.minimize(loss) + + place = fluid.CUDAPlace(0) + exe = fluid.Executor(place) + res = exe.run(feed={'x1': x1_np, + 'x2': x2_np}, + fetch_list=[out.name, loss.name]) + + self.assertTrue(np.allclose(res[0], out_np)) + + +class Net(fluid.dygraph.Layer): + def __init__(self, name_scope): + super(Net, self).__init__(name_scope) + + def forward(self, x1, x2): + y = fluid.contrib.correlation( + x1, + x2, + pad_size=4, + kernel_size=1, + max_displacement=4, + stride1=1, + stride2=1) + return y + + +class TestCorrelationOpDyGraph(unittest.TestCase): + def test_check_output(self): + if not fluid.core.is_compiled_with_cuda(): + return + np.random.seed(13) + np.set_printoptions(threshold=np.inf) + x_shape = (2, 10, 3, 3) + x_type = 'float32' + place = fluid.CUDAPlace(0) + with fluid.dygraph.guard(place): + x1_np = np.random.randn(2, 3, 4, 5).astype(x_type) + x2_np = np.random.randn(2, 3, 4, 5).astype(x_type) + out_np = corr( + x1_np, + x2_np, + pad_size=4, + kernel_size=1, + max_displacement=4, + stride1=1, + stride2=1) + + x1 = to_variable(x1_np) + x2 = to_variable(x2_np) + corr_pd = Net('corr_pd') + y = corr_pd(x1, x2) + out = y.numpy() + self.assertTrue(np.allclose(out, out_np)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/contrib/tests/test_fp16_utils.py b/python/paddle/fluid/contrib/tests/test_fp16_utils.py index 
e286bb0150e996de156eb2ab6d594b1e9c6dfe8d..0b51f2dcc869ea073eb05c908cb30963eb5c2033 100644 --- a/python/paddle/fluid/contrib/tests/test_fp16_utils.py +++ b/python/paddle/fluid/contrib/tests/test_fp16_utils.py @@ -16,6 +16,9 @@ import unittest import paddle.fluid as fluid from paddle.fluid import core from paddle.fluid.contrib.mixed_precision import fp16_utils +import paddle + +paddle.enable_static() class AMPTest(unittest.TestCase): diff --git a/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py b/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py index 5fb1dba40a3c69bd3419640a404c580c8375f215..1bf1a234834670d680e3f13a0206b17d216db8fd 100644 --- a/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py +++ b/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py @@ -25,6 +25,8 @@ import os import copy import numpy as np +paddle.enable_static() + def resnet_cifar10(input, depth=32): def conv_bn_layer(input, diff --git a/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py b/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py index 77fdf0087b93c3ad44a2492de68f8f57ce243ef3..342be7db3ed30d9b7d1af9133d289b933fb23c45 100644 --- a/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py +++ b/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py @@ -20,6 +20,9 @@ import paddle import paddle.fluid as fluid from paddle.fluid.contrib.quantize.quantize_transpiler import _original_var_name from paddle.fluid.contrib.quantize.quantize_transpiler import QuantizeTranspiler +import paddle + +paddle.enable_static() def linear_fc(num): diff --git a/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py b/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py index a5f08ca969ac43f47899395aeb588ddaf2f1e394..906d83fff4fd61390a68133170cb1c43f6b74251 100644 --- a/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py +++ 
b/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py @@ -21,6 +21,8 @@ import paddle import paddle.fluid as fluid import contextlib +paddle.enable_static() + def get_places(): places = [fluid.CPUPlace()] diff --git a/python/paddle/fluid/core.py b/python/paddle/fluid/core.py index a05aa3b0a84b57bb1f9ce00b0ad007280c316c6e..2e3bb6b00218a47467c59a2a88ea45ec80c32419 100644 --- a/python/paddle/fluid/core.py +++ b/python/paddle/fluid/core.py @@ -39,6 +39,11 @@ try: third_lib_path = current_path + os.sep + '..' + os.sep + 'libs' os.environ['path'] = third_lib_path + ';' + os.environ['path'] sys.path.insert(0, third_lib_path) + # Note: from python3.8, PATH will not take effect + # https://github.com/python/cpython/pull/12302 + # Use add_dll_directory to specify dll resolution path + if sys.version_info[:2] >= (3, 8): + os.add_dll_directory(third_lib_path) except ImportError as e: from .. import compat as cpt diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py index 45aa85d4168a55e206460ce2e39292013caa9ce0..5da83da33b8de334d4ae1e5b072cfb20d74c1271 100644 --- a/python/paddle/fluid/data_feeder.py +++ b/python/paddle/fluid/data_feeder.py @@ -132,6 +132,28 @@ def check_dtype(input_dtype, extra_message)) +def check_shape(shape, + op_name, + expected_shape_type=(list, tuple, Variable), + expected_element_type=(int, Variable), + expected_tensor_dtype=('int32', 'int64')): + # See NOTE [ Why skip dynamic graph check ] + if in_dygraph_mode(): + return + check_type(shape, 'shape', expected_shape_type, op_name) + if expected_element_type is not None and not isinstance(shape, Variable): + for item in shape: + check_type(item, 'element of shape', expected_element_type, op_name) + if expected_tensor_dtype is not None and isinstance(item, Variable): + check_dtype( + item.dtype, 'element of shape', expected_tensor_dtype, + op_name, + 'If element of shape is Tensor, its data type should be {}'. 
+ format(', '.join(expected_tensor_dtype))) + if expected_tensor_dtype is not None and isinstance(shape, Variable): + check_dtype(shape.dtype, 'shape', expected_tensor_dtype, op_name) + + class DataToLoDTensorConverter(object): def __init__(self, place, lod_level, shape, dtype): self.place = place diff --git a/python/paddle/fluid/dataloader/dataloader_iter.py b/python/paddle/fluid/dataloader/dataloader_iter.py index 6a996493e4df1e1facc6ccd205a8ae5105f92c5b..1ef0d494e0725084b0ddfddcafe93d49da0525d7 100644 --- a/python/paddle/fluid/dataloader/dataloader_iter.py +++ b/python/paddle/fluid/dataloader/dataloader_iter.py @@ -347,6 +347,92 @@ class _DataLoaderIterSingleProcess(_DataLoaderIterBase): return self.__next__() +# NOTE(chenweihang): _worker_loop must be top level method to be pickled +def _worker_loop(dataset, dataset_kind, indices_queue, out_queue, done_event, + collate_fn, init_fn, worker_id, num_workers, + use_shared_memory): + try: + # NOTE: [ mmap files clear ] When the child process exits unexpectedly, + # some shared memory objects may have been applied for but have not yet + # been put into the inter-process Queue. This part of the object needs + # to be cleaned up when the process ends. 
+ CleanupFuncRegistrar.register(_cleanup_mmap) + + # set signal handler + core._set_process_signal_handler() + + global _worker_info + _worker_info = WorkerInfo( + id=worker_id, num_workers=num_workers, dataset=dataset) + + init_exception = None + try: + if init_fn is not None: + init_fn(worker_id) + fetcher = _DatasetKind.create_fetcher(dataset_kind, dataset, + collate_fn, True) + except: + init_exception = Exception("init_fn failed in worker {}: " \ + "{}".format(worker_id, sys.exc_info())) + + iterator_drained = False + parent_watch_dog = ParentWatchDog() + + while parent_watch_dog.is_alive(): + try: + data = indices_queue.get(MP_INDICES_CHECK_INTERVAL) + except queue.Empty: + continue + + # None as poison piil, so worker event should be set + if data is None: + assert done_event.is_set() or iterator_drained, \ + "get None when worker done_event set" + break + # If worker done event is set but get still get data in + # indices_queue, remaining data should be get and skipped. + if done_event.is_set() or iterator_drained: + continue + + idx, indices = data + try: + if init_exception is not None: + batch = init_exception + init_exception = None + else: + batch = fetcher.fetch(indices) + except Exception as e: + if isinstance( + e, StopIteration) and dataset_kind == _DatasetKind.ITER: + out_queue.put(_IterableDatasetStopIteration(worker_id)) + iterator_drained = True + else: + out_queue.put((idx, e)) + else: + if use_shared_memory: + # FIXME(dkp): _convert_to_tensor_list only support np.array + # list now, should support paddle.Tensor list + if isinstance(batch[0][0], paddle.Tensor): + np_batch = [] + for sample in batch: + np_batch.append([s.numpy() for s in sample]) + batch = np_batch + + tensor_list = core._convert_to_tensor_list(batch) + out_queue.put((idx, tensor_list)) + core._remove_tensor_list_mmap_fds(tensor_list) + else: + out_queue.put((idx, batch)) + except KeyboardInterrupt: + # NOTE: Main process will raise KeyboardInterrupt anyways, ignore it in child 
process + pass + except: + six.reraise(*sys.exc_info()) + finally: + if use_shared_memory: + _cleanup_mmap() + + class _DataLoaderIterMultiProcess(_DataLoaderIterBase): def __init__(self, loader): super(_DataLoaderIterMultiProcess, self).__init__(loader) @@ -404,11 +490,11 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase): indices_queue = multiprocessing.Queue() self._indices_queues.append(indices_queue) worker = multiprocessing.Process( - target=self._worker_loop, + target=_worker_loop, args=(self._dataset, self._dataset_kind, indices_queue, self._data_queue, self._workers_done_event, self._collate_fn, self._worker_init_fn, i, - self._num_workers)) + self._num_workers, self._use_shared_memory)) worker.daemon = True worker.start() self._workers.append(worker) @@ -483,90 +569,6 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase): self._blocking_queue.kill() logging.error("DataLoader reader thread raised an exception!") - def _worker_loop(self, dataset, dataset_kind, indices_queue, out_queue, - done_event, collate_fn, init_fn, worker_id, num_workers): - try: - # NOTE: [ mmap files clear ] When the child process exits unexpectedly, - # some shared memory objects may have been applied for but have not yet - # been put into the inter-process Queue. This part of the object needs - # to be cleaned up when the process ends. 
- CleanupFuncRegistrar.register(_cleanup_mmap) - - # set signal handler - core._set_process_signal_handler() - - global _worker_info - _worker_info = WorkerInfo( - id=worker_id, num_workers=num_workers, dataset=dataset) - - init_exception = None - try: - if init_fn is not None: - init_fn(worker_id) - fetcher = _DatasetKind.create_fetcher(dataset_kind, dataset, - collate_fn, True) - except: - init_exception = Exception("init_fn failed in worker {}: " \ - "{}".format(worker_id, sys.exc_info())) - - iterator_drained = False - parent_watch_dog = ParentWatchDog() - - while parent_watch_dog.is_alive(): - try: - data = indices_queue.get(MP_INDICES_CHECK_INTERVAL) - except queue.Empty: - continue - - # None as poison piil, so worker event should be set - if data is None: - assert done_event.is_set() or iterator_drained, \ - "get None when worker done_event set" - break - # If worker done event is set but get still get data in - # indices_queue, remaining data should be get and skipped. - if done_event.is_set() or iterator_drained: - continue - - idx, indices = data - try: - if init_exception is not None: - batch = init_exception - init_exception = None - else: - batch = fetcher.fetch(indices) - except Exception as e: - if isinstance( - e, - StopIteration) and dataset_kind == _DatasetKind.ITER: - out_queue.put(_IterableDatasetStopIteration(worker_id)) - iterator_drained = True - else: - out_queue.put((idx, e)) - else: - if self._use_shared_memory: - # FIXME(dkp): _convert_to_tensor_list only support np.array - # list now, should support paddle.Tensor list - if isinstance(batch[0][0], paddle.Tensor): - np_batch = [] - for sample in batch: - np_batch.append([s.numpy() for s in sample]) - batch = np_batch - - tensor_list = core._convert_to_tensor_list(batch) - out_queue.put((idx, tensor_list)) - core._remove_tensor_list_mmap_fds(tensor_list) - else: - out_queue.put((idx, batch)) - except KeyboardInterrupt: - # NOTE: Main process will raise KeyboardInterrupt anyways, ignore it 
in child process - pass - except: - six.reraise(*sys.exc_info()) - finally: - if self._use_shared_memory: - _cleanup_mmap() - def _thread_loop(self): while not self._thread_done_event.is_set(): batch = self._get_data() diff --git a/python/paddle/fluid/dygraph/amp/loss_scaler.py b/python/paddle/fluid/dygraph/amp/loss_scaler.py index 8f3ca9ec007ef5c1ab8769dde741a5d2b3697600..ff57f30dcd2ec73d55ff06e751767deea0a2eead 100644 --- a/python/paddle/fluid/dygraph/amp/loss_scaler.py +++ b/python/paddle/fluid/dygraph/amp/loss_scaler.py @@ -210,13 +210,12 @@ class AmpScaler(object): def _unscale(self, optimizer): if not self._enable: return - inv_scale = 1.0 / self._scale param_grads = [ param._grad_ivar() for param in optimizer._parameter_list if param._grad_ivar() is not None ] - core.ops.amp_check_finite_and_scale(param_grads, inv_scale, param_grads, - self._found_inf) + core.ops.check_finite_and_unscale(param_grads, self._scale, param_grads, + self._found_inf) def _update(self): """ diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py index c548bdfeba19510b26c0f80d356fa6a6b7bbaed7..01c2f0fed496081400d363d9464360c69d924be8 100644 --- a/python/paddle/fluid/dygraph/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -23,8 +23,8 @@ from paddle.fluid import framework from paddle.fluid.multiprocess_utils import CleanupFuncRegistrar from .tracer import Tracer import logging -import objgraph from ..data_feeder import convert_dtype +import warnings __all__ = [ 'no_grad', 'no_grad_', 'grad', 'guard', 'enable_dygraph', 'disable_dygraph', @@ -367,24 +367,6 @@ def guard(place=None): yield -def _print_debug_msg(parameter_list, limit=5, is_test=False): - if not core._is_dygraph_debug_enabled(): - logging.warn( - 'Debug mode is not enabled. 
Please set FLAGS_dygraph_debug=1 to enable debug' - ) - return - unique_name_size = len(framework.unique_name.generator.ids) - tracer_var_size = len(parameter_list) - alive_cpp_var_size = len(core.VarBase._alive_vars()) - if not is_test: - logging.warn( - 'unique_name num: {}, tracer vars num: {}, alive cpp vars num: {}' - .format(unique_name_size, tracer_var_size, alive_cpp_var_size)) - objgraph.show_growth(limit=limit) - else: - return unique_name_size, tracer_var_size, alive_cpp_var_size - - @framework.dygraph_only def grad(outputs, inputs, @@ -609,10 +591,10 @@ def to_variable(value, name=None, zero_copy=None, dtype=None): uint8, uint16, complex64, complex128}. name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please - refer to :ref:`api_guide_Name` . + refer to :ref:`api_guide_Name` . zero_copy(bool, optional): Whether to share memory with the input numpy array. This parameter only works with CPUPlace and will be set to - True when it is None. Default: None. + True when it is None. Default: None. (Note: zero_copy is discarded temporally for some reason.) dtype(str, optional): The desired data type of returned ``Variable`` . Can be 'bool' , 'float16' , 'float32' , 'float64' , 'int8' , 'int16' , 'int32' , 'int64' , 'uint8' . Default: None. @@ -665,8 +647,17 @@ def to_variable(value, name=None, zero_copy=None, dtype=None): else: if isinstance(framework._current_expected_place(), framework.core.CPUPlace): - if zero_copy is None: - zero_copy = True + #TODO(zhiqiu): we found two problems when enable zero_copy on CPUPlace. + # (1): eigen requires 16-bytes alignments, but the data of numpy array may not statisfy. + # Details: https://eigen.tuxfamily.org/dox/group__TopicUnalignedArrayAssert.html + # (2): when used in flask framework, it may result in hang. + # Details: https://github.com/PaddlePaddle/Paddle/issues/26635 + # So, we temporally diable the zero_copy strategy. 
+ if zero_copy == True: + warnings.warn( + "Currently, zero_copy is not supported, and it will be discarded." + ) + zero_copy = False else: assert not zero_copy, "zero_copy mode can only be used with CPUPlace" diff --git a/python/paddle/fluid/dygraph/checkpoint.py b/python/paddle/fluid/dygraph/checkpoint.py index f4d68a798efa26d43702aa1c555f6046f0e6a6a5..93cb0bafc847b897816636f92255bd06b7e67321 100644 --- a/python/paddle/fluid/dygraph/checkpoint.py +++ b/python/paddle/fluid/dygraph/checkpoint.py @@ -16,13 +16,16 @@ from __future__ import print_function import os import collections +import functools from ..framework import Variable, default_main_program, in_dygraph_mode, dygraph_only, Parameter, ParamBase, _varbase_creator, _dygraph_tracer import pickle import six from . import learning_rate_scheduler import warnings from .. import core -from paddle.fluid.dygraph.io import VARIABLE_FILENAME, EXTRA_VAR_INFO_FILENAME, _load_persistable_vars +from .base import guard +from paddle.fluid.dygraph.jit import SaveLoadConfig, deprecate_save_load_configs +from paddle.fluid.dygraph.io import _construct_program_holders, _construct_params_and_buffers, EXTRA_VAR_INFO_FILENAME __all__ = [ 'save_dygraph', @@ -30,6 +33,37 @@ __all__ = [ ] +# NOTE(chenweihang): deprecate load_dygraph's argument keep_name_table, +# ensure compatibility when user still use keep_name_table argument +def deprecate_keep_name_table(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + def __warn_and_build_configs__(keep_name_table): + warnings.warn( + "The argument `keep_name_table` has deprecated, please use `SaveLoadConfig.keep_name_table`.", + DeprecationWarning) + config = SaveLoadConfig() + config.keep_name_table = keep_name_table + return config + + # deal with arg `keep_name_table` + if len(args) > 1 and isinstance(args[1], bool): + args = list(args) + args[1] = __warn_and_build_configs__(args[1]) + # deal with kwargs + elif 'keep_name_table' in kwargs: + kwargs['config'] = 
__warn_and_build_configs__(kwargs[ + 'keep_name_table']) + kwargs.pop('keep_name_table') + else: + # do nothing + pass + + return func(*args, **kwargs) + + return wrapper + + @dygraph_only def save_dygraph(state_dict, model_path): ''' @@ -100,17 +134,28 @@ def save_dygraph(state_dict, model_path): # TODO(qingqing01): remove dygraph_only to support loading static model. # maybe need to unify the loading interface after 2.0 API is ready. -#@dygraph_only -def load_dygraph(model_path, keep_name_table=False): +# @dygraph_only +@deprecate_save_load_configs +@deprecate_keep_name_table +def load_dygraph(model_path, config=None): ''' :api_attr: imperative - Load parameter state_dict from disk. + Load parameter state dict from disk. + + .. note:: + Due to some historical reasons, if you load ``state_dict`` from the saved + result of `paddle.io.save_inference_model`, the structured variable name + will cannot be restored. You need to set the argument `use_structured_name=False` + when using `Layer.set_state_dict` later. Args: - model_path(str) : The file prefix store the state_dict. (The path should Not contain suffix '.pdparams') - keep_name_table(bool, optional) : Whether keep structed name to parameter name conversion table in output dict. - Default : False + model_path(str) : The file prefix store the state_dict. + (The path should Not contain suffix '.pdparams') + config (SaveLoadConfig, optional): :ref:`api_imperative_jit_saveLoadConfig` + object that specifies additional configuration options, these options + are for compatibility with ``jit.save/io.save_inference_model`` formats. + Default None. Returns: state_dict(dict) : the dict store the state_dict @@ -118,23 +163,27 @@ def load_dygraph(model_path, keep_name_table=False): Examples: .. 
code-block:: python - import paddle.fluid as fluid + import paddle - with fluid.dygraph.guard(): - emb = fluid.dygraph.Embedding([10, 10]) + paddle.disable_static() - state_dict = emb.state_dict() - fluid.save_dygraph( state_dict, "paddle_dy") + emb = paddle.nn.Embedding([10, 10]) - adam = fluid.optimizer.Adam( learning_rate = fluid.layers.noam_decay( 100, 10000), - parameter_list = emb.parameters() ) - state_dict = adam.state_dict() - fluid.save_dygraph( state_dict, "paddle_dy") + state_dict = emb.state_dict() + paddle.save(state_dict, "paddle_dy") - para_state_dict, opti_state_dict = fluid.load_dygraph( "paddle_dy") + scheduler = paddle.optimizer.lr_scheduler.NoamLR( + d_model=0.01, warmup_steps=100, verbose=True) + adam = paddle.optimizer.Adam( + learning_rate=scheduler, + parameters=emb.parameters()) + state_dict = adam.state_dict() + paddle.save(state_dict, "paddle_dy") - ''' + para_state_dict, opti_state_dict = paddle.load("paddle_dy") + ''' + # deal with argument `model_path` model_prefix = model_path if model_prefix.endswith(".pdparams"): model_prefix = model_prefix[:-9] @@ -145,67 +194,12 @@ def load_dygraph(model_path, keep_name_table=False): opti_dict = None params_file_path = model_prefix + ".pdparams" opti_file_path = model_prefix + ".pdopt" - if not os.path.exists(params_file_path) and not os.path.exists( - opti_file_path): - # Load state dict by `jit.save` save format - # TODO(chenweihang): [Why not support `io.save_infernece_model` save format here] - # The model saved by `save_inference_model` does not completely correspond to - # the information required by the `state_dict` under the dygraph. - # Although we reluctantly restore the `state_dict` in some scenarios, - # this may not be complete and there are some limitations, so this function - # will be considered later. The limitations include: - # 1. 
`save_inference_model` not save structured name, we need to remind - # the user to configure the `use_structured_name` argument when `set_dict`, - # but this argument is currently not public - # 2. if `save_inference_model` save all persistable variables in a single file, - # user need to give the variable name list to load `state_dict` - - # 1. check model path - if not os.path.isdir(model_prefix): - raise ValueError("Model saved directory '%s' is not exists." % - model_prefix) - # 2. load `__variables.info__` - var_info_path = os.path.join(model_prefix, EXTRA_VAR_INFO_FILENAME) - if not os.path.exists(var_info_path): - raise RuntimeError( - "No target can be loaded. Now only supports loading `state_dict` from " - "the result saved by `imperative.save` and `imperative.jit.save`." - ) - with open(var_info_path, 'rb') as f: - extra_var_info = pickle.load(f) - # 3. load `__variables__` - # TODO(chenweihang): now only supports loading from default save format: - # - all persistable vars saved in one file named `__variables__` - # for other case, we may need to modify the arguments of this API - var_file_path = os.path.join(model_prefix, VARIABLE_FILENAME) - if not os.path.exists(var_file_path): - raise RuntimeError( - "The parameter file to be loaded was not found. " - "Now only supports loading from the default save format, " - "and does not support custom params_filename and " - "save parameters separately.") - # 4. load all persistable vars - load_var_list = [] - for name in sorted(extra_var_info): - var = _varbase_creator(name=name, persistable=True) - load_var_list.append(var) - _dygraph_tracer().trace_op( - type='load_combine', - inputs={}, - outputs={'Out': load_var_list}, - attrs={'file_path': var_file_path}) - # 5. 
construct state_dict - para_dict = dict() - for var in load_var_list: - structured_name = extra_var_info[var.name].get('structured_name', - None) - if structured_name is None: - raise RuntimeError( - "Cannot find saved variable (%s)'s structured name in saved model.", - var.name) - para_dict[structured_name] = var.numpy() - # NOTE: `jit.save` doesn't save optimizer state - else: + + # deal with argument `config` + if config is None: + config = SaveLoadConfig() + + if os.path.exists(params_file_path) or os.path.exists(opti_file_path): # Load state dict by `save_dygraph` save format para_dict = {} if os.path.exists(params_file_path): @@ -213,12 +207,103 @@ def load_dygraph(model_path, keep_name_table=False): para_dict = pickle.load(f) if six.PY2 else pickle.load( f, encoding='latin1') - if not keep_name_table and "StructuredToParameterName@@" in para_dict: + if not config.keep_name_table and "StructuredToParameterName@@" in para_dict: del para_dict["StructuredToParameterName@@"] if os.path.exists(opti_file_path): with open(opti_file_path, 'rb') as f: opti_dict = pickle.load(f) if six.PY2 else pickle.load( f, encoding='latin1') + else: + # check model path + if not os.path.isdir(model_prefix): + raise ValueError("Model saved directory '%s' is not exists." % + model_prefix) + + # check whether model file exists + if config.model_filename is None: + model_filename = '__model__' + else: + model_filename = config.model_filename + model_file_path = os.path.join(model_path, model_filename) + + if os.path.exists(model_file_path): + # Load state dict by `jit.save/io.save_inference_model` save format + # NOTE(chenweihang): [ Compatibility of save_inference_model save format ] + # The model saved by `save_inference_model` does not completely correspond to + # the information required by the `state_dict` under the dygraph. 
+ # `save_inference_model` not save structured name, we need to remind + # the user to configure the `use_structured_name` argument when `set_state_dict` + # NOTE(chenweihang): `jit.save` doesn't save optimizer state + + # 1. load program desc & construct _ProgramHolder + programs = _construct_program_holders(model_path, + config.model_filename) + + # 2. load layer parameters & buffers + # NOTE: using fluid.dygraph.guard() here will cause import error in py2 + with guard(): + persistable_var_dict = _construct_params_and_buffers( + model_prefix, + programs, + config.separate_params, + config.params_filename, + append_suffix=False) + + # 3. construct state_dict + para_dict = dict() + for var_name in persistable_var_dict: + para_dict[var_name] = persistable_var_dict[var_name].numpy() + + # if __variables.info__ exists, we can recover structured_name + var_info_path = os.path.join(model_prefix, + EXTRA_VAR_INFO_FILENAME) + if os.path.exists(var_info_path): + with open(var_info_path, 'rb') as f: + extra_var_info = pickle.load(f) + structured_para_dict = dict() + for var_name in para_dict: + structured_name = extra_var_info[var_name].get( + 'structured_name', None) + assert structured_name is not None, "Cannot find saved variable (%s)'s structured name in saved model." % var_name + structured_para_dict[structured_name] = para_dict[ + var_name] + para_dict = structured_para_dict + else: + # load state dict by `io.save_params/persistables` save format + # TODO(chenweihang): [ Now only supports loading parameters seperately ] + # If users save all parameters as one file, the [ variable.name -> variable ] + # mapping info will lost, so users need to give variable list, but users build + # variable list in dygraph mode is difficult, we recommend users to use + # paddle.io.load_program_state in this case + + # Try to load all the files in the directory in VarBase format, + # the file name is used as the name of VarBase + load_var_list = [] + + # 1. 
load file names + var_name_list = [] + for root, _, files in os.walk(model_path): + for filename in files: + file_path = os.path.join(root, filename) + tmp_var_name = os.path.relpath(file_path, model_path) + var_name = tmp_var_name.replace("\\", "/") + var_name_list.append(var_name) + + # 2. create and load VarBase + with guard(): + for name in var_name_list: + new_var = _varbase_creator(name=name, persistable=True) + _dygraph_tracer().trace_op( + type='load', + inputs={}, + outputs={'Out': new_var}, + attrs={'file_path': os.path.join(model_path, name)}) + load_var_list.append(new_var) + + # 3. construct state_dict + para_dict = dict() + for var in load_var_list: + para_dict[var.name] = var.numpy() return para_dict, opti_dict diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py index 5152799ca72f1461d6fbfc3a619a6aa9b9477934..5050067e48a1b147d43abd955c64c7fbb8cf6068 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py @@ -60,7 +60,7 @@ class DygraphToStaticAst(gast.NodeTransformer): def transfer_from_node_type(self, node_wrapper): translator_logger = logging_utils.TranslatorLogger() translator_logger.log( - 1, " Source code: \n{}".format(ast_to_source_code(self.root))) + 1, "Source code: \n{}".format(ast_to_source_code(self.root))) # Generic transformation self.visit(node_wrapper.node) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py b/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py index 4630cfcdabfd307ea03a7fd0c885c73ce4a4ea0b..c837c8eb123c2707d89a75a7489607f43a2e7501 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py @@ -31,6 +31,7 @@ from paddle.fluid.dygraph.dygraph_to_static.convert_operators import convert_len from 
paddle.fluid.dygraph.dygraph_to_static.logging_utils import TranslatorLogger from paddle.fluid.dygraph.dygraph_to_static.program_translator import StaticLayer from paddle.fluid.dygraph.dygraph_to_static.program_translator import convert_to_static +from paddle.fluid.dygraph.dygraph_to_static.program_translator import unwrap_decorators from paddle.fluid.dygraph.layers import Layer # TODO(liym27): A better way to do this. @@ -118,14 +119,9 @@ def convert_call(func): func_self = None converted_call = None - # Function in convert_call may be decorated by another `@declarative`, + # Function in convert_call may be decorated by another `@to_static`, # in this case, unwraps it into a raw method or function. - if isinstance(func, StaticLayer): - instance = func._class_instance - if instance is not None: - func = func.dygraph_function.__get__(instance) - else: - func = func.dygraph_function + _, func = unwrap_decorators(func) if is_builtin_len(func): return convert_len @@ -155,7 +151,8 @@ def convert_call(func): if inspect.isfunction(fn): global_functions.add(fn) elif isinstance(fn, StaticLayer): - global_functions.add(fn.dygraph_function) + _, fn = unwrap_decorators(fn) + global_functions.add(fn) if func in global_functions: converted_call = convert_to_static(func) @@ -189,7 +186,8 @@ def convert_call(func): elif hasattr(func, '__class__') and hasattr(func.__class__, '__call__'): if hasattr(func, 'forward') and isinstance(func, Layer): try: - forward_func = convert_to_static(func.forward) + _, forward_func = unwrap_decorators(func.forward) + forward_func = convert_to_static(forward_func) setattr(func, 'forward', forward_func) func_self = func except Exception: diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/error.py b/python/paddle/fluid/dygraph/dygraph_to_static/error.py index 5aba7ca0fdc0cfda5d79f5a66d78785df49c0baf..be21ab6d5394ed5f89c23988a9405b57e05b56fb 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/error.py +++ 
b/python/paddle/fluid/dygraph/dygraph_to_static/error.py @@ -13,6 +13,7 @@ # limitations under the License. import os +import six import sys import traceback @@ -20,6 +21,14 @@ from paddle.fluid.dygraph.dygraph_to_static.origin_info import Location, OriginI ERROR_DATA = "Error data about original source code information and traceback." +# A flag to set whether to open the dygraph2static error reporting module +SIMPLIFY_ERROR_ENV_NAME = "TRANSLATOR_SIMPLIFY_NEW_ERROR" +DEFAULT_SIMPLIFY_NEW_ERROR = 1 + +# A flag to set whether to display the simplified error stack +DISABLE_ERROR_ENV_NAME = "TRANSLATOR_DISABLE_NEW_ERROR" +DEFAULT_DISABLE_NEW_ERROR = 0 + def attach_error_data(error, in_runtime=False): """ @@ -103,7 +112,10 @@ class ErrorData(object): # Simplify error value to improve readability if error is raised in runtime if self.in_runtime: - self._simplify_error_value() + if int( + os.getenv(SIMPLIFY_ERROR_ENV_NAME, + DEFAULT_SIMPLIFY_NEW_ERROR)): + self._simplify_error_value() message_lines.append(str(self.error_value)) return '\n'.join(message_lines) @@ -150,3 +162,22 @@ class ErrorData(object): error_value_str = '\n'.join(error_value_lines) self.error_value = self.error_type(error_value_str) + + def raise_new_exception(self): + + # Raises the origin error if disable dygraph2static error module, + if int(os.getenv(DISABLE_ERROR_ENV_NAME, DEFAULT_DISABLE_NEW_ERROR)): + raise + + new_exception = self.create_exception() + if six.PY3: + # NOTE(liym27): + # 1. Why `raise new_exception from None`? + # In Python 3, by default, an new exception is raised with trace information of the caught exception. + # This only raises new_exception and hides unwanted implementation details from tracebacks of the + # caught exception. + # 2. Use exec to bypass syntax error checking in Python 2. 
+ + six.exec_("raise new_exception from None") + else: + raise new_exception diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py b/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py index 90e38bd98863ff62174bd569a483b11984480b5a..3d1ed836ff1aca38d796f3a247e9a5d6f6cf3add 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/function_spec.py @@ -12,17 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. -import logging import six import inspect import numpy as np import collections + import paddle from paddle.fluid import core from paddle.fluid.dygraph import layers from paddle.fluid.layers.utils import flatten from paddle.fluid.layers.utils import pack_sequence_as from paddle.fluid.dygraph.base import switch_to_static_graph +from paddle.fluid.dygraph.dygraph_to_static import logging_utils from paddle.fluid.dygraph.dygraph_to_static.utils import parse_arg_and_kwargs from paddle.fluid.dygraph.dygraph_to_static.utils import type_name from paddle.fluid.dygraph.dygraph_to_static.utils import func_to_source_code @@ -135,6 +136,11 @@ class FunctionSpec(object): input_with_spec = pack_sequence_as(args, input_with_spec) + # If without specificing name in input_spec, add default name + # according to argument name from decorated function. + input_with_spec = replace_spec_empty_name(self._arg_names, + input_with_spec) + return input_with_spec @switch_to_static_graph @@ -286,7 +292,7 @@ def convert_to_input_spec(inputs, input_spec): if len(inputs) > len(input_spec): for rest_input in inputs[len(input_spec):]: if isinstance(rest_input, (core.VarBase, np.ndarray)): - logging.warning( + logging_utils.warn( "The inputs constain `{}` without specificing InputSpec, its shape and dtype will be treated immutable. " "Please specific InputSpec information in `@declarative` if you expect them as mutable inputs.". 
format(type_name(rest_input))) @@ -309,3 +315,61 @@ def convert_to_input_spec(inputs, input_spec): raise TypeError( "The type(input_spec) should be a `InputSpec` or dict/list/tuple of it, but received {}.". type_name(input_spec)) + + +def replace_spec_empty_name(args_name, input_with_spec): + """ + Adds default name according to argument name from decorated function + if without specificing InputSpec.name + + The naming rule are as followed: + 1. If InputSpec.name is not None, do nothing. + 2. If each argument `x` corresponds to an InputSpec, using the argument name like `x` + 3. If the arguments `inputs` corresponds to a list(InputSpec), using name like `inputs_0`, `inputs_1` + 4. If the arguments `input_dic` corresponds to a dict(InputSpec), using key as name. + + For example: + + # case 1: foo(x, y) + foo = to_static(foo, input_spec=[InputSpec([None, 10]), InputSpec([None])]) + print([in_var.name for in_var in foo.inputs]) # [x, y] + + # case 2: foo(inputs) where inputs is a list + foo = to_static(foo, input_spec=[[InputSpec([None, 10]), InputSpec([None])]]) + print([in_var.name for in_var in foo.inputs]) # [inputs_0, inputs_1] + + # case 3: foo(inputs) where inputs is a dict + foo = to_static(foo, input_spec=[{'x': InputSpec([None, 10]), 'y': InputSpec([None])}]) + print([in_var.name for in_var in foo.inputs]) # [x, y] + """ + input_with_spec = list(input_with_spec) + candidate_arg_names = args_name[:len(input_with_spec)] + + for i, arg_name in enumerate(candidate_arg_names): + input_spec = input_with_spec[i] + input_with_spec[i] = _replace_spec_name(arg_name, input_spec) + + return input_with_spec + + +def _replace_spec_name(name, input_spec): + """ + Replaces InputSpec.name with given `name` while not specificing it. 
+ """ + if isinstance(input_spec, paddle.static.InputSpec): + if input_spec.name is None: + input_spec.name = name + return input_spec + elif isinstance(input_spec, (list, tuple)): + processed_specs = [] + for i, spec in enumerate(input_spec): + new_name = "{}_{}".format(name, i) + processed_specs.append(_replace_spec_name(new_name, spec)) + return processed_specs + elif isinstance(input_spec, dict): + processed_specs = {} + for key, spec in six.iteritems(input_spec): + processed_specs[key] = _replace_spec_name(key, spec) + return processed_specs + else: + return input_spec diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/logging_utils.py b/python/paddle/fluid/dygraph/dygraph_to_static/logging_utils.py index c52872b15016169504359b54ad5a40360e244ce0..4d9ed5916adfd79013be1d8d1bb90f3c44428b49 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/logging_utils.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/logging_utils.py @@ -26,6 +26,8 @@ CODE_LEVEL_ENV_NAME = 'TRANSLATOR_CODE_LEVEL' DEFAULT_VERBOSITY = -1 DEFAULT_CODE_LEVEL = -1 +LOG_AllTransformer = 100 + def synchronized(func): def wrapper(*args, **kwargs): @@ -53,10 +55,15 @@ class TranslatorLogger(object): return self._initialized = True + self.logger_name = "Dynamic-to-Static" self._logger = log_helper.get_logger( - __name__, 1, fmt='%(asctime)s-%(levelname)s: %(message)s') + self.logger_name, + 1, + fmt='%(asctime)s %(name)s %(levelname)s: %(message)s') self._verbosity_level = None self._transformed_code_level = None + self._need_to_echo_log_to_stdout = None + self._need_to_echo_code_to_stdout = None @property def logger(self): @@ -86,6 +93,28 @@ class TranslatorLogger(object): self.check_level(level) self._transformed_code_level = level + @property + def need_to_echo_log_to_stdout(self): + if self._need_to_echo_log_to_stdout is not None: + return self._need_to_echo_log_to_stdout + return False + + @need_to_echo_log_to_stdout.setter + def need_to_echo_log_to_stdout(self, log_to_stdout): 
+ assert isinstance(log_to_stdout, (bool, type(None))) + self._need_to_echo_log_to_stdout = log_to_stdout + + @property + def need_to_echo_code_to_stdout(self): + if self._need_to_echo_code_to_stdout is not None: + return self._need_to_echo_code_to_stdout + return False + + @need_to_echo_code_to_stdout.setter + def need_to_echo_code_to_stdout(self, code_to_stdout): + assert isinstance(code_to_stdout, (bool, type(None))) + self._need_to_echo_code_to_stdout = code_to_stdout + def check_level(self, level): if isinstance(level, (six.integer_types, type(None))): rv = level @@ -110,34 +139,56 @@ class TranslatorLogger(object): def error(self, msg, *args, **kwargs): self.logger.error(msg, *args, **kwargs) + if self.need_to_echo_log_to_stdout: + self._output_to_stdout('ERROR: ' + msg, *args) def warn(self, msg, *args, **kwargs): - self.logger.warn(msg, *args, **kwargs) + self.logger.warning(msg, *args, **kwargs) + if self.need_to_echo_log_to_stdout: + self._output_to_stdout('WARNING: ' + msg, *args) def log(self, level, msg, *args, **kwargs): if self.has_verbosity(level): - self.logger.log(level, msg, *args, **kwargs) + msg_with_level = '(Level {}) {}'.format(level, msg) + self.logger.info(msg_with_level, *args, **kwargs) + if self.need_to_echo_log_to_stdout: + self._output_to_stdout('INFO: ' + msg_with_level, *args) def log_transformed_code(self, level, ast_node, transformer_name, *args, **kwargs): if self.has_code_level(level): source_code = ast_to_source_code(ast_node) - header_msg = "After the level {} ast transformer: '{}', the transformed code:\n"\ - .format(level, transformer_name) + if level == LOG_AllTransformer: + header_msg = "After the last level ast transformer: '{}', the transformed code:\n" \ + .format(transformer_name) + else: + header_msg = "After the level {} ast transformer: '{}', the transformed code:\n"\ + .format(level, transformer_name) msg = header_msg + source_code self.logger.info(msg, *args, **kwargs) + if self.need_to_echo_code_to_stdout: + 
self._output_to_stdout('INFO: ' + msg, *args) + + def _output_to_stdout(self, msg, *args): + msg = self.logger_name + ' ' + msg + print(msg % args) + _TRANSLATOR_LOGGER = TranslatorLogger() -def set_verbosity(level=0): +def set_verbosity(level=0, also_to_stdout=False): """ - Sets the verbosity level of log for dygraph to static graph. + Sets the verbosity level of log for dygraph to static graph. Logs can be output to stdout by setting `also_to_stdout`. + There are two means to set the logging verbosity: - 1. Call function `set_verbosity` - 2. Set environment variable `TRANSLATOR_VERBOSITY` + + 1. Call function `set_verbosity` + + 2. Set environment variable `TRANSLATOR_VERBOSITY` + **Note**: `set_verbosity` has a higher priority than the environment variable. @@ -145,6 +196,7 @@ def set_verbosity(level=0): Args: level(int): The verbosity level. The larger value idicates more verbosity. The default value is 0, which means no logging. + also_to_stdout(bool): Whether to also output log messages to `sys.stdout`. Examples: .. code-block:: python @@ -159,27 +211,30 @@ def set_verbosity(level=0): # The verbosity level is now 3, but it has no effect because it has a lower priority than `set_verbosity` """ _TRANSLATOR_LOGGER.verbosity_level = level + _TRANSLATOR_LOGGER.need_to_echo_log_to_stdout = also_to_stdout def get_verbosity(): return _TRANSLATOR_LOGGER.verbosity_level -LOG_AllTransformer = 100 - - -def set_code_level(level=LOG_AllTransformer): +def set_code_level(level=LOG_AllTransformer, also_to_stdout=False): """ - Sets the level to print code from specific level of Ast Transformer. + Sets the level to print code from specific level Ast Transformer. Code can be output to stdout by setting `also_to_stdout`. + There are two means to set the code level: - 1. Call function `set_code_level` - 2. Set environment variable `TRANSLATOR_CODE_LEVEL` + + 1. Call function `set_code_level` + + 2. 
Set environment variable `TRANSLATOR_CODE_LEVEL` + **Note**: `set_code_level` has a higher priority than the environment variable. Args: level(int): The level to print code. Default is 100, which means to print the code after all AST Transformers. + also_to_stdout(bool): Whether to also output code to `sys.stdout`. Examples: .. code-block:: python @@ -195,6 +250,7 @@ def set_code_level(level=LOG_AllTransformer): """ _TRANSLATOR_LOGGER.transformed_code_level = level + _TRANSLATOR_LOGGER.need_to_echo_code_to_stdout = also_to_stdout def get_code_level(): diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py b/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py index 13f38b0726c27566ff0eda41d6c365e6a7e4aa4b..76e732d4d37f6a2056afba72649077acf16ba30e 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py @@ -124,8 +124,13 @@ class OriginInfoAttacher(gast.NodeTransformer): def _abs_lineno(self, node): # NOTE(liym27): - # If the first gast.FunctionDef has decorator, its lineno is 1, which - # equals to the lineno of the first decorator node. + # There are differences in ast_node.lineno between PY3.8+ and PY3.8-. + # If the first gast.FunctionDef has decorator, the lineno of gast.FunctionDef is differs. + # 1. < PY3.8 + # its lineno equals to the lineno of the first decorator node, which is not right. + # 2. >= PY3.8 + # its lineno is the actual lineno, which is right. 
+ return self.lineno_offset + node.lineno def _abs_col_offset(self, node): diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py index 59cb5fb144eb50f4616c94ed78348d56a4029834..1004665ca15fbc2458c1626735f161c7f4904596 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py @@ -14,21 +14,17 @@ from __future__ import print_function import numpy as np -import logging import six -from paddle.fluid import log_helper from paddle.fluid import framework, backward, core from paddle.fluid.dygraph import layers from paddle.fluid.dygraph.base import switch_to_static_graph +from paddle.fluid.dygraph.dygraph_to_static import logging_utils from paddle.fluid.dygraph.dygraph_to_static.return_transformer import RETURN_NO_VALUE_MAGIC_NUM from paddle.fluid.layers.utils import flatten from paddle.fluid.layers.utils import pack_sequence_as import paddle.compat as cpt -_logger = log_helper.get_logger( - __name__, logging.WARNING, fmt='%(asctime)s-%(levelname)s: %(message)s') - class NestSequence(object): """ @@ -72,7 +68,7 @@ class NestSequence(object): if not isinstance(var, (framework.Variable, core.VarBase)): warning_types.add(type(var)) if warning_types: - _logger.warning( + logging_utils.warn( "Output of traced function contains non-tensor type values: {}. " "Currently, We don't support to update them while training and will return " "what we first saw. Please try to return them as tensor.". 
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py index d555c8ed28f358a43e53966dd30d76d85a03dde5..efde2481721f4f16f0ecadb1200b4abcde73e2b8 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py @@ -15,14 +15,8 @@ from __future__ import print_function import gast -import logging -from paddle.fluid import log_helper -from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper, NodeVarType, StaticAnalysisVisitor -from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code - -_logger = log_helper.get_logger( - __name__, logging.WARNING, fmt='%(asctime)s-%(levelname)s: %(message)s') +from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper, StaticAnalysisVisitor class PrintTransformer(gast.NodeTransformer): diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py index cb489af44d0adc7da377f73a3205c3c264769b4d..3b3b9bbe96f2929257d99b924af9770605b287f4 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py @@ -13,16 +13,15 @@ # limitations under the License. 
from __future__ import print_function -import gast + import collections -import logging +import gast import inspect import six import textwrap import threading -import warnings +import weakref -import gast from paddle.fluid import framework from paddle.fluid import in_dygraph_mode from paddle.fluid.dygraph import layers @@ -31,8 +30,7 @@ from paddle.fluid.layers.utils import flatten from paddle.fluid.dygraph.base import param_guard from paddle.fluid.dygraph.base import switch_to_static_graph from paddle.fluid.dygraph.dygraph_to_static import DygraphToStaticAst -from paddle.fluid.dygraph.dygraph_to_static.error import ERROR_DATA -from paddle.fluid.dygraph.dygraph_to_static.error import attach_error_data +from paddle.fluid.dygraph.dygraph_to_static import error from paddle.fluid.dygraph.dygraph_to_static import logging_utils from paddle.fluid.dygraph.dygraph_to_static.origin_info import attach_origin_info from paddle.fluid.dygraph.dygraph_to_static.origin_info import create_and_update_origin_info_map @@ -245,7 +243,8 @@ class StaticLayer(object): self._input_spec = input_spec self._function_spec = FunctionSpec(function, input_spec) self._program_cache = ProgramCache() - # Note: Hold a reference to ProgramTranslator for switching `enable_declarative`. + self._descriptor_cache = weakref.WeakKeyDictionary() + # Note: Hold a reference to ProgramTranslator for switching `enable_to_static`. self._program_trans = ProgramTranslator() def __get__(self, instance, owner): @@ -271,8 +270,19 @@ class StaticLayer(object): of `Net` instance. After decorated by `@paddle.jit.to_static`, it will firstly to call `__get__` to parse the class instance correctly instead of the `StaticLayer` instance. """ - self._class_instance = instance - return self + if instance not in self._descriptor_cache: + if instance is None: + return self + # Note(Aurelius84): To construct new instance of StaticLayer when we + # first encouter the bound function of layer and cache it. 
+ new_static_layer = self._clone() + new_static_layer._class_instance = instance + self._descriptor_cache[instance] = new_static_layer + + return self._descriptor_cache[instance] + + def _clone(self): + return self.__class__(self._dygraph_function, self._input_spec) def __call__(self, *args, **kwargs): """ @@ -287,21 +297,23 @@ class StaticLayer(object): """ # 1. call dygraph function directly if not enable `declarative` - if not self._program_trans.enable_declarative: + if not self._program_trans.enable_to_static: logging_utils.warn( - "The decorator '@paddle.jit.to_static' does NOT work when setting ProgramTranslator.enable=False. " - "We will just return dygraph output.") + "The decorator '@paddle.jit.to_static' does NOT work when setting ProgramTranslator.enable to False. " + "We will just return dygraph output. If you would like to get static graph output, please call API " + "ProgramTranslator.enable(True)") return self._call_dygraph_function(*args, **kwargs) - if not in_dygraph_mode() and self._program_trans.enable_declarative: + if not in_dygraph_mode(): raise RuntimeError( "Failed to run the callable object {} decorated by '@paddle.jit.to_static', " - "because it does NOT in dynamic mode. Please disable the static mode to enter dynamic mode with the " + "because it is NOT in dynamic mode. Please disable the static mode to enter dynamic mode with the " "following API: paddle.disable_static().".format( self.dygraph_function)) # 2. trace ops from dygraph layers and cache the generated program. args, kwargs = self._function_spec.unified_args_and_kwargs(args, kwargs) + try: concrete_program, partial_program_layer = self.get_concrete_program( *args, **kwargs) @@ -311,27 +323,22 @@ class StaticLayer(object): partial_program_layer.training = self._class_instance.training # 4. return outputs. 
- return partial_program_layer(args) + try: + return partial_program_layer(args) + except Exception as e: + if not hasattr(e, error.ERROR_DATA): + # runtime error + error.attach_error_data(e, in_runtime=True) + raise except Exception as e: - if not hasattr(e, ERROR_DATA): - # runtime error - attach_error_data(e, in_runtime=True) - error_data = getattr(e, ERROR_DATA, None) + error_data = getattr(e, error.ERROR_DATA, None) if error_data: - new_exception = error_data.create_exception() - if six.PY3: - # NOTE(liym27): - # 1. Why `raise new_exception from None`? - # In Python 3, by default, an new exception is raised with trace information of the caught exception. - # This only raises new_exception and hides unwanted implementation details from tracebacks of the - # caught exception. - # 2. Use exec to bypass syntax error checking in Python 2. - - six.exec_("raise new_exception from None") - else: - raise new_exception + error_data.raise_new_exception() else: - raise + logging_utils.warn( + "Please file an issue at 'https://github.com/PaddlePaddle/Paddle/issues'" + " if you can't handle this {} yourself.".format(type(e))) + raise e def _call_dygraph_function(self, *args, **kwargs): """ @@ -363,6 +370,7 @@ class StaticLayer(object): Returns: Traced ConcreteProgram and executable translated Layer. """ + # 1. unify args/kwargs and replace Tensor with InputSpec if len(args) != len(self._function_spec.args_name): args, kwargs = self._function_spec.unified_args_and_kwargs(args, @@ -442,7 +450,7 @@ class StaticLayer(object): format(self._function_spec)) # If more than one programs have been cached, return the recent converted program by default. elif cached_program_len > 1: - logging.warning( + logging_utils.warn( "Current {} has more than one cached programs: {}, the last traced progam will be return by default.". 
format(self._function_spec, cached_program_len)) @@ -515,6 +523,19 @@ def _switch_declarative_mode_guard_(is_declarative=True): _in_declarative_mode_ = original_val +def _verify_init_in_dynamic_mode(class_instance): + """ + Verifies the instance is initialized in dynamic mode. + """ + if isinstance(class_instance, layers.Layer): + if not class_instance._init_in_dynamic_mode: + raise RuntimeError( + " `paddle.jit.to_static` is only available in dynamic mode. Please call `paddle.disable_static()` before " + "initializing your Layer class `{}` . Because parameters of Layer class should be initialized firstly " + "in dynamic mode while applying transformation.".format( + class_instance)) + + class ConcreteProgram(object): __slots__ = [ @@ -547,6 +568,9 @@ class ConcreteProgram(object): func_spec(FunctionSpec): A FunctionSpec instance for decorated function. input_spec(list[InputSpec]): """ + # verify the instance is initialized in imperative mode. + _verify_init_in_dynamic_mode(class_instance) + # Transforms dygraph function into static function and caches it. dygraph_function = func_spec.dygraph_function static_func = convert_to_static(dygraph_function) @@ -580,7 +604,7 @@ class ConcreteProgram(object): outputs = static_func(*inputs) except BaseException as e: # NOTE: If e is raised in compile time, e should be attached to ERROR_DATA here. - attach_error_data(e) + error.attach_error_data(e) raise if not isinstance(outputs, @@ -623,7 +647,7 @@ class ProgramCache(object): # Note: raise warnings if number of traced program is more than `max_tracing_count` current_tracing_count = len(self._caches) if current_tracing_count > MAX_TRACED_PROGRAM_COUNT: - logging.warning( + logging_utils.warn( "Current traced program number: {} > `max_tracing_count`:{}. Too much cached programs will bring expensive overhead. " "The reason may be: (1) passing tensors with different shapes, (2) passing python objects instead of tensors.". 
format(current_tracing_count, MAX_TRACED_PROGRAM_COUNT)) @@ -715,15 +739,15 @@ class ProgramTranslator(object): return self._initialized = True self._program_cache = ProgramCache() - self.enable_declarative = True + self.enable_to_static = True - def enable(self, enable_declarative): + def enable(self, enable_to_static): """ Enable or disable the converting from imperative to declarative by ProgramTranslator globally. Args: - enable_declarative (bool): True or False to enable or disable declarative. + enable_to_static (bool): True or False to enable or disable declarative. Returns: None. @@ -752,9 +776,9 @@ class ProgramTranslator(object): print(func(x).numpy()) # [[2. 2.]] """ - check_type(enable_declarative, "enable_declarative", bool, + check_type(enable_to_static, "enable_to_static", bool, "ProgramTranslator.enable") - self.enable_declarative = enable_declarative + self.enable_to_static = enable_to_static def get_output(self, dygraph_func, *args, **kwargs): """ @@ -795,33 +819,44 @@ class ProgramTranslator(object): assert callable( dygraph_func ), "Input dygraph_func is not a callable in ProgramTranslator.get_output" - if not self.enable_declarative: - warnings.warn( - "The ProgramTranslator.get_output doesn't work when setting ProgramTranslator.enable = False. " - "We will just return dygraph output.") - return dygraph_func(*args, **kwargs) - - function_spec = FunctionSpec(dygraph_func) - cache_key = CacheKey.from_func_and_args(function_spec, args, kwargs, - getattr(dygraph_func, - '__self__', None)) - _, partial_program_layer = self._program_cache[cache_key] - if args and isinstance(args[0], layers.Layer): - # Synchronize self.training attribute. - partial_program_layer.training = args[0].training - args = args[1:] + if not self.enable_to_static: + logging_utils.warn( + "The ProgramTranslator.get_output doesn't work when setting ProgramTranslator.enable to False. " + "We will just return dygraph output. 
" + "Please call ProgramTranslator.enable(True) if you would like to get static output." + ) + return dygraph_func(*args, **kwargs) try: - return partial_program_layer(args) - + function_spec = FunctionSpec(dygraph_func) + cache_key = CacheKey.from_func_and_args(function_spec, args, kwargs, + getattr(dygraph_func, + '__self__', None)) + _, partial_program_layer = self._program_cache[cache_key] + + if args and isinstance(args[0], layers.Layer): + # Synchronize self.training attribute. + partial_program_layer.training = args[0].training + args = args[1:] + try: + return partial_program_layer(args) + except BaseException as e: + # NOTE: + # 1. If e is raised in compile time, e should have been attached to ERROR_DATA before; + # 2. If e raised in runtime, e should be attached to ERROR_DATA here. + if not hasattr(e, error.ERROR_DATA): + # runtime error + error.attach_error_data(e, in_runtime=True) + raise except BaseException as e: - # NOTE: - # 1. If e is raised in compile time, e should have been attached to ERROR_DATA before; - # 2. If e raised in runtime, e should be attached to ERROR_DATA here. - if not hasattr(e, ERROR_DATA): - # runtime error - attach_error_data(e, in_runtime=True) - raise + error_data = getattr(e, error.ERROR_DATA, None) + if error_data: + error_data.raise_new_exception() + else: + logging_utils.warn( + "Please file an issue at 'https://github.com/PaddlePaddle/Paddle/issues'" + " if you can't handle this {} yourself.".format(type(e))) + raise e def get_func(self, dygraph_func): """ @@ -860,10 +895,12 @@ class ProgramTranslator(object): assert callable( dygraph_func ), "Input dygraph_func is not a callable in ProgramTranslator.get_func" - if not self.enable_declarative: - warnings.warn( - "The ProgramTranslator.get_func doesn't work when setting ProgramTranslator.enable=False. 
We will " - "just return dygraph output.") + + if not self.enable_to_static: + logging_utils.warn( + "The ProgramTranslator.get_func doesn't work when setting ProgramTranslator.enable to False. We will " + "just return dygraph output. Please call ProgramTranslator.enable(True) if you would like to get static output." + ) return dygraph_func static_func = convert_to_static(dygraph_func) @@ -913,10 +950,13 @@ class ProgramTranslator(object): assert callable( dygraph_func ), "Input dygraph_func is not a callable in ProgramTranslator.get_program" - if not self.enable_declarative: - warnings.warn( - "The ProgramTranslator.get_program doesn't work when setting ProgramTranslator.enable=False." - "We will just return dygraph output.") + + if not self.enable_to_static: + logging_utils.warn( + "The ProgramTranslator.get_program doesn't work when setting ProgramTranslator.enable to False." + "We will just return dygraph output. " + "Please call ProgramTranslator.enable(True) if you would like to get static output." 
+ ) return dygraph_func(*args, **kwargs) function_spec = FunctionSpec(dygraph_func) diff --git a/python/paddle/fluid/dygraph/io.py b/python/paddle/fluid/dygraph/io.py index 7f3d450a49c7d3fcc9ca1d3c2d7c5eb732671c6c..4391843b0efb5636104973f0524131aa64751ffa 100644 --- a/python/paddle/fluid/dygraph/io.py +++ b/python/paddle/fluid/dygraph/io.py @@ -19,6 +19,7 @@ import six import pickle import numpy as np +import paddle from paddle import compat as cpt from paddle.fluid import core from paddle.fluid import framework @@ -182,9 +183,9 @@ class _ProgramHolder(object): super(_ProgramHolder, self).__init__() # input, output, persistable var info - self._input_names = [] - self._persistable_names = [] + self._input_descs = [] self._output_descs = [] + self._persistable_names = [] # execution scope self._inner_scope = core.Scope() @@ -207,11 +208,11 @@ class _ProgramHolder(object): return self._train_program_desc @property - def input_names(self): - return self._input_names + def input_descs(self): + return self._input_descs @property - def output_decs(self): + def output_descs(self): return self._output_descs @property @@ -233,7 +234,8 @@ class _ProgramHolder(object): ops_to_remove.append(i) feed_var_name = cpt.to_bytes(op.input('X')[0]) root_block._remove_var(feed_var_name) - self._input_names.append(cpt.to_bytes(op.output('Out')[0])) + self._input_descs.append( + root_block.find_var(cpt.to_bytes(op.output('Out')[0]))) elif op.type() == 'scale' and op.output('Out')[0].startswith( 'save_infer_model/scale_'): ops_to_remove.append(i) @@ -257,7 +259,7 @@ class _ProgramHolder(object): root_block._remove_op(op_idx, op_idx + 1) # 2. Input processing, reverse feed vars - self._input_names.reverse() + self._input_descs.reverse() # 3. 
Output processing, add scale for outputs tmp_program = _build_program_by_desc(program_desc) @@ -479,15 +481,28 @@ def _load_persistable_vars(model_path, var_file_path = os.path.join(model_path, params_filename) else: var_file_path = os.path.join(model_path, VARIABLE_FILENAME) - framework._dygraph_tracer().trace_op( - type='load_combine', - inputs={}, - outputs={'Out': load_var_list}, - attrs={'file_path': var_file_path}) + if not os.path.exists(var_file_path): + if len(extra_var_info) != 0: + raise ValueError("The model to be loaded is incomplete.") + else: + framework._dygraph_tracer().trace_op( + type='load_combine', + inputs={}, + outputs={'Out': load_var_list}, + attrs={'file_path': var_file_path}) return load_var_dict +# NOTE(chenweihang): to adapt paddle.load to get state_dict +def _remove_varname_suffix(var_dict, program_holder): + no_suffix_var_dict = dict() + for var_name in var_dict: + no_suffix_name = program_holder._suffix_varname_dict[var_name] + no_suffix_var_dict[no_suffix_name] = var_dict[var_name] + return no_suffix_var_dict + + def _construct_program_holders(model_path, model_filename=None): # make sure the path has been checked program_holder_dict = dict() @@ -517,7 +532,8 @@ def _construct_program_holders(model_path, model_filename=None): def _construct_params_and_buffers(model_path, programs, separate_params=False, - params_filename=None): + params_filename=None, + append_suffix=True): var_info_path = os.path.join(model_path, EXTRA_VAR_INFO_FILENAME) if os.path.exists(var_info_path): var_dict = _load_persistable_vars(model_path, var_info_path, @@ -526,6 +542,10 @@ def _construct_params_and_buffers(model_path, else: var_dict = _load_persistable_vars_by_program( model_path, programs['forward'], params_filename) + + if not append_suffix: + var_dict = _remove_varname_suffix(var_dict, programs['forward']) + return var_dict @@ -542,89 +562,92 @@ class TranslatedLayer(layers.Layer): .. 
code-block:: python import numpy as np - import paddle.fluid as fluid - from paddle.fluid.dygraph import Linear - from paddle.fluid.dygraph import declarative + import paddle + import paddle.nn as nn + import paddle.optimizer as opt - BATCH_SIZE = 32 - BATCH_NUM = 20 + BATCH_SIZE = 16 + BATCH_NUM = 4 + EPOCH_NUM = 4 - def random_batch_reader(): - def _get_random_images_and_labels(image_shape, label_shape): - image = np.random.random(size=image_shape).astype('float32') - label = np.random.random(size=label_shape).astype('int64') - return image, label + IMAGE_SIZE = 784 + CLASS_NUM = 10 - def __reader__(): - for _ in range(BATCH_NUM): - batch_image, batch_label = _get_random_images_and_labels( - [BATCH_SIZE, 784], [BATCH_SIZE, 1]) - yield batch_image, batch_label + # define a random dataset + class RandomDataset(paddle.io.Dataset): + def __init__(self, num_samples): + self.num_samples = num_samples - return __reader__ + def __getitem__(self, idx): + image = np.random.random([IMAGE_SIZE]).astype('float32') + label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64') + return image, label - class LinearNet(fluid.dygraph.Layer): - def __init__(self, in_size, out_size): + def __len__(self): + return self.num_samples + + class LinearNet(nn.Layer): + def __init__(self): super(LinearNet, self).__init__() - self._linear = Linear(in_size, out_size) + self._linear = nn.Linear(IMAGE_SIZE, CLASS_NUM) - @declarative + @paddle.jit.to_static def forward(self, x): return self._linear(x) + def train(layer, loader, loss_fn, opt): + for epoch_id in range(EPOCH_NUM): + for batch_id, (image, label) in enumerate(loader()): + out = layer(image) + loss = loss_fn(out, label) + loss.backward() + opt.step() + opt.clear_grad() + print("Epoch {} batch {}: loss = {}".format( + epoch_id, batch_id, np.mean(loss.numpy()))) + # enable dygraph mode - fluid.enable_dygraph() + place = paddle.CPUPlace() + paddle.disable_static(place) # 1. train & save model. 
- # create network - net = LinearNet(784, 1) - adam = fluid.optimizer.AdamOptimizer(learning_rate=0.1, parameter_list=net.parameters()) - # create data loader - train_loader = fluid.io.DataLoader.from_generator(capacity=5) - train_loader.set_batch_generator(random_batch_reader()) - # train - for data in train_loader(): - img, label = data - label.stop_gradient = True - cost = net(img) + # create network + layer = LinearNet() + loss_fn = nn.CrossEntropyLoss() + adam = opt.Adam(learning_rate=0.001, parameters=layer.parameters()) - loss = fluid.layers.cross_entropy(cost, label) - avg_loss = fluid.layers.mean(loss) + # create data loader + dataset = RandomDataset(BATCH_NUM * BATCH_SIZE) + loader = paddle.io.DataLoader(dataset, + places=place, + batch_size=BATCH_SIZE, + shuffle=True, + drop_last=True, + num_workers=2) - avg_loss.backward() - adam.minimize(avg_loss) - net.clear_gradients() + # train + train(layer, loader, loss_fn, adam) + # save model_path = "linear.example.model" - fluid.dygraph.jit.save( - layer=net, - model_path=model_path, - input_spec=[img]) + paddle.jit.save(layer, model_path) # 2. 
load model as TranslatedLayer - translated_layer = fluid.dygraph.jit.load(model_path) + + # load + translated_layer = paddle.jit.load(model_path) + # inference translated_layer.eval() - x = fluid.dygraph.to_variable(np.random.random((1, 784)).astype('float32')) + x = paddle.randn([1, IMAGE_SIZE], 'float32') pred = translated_layer(x) + # fine-tune translated_layer.train() - adam = fluid.optimizer.AdamOptimizer(learning_rate=0.1, parameter_list=translated_layer.parameters()) - train_loader = fluid.io.DataLoader.from_generator(capacity=5) - train_loader.set_batch_generator(random_batch_reader()) - for data in train_loader(): - img, label = data - label.stop_gradient = True + adam = opt.Adam(learning_rate=0.001, parameters=translated_layer.parameters()) + train(translated_layer, loader, loss_fn, adam) - cost = translated_layer(img) - - loss = fluid.layers.cross_entropy(cost, label) - avg_loss = fluid.layers.mean(loss) - - avg_loss.backward() - adam.minimize(avg_loss) - translated_layer.clear_gradients() """ def __init__(self, programs, persistable_vars): @@ -685,7 +708,7 @@ class TranslatedLayer(layers.Layer): # 1. load program desc & construct _ProgramHolder programs = _construct_program_holders(model_path, model_filename) - # 2. load layer parameters & parameter attributes + # 2. 
load layer parameters & buffers persistable_vars = _construct_params_and_buffers( model_path, programs, separate_params, params_filename) @@ -717,7 +740,7 @@ class TranslatedLayer(layers.Layer): if isinstance(value, np.ndarray): var = core.VarBase( value=value, - name=program_holder.input_names[i], + name=program_holder.input_descs[i].name(), persistable=False, place=framework._current_expected_place(), zero_copy=True) @@ -725,7 +748,7 @@ class TranslatedLayer(layers.Layer): var = value # NOTE: we changed var name here, # but it may be an important name set by user - var.name = program_holder.input_names[i] + var.name = program_holder.input_descs[i].name() input_vars.append(var) persistable_vars = [] @@ -741,7 +764,7 @@ class TranslatedLayer(layers.Layer): % var_name) output_vars = [] - for var_desc in program_holder.output_decs: + for var_desc in program_holder.output_descs: var = core.VarBase(var_desc.dtype(), var_desc.shape(), var_desc.name(), var_desc.type(), False) @@ -800,3 +823,144 @@ class TranslatedLayer(layers.Layer): def eval(self): self._is_test = True + + def program(self, method_name='forward'): + """ + Gets translated program of specified method. + + Args: + - method_name (string): mehtod name corresponding to the program + to be obtained. Default: 'forward'. + + Returns: + Program + + Examples: + .. 
code-block:: python + + import numpy as np + import paddle + import paddle.nn as nn + import paddle.optimizer as opt + + BATCH_SIZE = 16 + BATCH_NUM = 4 + EPOCH_NUM = 4 + + IMAGE_SIZE = 784 + CLASS_NUM = 10 + + # define a random dataset + class RandomDataset(paddle.io.Dataset): + def __init__(self, num_samples): + self.num_samples = num_samples + + def __getitem__(self, idx): + image = np.random.random([IMAGE_SIZE]).astype('float32') + label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64') + return image, label + + def __len__(self): + return self.num_samples + + class LinearNet(nn.Layer): + def __init__(self): + super(LinearNet, self).__init__() + self._linear = nn.Linear(IMAGE_SIZE, CLASS_NUM) + + @paddle.jit.to_static + def forward(self, x): + return self._linear(x) + + def train(layer, loader, loss_fn, opt): + for epoch_id in range(EPOCH_NUM): + for batch_id, (image, label) in enumerate(loader()): + out = layer(image) + loss = loss_fn(out, label) + loss.backward() + opt.step() + opt.clear_grad() + print("Epoch {} batch {}: loss = {}".format( + epoch_id, batch_id, np.mean(loss.numpy()))) + + # enable dygraph mode + place = paddle.CPUPlace() + paddle.disable_static(place) + + # create network + layer = LinearNet() + loss_fn = nn.CrossEntropyLoss() + adam = opt.Adam(learning_rate=0.001, parameters=layer.parameters()) + + # create data loader + dataset = RandomDataset(BATCH_NUM * BATCH_SIZE) + loader = paddle.io.DataLoader(dataset, + places=place, + batch_size=BATCH_SIZE, + shuffle=True, + drop_last=True, + num_workers=2) + + # train + train(layer, loader, loss_fn, adam) + + # save + model_path = "linear.example.model" + paddle.jit.save(layer, model_path) + + # load + translated_layer = paddle.jit.load(model_path) + + # get program + program = translated_layer.program() + """ + # 1. get program holder + program_holder = self._get_program_holder(method_name) + + # 2. get inference program desc + program_desc = program_holder.infer_program + + # 3. 
construct program + program = _build_program_by_desc(program_desc) + return program + + def _get_program_holder(self, method_name='forward'): + program_holder = self._program_holder_dict.get(method_name, None) + if program_holder is None: + raise ValueError( + "The method `%s` does not exist in loaded TranslatedLayer." % + method_name) + return program_holder + + def _input_spec(self, method_name='forward'): + # 1. get program holder + program_holder = self._get_program_holder(method_name) + + # 2. build input spec by input desc + input_spec = [] + for var_desc in program_holder.input_descs: + spec = paddle.static.InputSpec( + shape=var_desc.shape(), + dtype=var_desc.dtype(), + name=var_desc.name()) + input_spec.append(spec) + + return input_spec + + def _output_spec(self, method_name='forward'): + # 1. get program holder + program_holder = self._get_program_holder(method_name) + + # 2. build output spec by output desc + output_spec = [] + for var_desc in program_holder.output_descs: + # NOTE(chenweihang): InputSpec describes a tensor, not just input. + # Maybe the name is not good enough. 
Here we use InputSpec to + # construct the description of Output tensor + spec = paddle.static.InputSpec( + shape=var_desc.shape(), + dtype=var_desc.dtype(), + name=var_desc.name()) + output_spec.append(spec) + + return output_spec diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py index f67b79b91f7da235697d920cf0dfe376e88ab93e..10819e4b320dd0630c7ac43fdf89b84252823a94 100644 --- a/python/paddle/fluid/dygraph/jit.py +++ b/python/paddle/fluid/dygraph/jit.py @@ -17,6 +17,8 @@ from __future__ import print_function import os import pickle import warnings +import functools +from collections import OrderedDict import six import paddle @@ -24,6 +26,7 @@ from paddle.fluid import core from paddle.fluid.compiler import BuildStrategy, CompiledProgram, ExecutionStrategy from paddle.fluid.data_feeder import check_type from paddle.fluid.dygraph.base import program_desc_tracing_guard, switch_to_static_graph +from paddle.fluid.dygraph.dygraph_to_static import logging_utils from paddle.fluid.dygraph.dygraph_to_static.logging_utils import set_code_level, set_verbosity from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTranslator, StaticLayer, unwrap_decorators from paddle.fluid.dygraph.io import EXTRA_VAR_INFO_FILENAME, VARIABLE_FILENAME, TranslatedLayer @@ -36,7 +39,7 @@ from paddle.fluid.wrapped_decorator import wrap_decorator __all__ = [ 'TracedLayer', 'declarative', 'dygraph_to_static_func', 'set_code_level', - 'set_verbosity' + 'set_verbosity', 'save', 'load', 'SaveLoadConfig' ] @@ -117,8 +120,8 @@ def _dygraph_to_static_func_(dygraph_func): # TODO: remove this decorator after we finalize training API def __impl__(*args, **kwargs): program_translator = ProgramTranslator() - if in_dygraph_mode() or not program_translator.enable_declarative: - warnings.warn( + if in_dygraph_mode() or not program_translator.enable_to_static: + logging_utils.warn( "The decorator 'dygraph_to_static_func' doesn't work in " "dygraph mode or 
set ProgramTranslator.enable to False. " "We will just return dygraph output.") @@ -210,7 +213,16 @@ def declarative(function=None, input_spec=None): # for usage: `declarative(foo, ...)` if function is not None: - return decorated(function) + if isinstance(function, Layer): + if isinstance(function.forward, StaticLayer): + class_name = function.__class__.__name__ + logging_utils.warn( + "`{}.forward` has already been decorated somewhere. It will be redecorated to replace previous one.". + format(class_name)) + function.forward = decorated(function.forward) + return function + else: + return decorated(function) # for usage: `@declarative` return decorated @@ -228,63 +240,60 @@ class SaveLoadConfig(object): .. code-block:: python - import numpy as np - import paddle.fluid as fluid - from paddle.fluid.dygraph import Linear - from paddle.fluid.dygraph import declarative + import paddle + import paddle.nn as nn + import paddle.optimizer as opt - class SimpleNet(fluid.dygraph.Layer): + class SimpleNet(nn.Layer): def __init__(self, in_size, out_size): super(SimpleNet, self).__init__() - self._linear = Linear(in_size, out_size) + self._linear = nn.Linear(in_size, out_size) - @declarative + @paddle.jit.to_static def forward(self, x): y = self._linear(x) z = self._linear(y) return z # enable dygraph mode - fluid.enable_dygraph() + paddle.disable_static() # train model net = SimpleNet(8, 8) - adam = fluid.optimizer.AdamOptimizer(learning_rate=0.1, parameter_list=net.parameters()) - x = fluid.dygraph.to_variable(np.random.random((4, 8)).astype('float32')) + adam = opt.Adam(learning_rate=0.1, parameters=net.parameters()) + x = paddle.randn([4, 8], 'float32') for i in range(10): out = net(x) - loss = fluid.layers.mean(out) + loss = paddle.tensor.mean(out) loss.backward() - adam.minimize(loss) - net.clear_gradients() + adam.step() + adam.clear_grad() # use SaveLoadconfig when saving model model_path = "simplenet.example.model" - configs = fluid.dygraph.jit.SaveLoadConfig() - 
configs.model_filename = "__simplenet__" - fluid.dygraph.jit.save( + config = paddle.SaveLoadConfig() + config.model_filename = "__simplenet__" + paddle.jit.save( layer=net, model_path=model_path, - input_spec=[x], - configs=configs) + config=config) 2. Using ``SaveLoadConfig`` when loading model .. code-block:: python - import numpy as np - import paddle.fluid as fluid + import paddle # enable dygraph mode - fluid.enable_dygraph() + paddle.disable_static() # use SaveLoadconfig when loading model model_path = "simplenet.example.model" - configs = fluid.dygraph.jit.SaveLoadConfig() - configs.model_filename = "__simplenet__" - infer_net = fluid.dygraph.jit.load(model_path, configs=configs) + config = paddle.SaveLoadConfig() + config.model_filename = "__simplenet__" + infer_net = paddle.jit.load(model_path, config=config) # inference - x = fluid.dygraph.to_variable(np.random.random((4, 8)).astype('float32')) + x = paddle.randn([4, 8], 'float32') pred = infer_net(x) """ @@ -293,6 +302,8 @@ class SaveLoadConfig(object): self._model_filename = None self._params_filename = None self._separate_params = False + # used for `paddle.load` + self._keep_name_table = False # NOTE: Users rarely use following configs, so these configs are not open to users, # reducing user learning costs, but we retain the configuration capabilities @@ -322,51 +333,46 @@ class SaveLoadConfig(object): Examples: .. 
code-block:: python - import numpy as np - import paddle.fluid as fluid - from paddle.fluid.dygraph import Linear - from paddle.fluid.dygraph import declarative + import paddle + import paddle.nn as nn + import paddle.optimizer as opt - class SimpleNet(fluid.dygraph.Layer): + class SimpleNet(nn.Layer): def __init__(self, in_size, out_size): super(SimpleNet, self).__init__() - self._linear = Linear(in_size, out_size) + self._linear = nn.Linear(in_size, out_size) - @declarative + @paddle.jit.to_static def forward(self, x): y = self._linear(x) z = self._linear(y) - loss = fluid.layers.mean(z) + loss = paddle.tensor.mean(z) return z, loss # enable dygraph mode - fluid.enable_dygraph() + paddle.disable_static() # train model net = SimpleNet(8, 8) - adam = fluid.optimizer.AdamOptimizer(learning_rate=0.1, parameter_list=net.parameters()) - x = fluid.dygraph.to_variable(np.random.random((4, 8)).astype('float32')) + adam = opt.Adam(learning_rate=0.1, parameters=net.parameters()) + x = paddle.randn([4, 8], 'float32') for i in range(10): out, loss = net(x) loss.backward() - adam.minimize(loss) - net.clear_gradients() + adam.step() + adam.clear_grad() # use SaveLoadconfig.output_spec model_path = "simplenet.example.model.output_spec" - configs = fluid.dygraph.jit.SaveLoadConfig() - # only keep the predicted output in saved model, discard loss - configs.output_spec = [out] - - fluid.dygraph.jit.save( + config = paddle.SaveLoadConfig() + config.output_spec = [out] + paddle.jit.save( layer=net, model_path=model_path, - input_spec=[x], - configs=configs) + config=config) - infer_net = fluid.dygraph.jit.load(model_path, configs=configs) - x = fluid.dygraph.to_variable(np.random.random((4, 8)).astype('float32')) - # only have the predicted output + infer_net = paddle.jit.load(model_path) + x = paddle.randn([4, 8], 'float32') pred = infer_net(x) """ return self._output_spec @@ -393,52 +399,47 @@ class SaveLoadConfig(object): Examples: .. 
code-block:: python - import numpy as np - import paddle.fluid as fluid - from paddle.fluid.dygraph import Linear - from paddle.fluid.dygraph import declarative + import paddle + import paddle.nn as nn + import paddle.optimizer as opt - class SimpleNet(fluid.dygraph.Layer): + class SimpleNet(nn.Layer): def __init__(self, in_size, out_size): super(SimpleNet, self).__init__() - self._linear = Linear(in_size, out_size) + self._linear = nn.Linear(in_size, out_size) - @declarative + @paddle.jit.to_static def forward(self, x): y = self._linear(x) z = self._linear(y) return z # enable dygraph mode - fluid.enable_dygraph() + paddle.disable_static() # train model net = SimpleNet(8, 8) - adam = fluid.optimizer.AdamOptimizer(learning_rate=0.1, parameter_list=net.parameters()) - x = fluid.dygraph.to_variable(np.random.random((4, 8)).astype('float32')) + adam = opt.Adam(learning_rate=0.1, parameters=net.parameters()) + x = paddle.randn([4, 8], 'float32') for i in range(10): out = net(x) - loss = fluid.layers.mean(out) + loss = paddle.tensor.mean(out) loss.backward() - adam.minimize(loss) - net.clear_gradients() - - model_path = "simplenet.example.model.model_filename" - configs = fluid.dygraph.jit.SaveLoadConfig() - configs.model_filename = "__simplenet__" + adam.step() + adam.clear_grad() # saving with configs.model_filename - fluid.dygraph.jit.save( + model_path = "simplenet.example.model.model_filename" + config = paddle.SaveLoadConfig() + config.model_filename = "__simplenet__" + paddle.jit.save( layer=net, model_path=model_path, - input_spec=[x], - configs=configs) - # [result] the saved model directory contains: - # __simplenet__ __variables__ __variables.info__ + config=config) # loading with configs.model_filename - infer_net = fluid.dygraph.jit.load(model_path, configs=configs) - x = fluid.dygraph.to_variable(np.random.random((4, 8)).astype('float32')) + infer_net = paddle.jit.load(model_path, config=config) + x = paddle.randn([4, 8], 'float32') pred = infer_net(x) """ 
return self._model_filename @@ -463,52 +464,48 @@ class SaveLoadConfig(object): Examples: .. code-block:: python - import numpy as np - import paddle.fluid as fluid - from paddle.fluid.dygraph import Linear - from paddle.fluid.dygraph import declarative + import paddle + import paddle.nn as nn + import paddle.optimizer as opt - class SimpleNet(fluid.dygraph.Layer): + class SimpleNet(nn.Layer): def __init__(self, in_size, out_size): super(SimpleNet, self).__init__() - self._linear = Linear(in_size, out_size) + self._linear = nn.Linear(in_size, out_size) - @declarative + @paddle.jit.to_static def forward(self, x): y = self._linear(x) z = self._linear(y) return z # enable dygraph mode - fluid.enable_dygraph() + paddle.disable_static() # train model net = SimpleNet(8, 8) - adam = fluid.optimizer.AdamOptimizer(learning_rate=0.1, parameter_list=net.parameters()) - x = fluid.dygraph.to_variable(np.random.random((4, 8)).astype('float32')) + adam = opt.Adam(learning_rate=0.1, parameters=net.parameters()) + x = paddle.randn([4, 8], 'float32') for i in range(10): out = net(x) - loss = fluid.layers.mean(out) + loss = paddle.tensor.mean(out) loss.backward() - adam.minimize(loss) - net.clear_gradients() + adam.step() + adam.clear_grad() model_path = "simplenet.example.model.params_filename" - configs = fluid.dygraph.jit.SaveLoadConfig() - configs.params_filename = "__params__" + config = paddle.SaveLoadConfig() + config.params_filename = "__params__" # saving with configs.params_filename - fluid.dygraph.jit.save( + paddle.jit.save( layer=net, model_path=model_path, - input_spec=[x], - configs=configs) - # [result] the saved model directory contains: - # __model__ __params__ __variables.info__ + config=config) # loading with configs.params_filename - infer_net = fluid.dygraph.jit.load(model_path, configs=configs) - x = fluid.dygraph.to_variable(np.random.random((4, 8)).astype('float32')) + infer_net = paddle.jit.load(model_path, config=config) + x = paddle.randn([4, 8], 
'float32') pred = infer_net(x) """ return self._params_filename @@ -542,52 +539,50 @@ class SaveLoadConfig(object): Examples: .. code-block:: python - import numpy as np - import paddle.fluid as fluid - from paddle.fluid.dygraph import Linear - from paddle.fluid.dygraph import declarative + import paddle + import paddle.nn as nn + import paddle.optimizer as opt - class SimpleNet(fluid.dygraph.Layer): + class SimpleNet(nn.Layer): def __init__(self, in_size, out_size): super(SimpleNet, self).__init__() - self._linear = Linear(in_size, out_size) + self._linear = nn.Linear(in_size, out_size) - @declarative + @paddle.jit.to_static def forward(self, x): y = self._linear(x) z = self._linear(y) return z # enable dygraph mode - fluid.enable_dygraph() + paddle.disable_static() # train model net = SimpleNet(8, 8) - adam = fluid.optimizer.AdamOptimizer(learning_rate=0.1, parameter_list=net.parameters()) - x = fluid.dygraph.to_variable(np.random.random((4, 8)).astype('float32')) + adam = opt.Adam(learning_rate=0.1, parameters=net.parameters()) + x = paddle.randn([4, 8], 'float32') for i in range(10): out = net(x) - loss = fluid.layers.mean(out) + loss = paddle.tensor.mean(out) loss.backward() - adam.minimize(loss) - net.clear_gradients() + adam.step() + adam.clear_grad() model_path = "simplenet.example.model.separate_params" - configs = fluid.dygraph.jit.SaveLoadConfig() - configs.separate_params = True + config = paddle.jit.SaveLoadConfig() + config.separate_params = True # saving with configs.separate_params - fluid.dygraph.jit.save( + paddle.jit.save( layer=net, model_path=model_path, - input_spec=[x], - configs=configs) + config=config) # [result] the saved model directory contains: # linear_0.b_0 linear_0.w_0 __model__ __variables.info__ # loading with configs.params_filename - infer_net = fluid.dygraph.jit.load(model_path, configs=configs) - x = fluid.dygraph.to_variable(np.random.random((4, 8)).astype('float32')) + infer_net = paddle.jit.load(model_path, config=config) + 
x = paddle.randn([4, 8], 'float32') pred = infer_net(x) """ return self._separate_params @@ -600,9 +595,137 @@ class SaveLoadConfig(object): % type(value)) self._separate_params = value + @property + def keep_name_table(self): + """ + Configures whether keep ``structured_name -> parameter_name`` dict in loaded state dict. + This dict is the debugging information saved when call `paddle.save`. + It is generally only used for debugging and does not affect the actual training or inference. + By default, it will not be retained in `paddle.load` result. Default: False. + + .. note:: + Only used for ``paddle.load``. + + Examples: + .. code-block:: python + + import paddle + + paddle.disable_static() + + linear = paddle.nn.Linear(5, 1) + + state_dict = linear.state_dict() + paddle.save(state_dict, "paddle_dy") + + configs = paddle.SaveLoadConfig() + configs.keep_name_table = True + para_state_dict, _ = paddle.load("paddle_dy", configs) + + print(para_state_dict) + # the name_table is 'StructuredToParameterName@@' + # {'bias': array([0.], dtype=float32), + # 'StructuredToParameterName@@': + # {'bias': u'linear_0.b_0', 'weight': u'linear_0.w_0'}, + # 'weight': array([[ 0.04230034], + # [-0.1222527 ], + # [ 0.7392676 ], + # [-0.8136974 ], + # [ 0.01211023]], dtype=float32)} + """ + return self._keep_name_table + @keep_name_table.setter + def keep_name_table(self, value): + if not isinstance(value, bool): + raise TypeError( + "The SaveLoadConfig.keep_name_table should be bool value, but received input's type is %s." + % type(value)) + self._keep_name_table = value + + +def _get_input_var_names(inputs, input_spec): + name_none_error = "The %s's name is None. " \ + "When using jit.save, please set InputSepc's name in " \ + "to_static(input_spec=[]) and jit.save(input_spec=[]) " \ + "and make sure they are consistent." + name_no_exists_error = "The tensor `%s` does not exists. 
" \ + "Please make sure the name of InputSpec or example Tensor " \ + "in input_spec is the same as the name of InputSpec in " \ + "`to_static` decorated on the Layer.forward method." + result_list = [] + input_var_names = [var.name for var in inputs if isinstance(var, Variable)] + if input_spec is None: + # no prune + result_list = input_var_names + elif input_spec is not None and len(input_spec) == len(input_var_names): + # no prune + result_list = input_var_names + # if input spec name not in input_var_names, only raise warning + for spec in input_spec: + if spec.name is None: + warnings.warn(name_none_error % spec) + elif spec.name not in input_var_names: + warnings.warn(name_no_exists_error % spec.name) + else: + # do nothing + pass + else: + # prune + for spec in input_spec: + if spec.name is None: + # name is None, the input_spec only can be InputSpec + raise ValueError(name_none_error % spec) + elif spec.name not in input_var_names: + # the input_spec can be `InputSpec` or `VarBase` + raise ValueError(name_no_exists_error % spec.name) + else: + result_list.append(spec.name) + + return result_list + + +def _get_output_vars(outputs, output_spec): + name_no_exists_error = "The tensor `%s` does not exists. " \ + "Please make sure the name of example Tensor " \ + "in configs.output_spec is the output tensor of " \ + "Layer.forward method." 
+ result_list = [] + output_vars_dict = OrderedDict() + for var in outputs: + if isinstance(var, Variable): + output_vars_dict[var.name] = var + if output_spec is None: + result_list = output_vars_dict.values() + elif output_spec is not None and len(output_spec) == len(output_vars_dict): + result_list = output_vars_dict.values() + for var in output_spec: + if var.name not in output_vars_dict: + warnings.warn(name_no_exists_error % var.name) + else: + for var in output_spec: + if var.name not in output_vars_dict: + raise ValueError(name_no_exists_error % var.name) + else: + result_list.append(output_vars_dict[var.name]) + return result_list + + +# NOTE(chenweihang): change jit.save/load argument `configs` to `config` +def deprecate_save_load_configs(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + if 'configs' in kwargs: + kwargs['config'] = kwargs['configs'] + kwargs.pop('configs') + return func(*args, **kwargs) + + return wrapper + + +@deprecate_save_load_configs @switch_to_static_graph -def save(layer, model_path, input_spec=None, configs=None): +def save(layer, model_path, input_spec=None, config=None): """ Saves input declarative Layer as :ref:`api_imperative_TranslatedLayer` format model, which can be used for inference or fine-tuning after loading. @@ -627,7 +750,7 @@ def save(layer, model_path, input_spec=None, configs=None): It is the example inputs that will be passed to saved TranslatedLayer's forward function. If None, all input variables of the original Layer's forward function would be the inputs of the saved model. Default None. - configs (SaveLoadConfig, optional): :ref:`api_imperative_jit_saveLoadConfig` object + config (SaveLoadConfig, optional): :ref:`api_imperative_jit_saveLoadConfig` object that specifies additional configuration options. Default None. Returns: None @@ -636,120 +759,145 @@ def save(layer, model_path, input_spec=None, configs=None): .. 
code-block:: python import numpy as np - import paddle.fluid as fluid - from paddle.fluid.dygraph import Linear - from paddle.fluid.dygraph import declarative + import paddle + import paddle.nn as nn + import paddle.optimizer as opt - BATCH_SIZE = 32 - BATCH_NUM = 20 + BATCH_SIZE = 16 + BATCH_NUM = 4 + EPOCH_NUM = 4 - def random_batch_reader(): - def _get_random_images_and_labels(image_shape, label_shape): - image = np.random.random(size=image_shape).astype('float32') - label = np.random.random(size=label_shape).astype('int64') - return image, label + IMAGE_SIZE = 784 + CLASS_NUM = 10 - def __reader__(): - for _ in range(BATCH_NUM): - batch_image, batch_label = _get_random_images_and_labels( - [BATCH_SIZE, 784], [BATCH_SIZE, 1]) - yield batch_image, batch_label + # define a random dataset + class RandomDataset(paddle.io.Dataset): + def __init__(self, num_samples): + self.num_samples = num_samples - return __reader__ + def __getitem__(self, idx): + image = np.random.random([IMAGE_SIZE]).astype('float32') + label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64') + return image, label - class LinearNet(fluid.dygraph.Layer): - def __init__(self, in_size, out_size): + def __len__(self): + return self.num_samples + + class LinearNet(nn.Layer): + def __init__(self): super(LinearNet, self).__init__() - self._linear = Linear(in_size, out_size) + self._linear = nn.Linear(IMAGE_SIZE, CLASS_NUM) - @declarative + @paddle.jit.to_static def forward(self, x): return self._linear(x) + def train(layer, loader, loss_fn, opt): + for epoch_id in range(EPOCH_NUM): + for batch_id, (image, label) in enumerate(loader()): + out = layer(image) + loss = loss_fn(out, label) + loss.backward() + opt.step() + opt.clear_grad() + print("Epoch {} batch {}: loss = {}".format( + epoch_id, batch_id, np.mean(loss.numpy()))) + # enable dygraph mode - fluid.enable_dygraph() + place = paddle.CPUPlace() + paddle.disable_static(place) - # create network - net = LinearNet(784, 1) - adam = 
fluid.optimizer.AdamOptimizer(learning_rate=0.1, parameter_list=net.parameters()) - # create data loader - train_loader = fluid.io.DataLoader.from_generator(capacity=5) - train_loader.set_batch_generator(random_batch_reader()) - # train - for data in train_loader(): - img, label = data - label.stop_gradient = True + # 1. train & save model. - cost = net(img) + # create network + layer = LinearNet() + loss_fn = nn.CrossEntropyLoss() + adam = opt.Adam(learning_rate=0.001, parameters=layer.parameters()) - loss = fluid.layers.cross_entropy(cost, label) - avg_loss = fluid.layers.mean(loss) + # create data loader + dataset = RandomDataset(BATCH_NUM * BATCH_SIZE) + loader = paddle.io.DataLoader(dataset, + places=place, + batch_size=BATCH_SIZE, + shuffle=True, + drop_last=True, + num_workers=2) - avg_loss.backward() - adam.minimize(avg_loss) - net.clear_gradients() + # train + train(layer, loader, loss_fn, adam) - # save model + # save model_path = "linear.example.model" - fluid.dygraph.jit.save( - layer=net, - model_path=model_path, - input_spec=[img]) + paddle.jit.save(layer, model_path) """ - def get_inout_spec(all_vars, target_vars, return_name=False): - result_list = [] - valid_var_dict = {} - valid_vars = [var for var in all_vars if isinstance(var, Variable)] - for var in valid_vars: - valid_var_dict[var.name] = var - if target_vars: - for i, var in enumerate(target_vars): - # check target var whether exists - if var.name not in valid_var_dict: - raise RuntimeError( - "The variable to feed/fetch are not exist.") - result_list.append(valid_var_dict[var.name]) - else: - result_list = valid_vars - if return_name: - result_list = [var.name for var in result_list] - - return result_list - # 1. input check prog_translator = ProgramTranslator() - if not prog_translator.enable: + if not prog_translator.enable_to_static: raise RuntimeError( - "The paddle.jit.save doesn't work when setting ProgramTranslator.enable=False." 
+ "The paddle.jit.save doesn't work when setting ProgramTranslator.enable to False." ) if not isinstance(layer, Layer): raise TypeError( "The input layer of paddle.jit.save should be 'Layer', but received layer type is %s." % type(layer)) + configs = config if configs is None: configs = SaveLoadConfig() + # avoid change user given input_spec + inner_input_spec = None if input_spec is not None: if not isinstance(input_spec, list): raise TypeError( "The input input_spec should be 'list', but received input_spec's type is %s." % type(input_spec)) + inner_input_spec = [] for var in input_spec: - if not isinstance(var, (core.VarBase, Variable, - paddle.static.InputSpec)): + if isinstance(var, paddle.static.InputSpec): + inner_input_spec.append(var) + elif isinstance(var, (core.VarBase, Variable)): + inner_input_spec.append( + paddle.static.InputSpec.from_tensor(var)) + else: raise TypeError( "The element in input_spec list should be 'Variable' or `paddle.static.InputSpec`, but received element's type is %s." % type(var)) - # 2. get program of declarative Layer.forward - if not isinstance(layer.forward, StaticLayer): - raise RuntimeError( - "layer.forward need to be decorated by `@declarative`.") - concrete_program = layer.forward.concrete_program - - # NOTE: we maintain the mapping of variable name to + # 2. get program from Layer + # TODO(chenweihang): add support for other method, not only forward + if isinstance(layer.forward, StaticLayer): + concrete_program = layer.forward.concrete_program + else: + # transform in jit.save, if input_spec is incomplete, declarative will throw error + static_forward = declarative(layer.forward, input_spec=inner_input_spec) + concrete_program = static_forward.concrete_program + # the input_spec has been used in declarative, which is equal to + # @declarative with input_spec and jit.save without input_spec, + # avoid needless warning + inner_input_spec = None + + # 3. 
build input & output of save_infernece_model + # NOTE(chenweihang): [ Get input variables name ] + # There are two cases, whether to prune the inputs or not + # - not prune inputs (recommend): + # - the len(input_spec) == len((concrete_program.inputs) - 1 + # - here can use concrete_program.inputs directly + # - prune inputs: + # - the input_spec length < len((concrete_program.inputs) - 1 + # - the input_spec's name should be in concrete_program.inputs + input_var_names = _get_input_var_names(concrete_program.inputs, + inner_input_spec) + + # NOTE(chenweihang): [ Get output variables ] + # the rule is like [ Get input variables name ]. For output var, + # we only support VarBase spec, and actually, we only need the + # var name of output, and we don't recommended to use output_spec + output_vars = _get_output_vars(concrete_program.outputs, + configs.output_spec) + + # NOTE(chenweihang): we maintain the mapping of variable name to # structured name, the buffer variable (non-persistable) # saved to inference program may not need by dygraph Layer, # we only record the state_dict variable's structured name @@ -757,7 +905,7 @@ def save(layer, model_path, input_spec=None, configs=None): for structured_name, var in six.iteritems(layer.state_dict()): state_names_dict[var.name] = structured_name - # 3. share parameters from Layer to scope & record var info + # 4. share parameters from Layer to scope & record var info scope = core.Scope() extra_var_info = dict() for param_or_buffer in concrete_program.parameters: @@ -775,10 +923,6 @@ def save(layer, model_path, input_spec=None, configs=None): extra_info_dict['trainable'] = param_or_buffer.trainable extra_var_info[param_or_buffer.name] = extra_info_dict - # 4. build input & output spec - input_var_names = get_inout_spec(concrete_program.inputs, input_spec, True) - output_vars = get_inout_spec(concrete_program.outputs, configs.output_spec) - # 5. 
save inference model from paddle.fluid.io import save_inference_model @@ -799,7 +943,7 @@ def save(layer, model_path, input_spec=None, configs=None): export_for_deployment=configs._export_for_deployment, program_only=configs._program_only) - # NOTE: [ Save extra variable info ] + # NOTE(chenweihang): [ Save extra variable info ] # save_inference_model will lose some important variable information, including: # - Variable name and correspondence (when saved variables as one file) # - Variable.stop_gradient information @@ -819,8 +963,9 @@ def save(layer, model_path, input_spec=None, configs=None): pickle.dump(extra_var_info, f, protocol=2) +@deprecate_save_load_configs @dygraph_only -def load(model_path, configs=None): +def load(model_path, config=None): """ :api_attr: imperative @@ -837,7 +982,7 @@ def load(model_path, configs=None): Args: model_path (str): The directory path where the model is saved. - configs (SaveLoadConfig, optional): :ref:`api_imperative_jit_saveLoadConfig` object that specifies + config (SaveLoadConfig, optional): :ref:`api_imperative_jit_saveLoadConfig` object that specifies additional configuration options. Default None. Returns: @@ -849,122 +994,126 @@ def load(model_path, configs=None): .. 
code-block:: python import numpy as np - import paddle.fluid as fluid - from paddle.fluid.dygraph import Linear - from paddle.fluid.dygraph import declarative + import paddle + import paddle.nn as nn + import paddle.optimizer as opt - BATCH_SIZE = 32 - BATCH_NUM = 20 + BATCH_SIZE = 16 + BATCH_NUM = 4 + EPOCH_NUM = 4 - def random_batch_reader(): - def _get_random_images_and_labels(image_shape, label_shape): - image = np.random.random(size=image_shape).astype('float32') - label = np.random.random(size=label_shape).astype('int64') - return image, label + IMAGE_SIZE = 784 + CLASS_NUM = 10 - def __reader__(): - for _ in range(BATCH_NUM): - batch_image, batch_label = _get_random_images_and_labels( - [BATCH_SIZE, 784], [BATCH_SIZE, 1]) - yield batch_image, batch_label + # define a random dataset + class RandomDataset(paddle.io.Dataset): + def __init__(self, num_samples): + self.num_samples = num_samples - return __reader__ + def __getitem__(self, idx): + image = np.random.random([IMAGE_SIZE]).astype('float32') + label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64') + return image, label - class LinearNet(fluid.dygraph.Layer): - def __init__(self, in_size, out_size): + def __len__(self): + return self.num_samples + + class LinearNet(nn.Layer): + def __init__(self): super(LinearNet, self).__init__() - self._linear = Linear(in_size, out_size) + self._linear = nn.Linear(IMAGE_SIZE, CLASS_NUM) - @declarative + @paddle.jit.to_static def forward(self, x): return self._linear(x) + def train(layer, loader, loss_fn, opt): + for epoch_id in range(EPOCH_NUM): + for batch_id, (image, label) in enumerate(loader()): + out = layer(image) + loss = loss_fn(out, label) + loss.backward() + opt.step() + opt.clear_grad() + print("Epoch {} batch {}: loss = {}".format( + epoch_id, batch_id, np.mean(loss.numpy()))) + # enable dygraph mode - fluid.enable_dygraph() + place = paddle.CPUPlace() + paddle.disable_static(place) # 1. train & save model. 
+ # create network - net = LinearNet(784, 1) - adam = fluid.optimizer.AdamOptimizer(learning_rate=0.1, parameter_list=net.parameters()) + layer = LinearNet() + loss_fn = nn.CrossEntropyLoss() + adam = opt.Adam(learning_rate=0.001, parameters=layer.parameters()) + # create data loader - train_loader = fluid.io.DataLoader.from_generator(capacity=5) - train_loader.set_batch_generator(random_batch_reader()) - # train - for data in train_loader(): - img, label = data - label.stop_gradient = True + dataset = RandomDataset(BATCH_NUM * BATCH_SIZE) + loader = paddle.io.DataLoader(dataset, + places=place, + batch_size=BATCH_SIZE, + shuffle=True, + drop_last=True, + num_workers=2) - cost = net(img) + # train + train(layer, loader, loss_fn, adam) - loss = fluid.layers.cross_entropy(cost, label) - avg_loss = fluid.layers.mean(loss) + # save + model_path = "linear.example.model" + paddle.jit.save(layer, model_path) - avg_loss.backward() - adam.minimize(avg_loss) - net.clear_gradients() + # 2. load model - model_path = "linear.example.model" - fluid.dygraph.jit.save( - layer=net, - model_path=model_path, - input_spec=[img]) + # load + loaded_layer = paddle.jit.load(model_path) - # 2. load model & inference - # load model - infer_net = fluid.dygraph.jit.load(model_path) # inference - x = fluid.dygraph.to_variable(np.random.random((1, 784)).astype('float32')) - pred = infer_net(x) + loaded_layer.eval() + x = paddle.randn([1, IMAGE_SIZE], 'float32') + pred = loaded_layer(x) - # 3. 
load model & fine-tune - # load model - train_net = fluid.dygraph.jit.load(model_path) - train_net.train() - adam = fluid.optimizer.AdamOptimizer(learning_rate=0.1, parameter_list=train_net.parameters()) - # create data loader - train_loader = fluid.io.DataLoader.from_generator(capacity=5) - train_loader.set_batch_generator(random_batch_reader()) # fine-tune - for data in train_loader(): - img, label = data - label.stop_gradient = True - - cost = train_net(img) + loaded_layer.train() + adam = opt.Adam(learning_rate=0.001, parameters=loaded_layer.parameters()) + train(loaded_layer, loader, loss_fn, adam) - loss = fluid.layers.cross_entropy(cost, label) - avg_loss = fluid.layers.mean(loss) - - avg_loss.backward() - adam.minimize(avg_loss) - train_net.clear_gradients() 2. Load model saved by :ref:`api_fluid_io_save_inference_model` then performing and fine-tune training. .. code-block:: python import numpy as np + import paddle import paddle.fluid as fluid + import paddle.nn as nn + import paddle.optimizer as opt - BATCH_SIZE = 32 - BATCH_NUM = 20 + BATCH_SIZE = 16 + BATCH_NUM = 4 + EPOCH_NUM = 4 - def random_batch_reader(): - def _get_random_images_and_labels(image_shape, label_shape): - image = np.random.random(size=image_shape).astype('float32') - label = np.random.random(size=label_shape).astype('int64') - return image, label + IMAGE_SIZE = 784 + CLASS_NUM = 10 + + # define a random dataset + class RandomDataset(paddle.io.Dataset): + def __init__(self, num_samples): + self.num_samples = num_samples - def __reader__(): - for _ in range(BATCH_NUM): - batch_image, batch_label = _get_random_images_and_labels( - [BATCH_SIZE, 784], [BATCH_SIZE, 1]) - yield batch_image, batch_label + def __getitem__(self, idx): + image = np.random.random([IMAGE_SIZE]).astype('float32') + label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64') + return image, label - return __reader__ + def __len__(self): + return self.num_samples - img = fluid.data(name='img', shape=[None, 
784], dtype='float32') + image = fluid.data(name='image', shape=[None, 784], dtype='float32') label = fluid.data(name='label', shape=[None, 1], dtype='int64') - pred = fluid.layers.fc(input=img, size=10, act='softmax') + pred = fluid.layers.fc(input=image, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=pred, label=label) avg_loss = fluid.layers.mean(loss) @@ -975,9 +1124,15 @@ def load(model_path, configs=None): exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) - loader = fluid.io.DataLoader.from_generator( - feed_list=[img, label], capacity=5, iterable=True) - loader.set_batch_generator(random_batch_reader(), places=place) + # create data loader + dataset = RandomDataset(BATCH_NUM * BATCH_SIZE) + loader = paddle.io.DataLoader(dataset, + feed_list=[image, label], + places=place, + batch_size=BATCH_SIZE, + shuffle=True, + drop_last=True, + num_workers=2) # 1. train and save inference model for data in loader(): @@ -988,39 +1143,42 @@ def load(model_path, configs=None): model_path = "fc.example.model" fluid.io.save_inference_model( - model_path, ["img"], [pred], exe) + model_path, ["image"], [pred], exe) + + # 2. load model # enable dygraph mode - fluid.enable_dygraph() + paddle.disable_static(place) - # 2. load model & inference - fc = fluid.dygraph.jit.load(model_path) - x = fluid.dygraph.to_variable(np.random.random((1, 784)).astype('float32')) + # load + fc = paddle.jit.load(model_path) + + # inference + fc.eval() + x = paddle.randn([1, IMAGE_SIZE], 'float32') pred = fc(x) - # 3. 
load model & fine-tune - fc = fluid.dygraph.jit.load(model_path) + # fine-tune fc.train() - sgd = fluid.optimizer.SGD(learning_rate=0.001, - parameter_list=fc.parameters()) - - train_loader = fluid.io.DataLoader.from_generator(capacity=5) - train_loader.set_batch_generator( - random_batch_reader(), places=place) - - for data in train_loader(): - img, label = data - label.stop_gradient = True - - cost = fc(img) - - loss = fluid.layers.cross_entropy(cost, label) - avg_loss = fluid.layers.mean(loss) - - avg_loss.backward() - sgd.minimize(avg_loss) + loss_fn = nn.CrossEntropyLoss() + adam = opt.Adam(learning_rate=0.001, parameters=fc.parameters()) + loader = paddle.io.DataLoader(dataset, + places=place, + batch_size=BATCH_SIZE, + shuffle=True, + drop_last=True, + num_workers=2) + for epoch_id in range(EPOCH_NUM): + for batch_id, (image, label) in enumerate(loader()): + out = fc(image) + loss = loss_fn(out, label) + loss.backward() + adam.step() + adam.clear_grad() + print("Epoch {} batch {}: loss = {}".format( + epoch_id, batch_id, np.mean(loss.numpy()))) """ - return TranslatedLayer._construct(model_path, configs) + return TranslatedLayer._construct(model_path, config) @dygraph_only diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py index 1ef719b9da187be659d9c898ec996b5ad0c0d8a6..9c79deaab73ff7bde9a2414ceb67ad0d04103498 100644 --- a/python/paddle/fluid/dygraph/layers.py +++ b/python/paddle/fluid/dygraph/layers.py @@ -29,6 +29,9 @@ from .layer_object_helper import LayerObjectHelper from .base import program_desc_tracing_guard, param_guard from paddle.fluid import framework from ..param_attr import ParamAttr +from paddle.fluid.executor import Executor, global_scope +from paddle.fluid.framework import in_dygraph_mode +from paddle.fluid.framework import _current_expected_place as _get_device __all__ = ['Layer'] @@ -88,6 +91,7 @@ class Layer(core.Layer): self._helper = LayerObjectHelper(self._full_name) self._built = False 
self._dtype = dtype + self._init_in_dynamic_mode = framework.in_dygraph_mode() self._parameters = collections.OrderedDict() # Buffers the variable (not parameter) created in layer @@ -797,7 +801,7 @@ class Layer(core.Layer): raise ValueError( "super(YourLayer, self).__init__() should be called first") if len(self._loaddict_holder) > 0: - assert value.name in self._loaddict_holder, "Parameter not found, Can't not find [ {} ] in stat_dict".format( + assert value.name in self._loaddict_holder, "Parameter not found, Can't not find [ {} ] in state_dict".format( value.name) value.set_value(self._loaddict_holder[value.name]) @@ -943,12 +947,13 @@ class Layer(core.Layer): destination = destination_temp return destination - def set_dict(self, - stat_dict, - include_sublayers=True, - use_structured_name=True): + @framework.deprecate_stat_dict + def set_state_dict(self, + state_dict, + include_sublayers=True, + use_structured_name=True): ''' - Set parameters and persistable buffers from stat_dict. All the parameters and buffers will be reset by the tensor in the stat_dict + Set parameters and persistable buffers from state_dict. All the parameters and buffers will be reset by the tensor in the state_dict Parameters: state_dict(dict) : Dict contains all the parameters and persistable buffers. @@ -961,72 +966,67 @@ class Layer(core.Layer): Examples: .. 
code-block:: python - import paddle.fluid as fluid - with fluid.dygraph.guard(): - emb = fluid.dygraph.Embedding([10, 10]) + import paddle + + paddle.disable_static() + + emb = paddle.nn.Embedding([10, 10]) - state_dict = emb.state_dict() - fluid.save_dygraph( state_dict, "paddle_dy") - - para_state_dict, _ = fluid.load_dygraph( "paddle_dy") - - emb.set_dict( para_state_dict ) + state_dict = emb.state_dict() + paddle.save(state_dict, "paddle_dy") + + para_state_dict, _ = paddle.load("paddle_dy") - ''' - self.load_dict( - stat_dict, - include_sublayers=include_sublayers, - use_structured_name=use_structured_name) + emb.set_state_dict(para_state_dict) - def load_dict(self, - stat_dict, - include_sublayers=True, - use_structured_name=True): ''' - Set parameters and persistable buffers from stat_dict. All the parameters and persistabl buffers will be reset by the tensor in the stat_dict - This api will be Deprecated. Please use set_dict - - Parameters: - state_dict(dict) : Dict contains all the parameters and persistable buffers. - include_sublayers(bool, optional) : If true, also include the parameters and persistable buffers from sublayers. Default: True - use_structured_name(bool, optional) : If true, use structured name as key, otherwise, use parameter or buffer name as key. - Default: True - Returns: - None - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - with fluid.dygraph.guard(): - emb = fluid.dygraph.Embedding([10, 10]) - - state_dict = emb.state_dict() - fluid.save_dygraph( state_dict, "paddle_dy") - - para_state_dict, _ = fluid.load_dygraph( "paddle_dy") - - emb.load_dict( para_state_dict ) - - ''' - - inner_state_dict = self.state_dict() + def _check_match(key, param): + state = state_dict.get(key, None) + if state is None: + raise ValueError("{} is not found in the provided dict.".format( + key)) + if list(state.shape) != list(param.shape): + raise ValueError( + "{} receives a shape {}, but the expected shape is {}.". 
+ format(key, list(state.shape), list(param.shape))) + return param, state + + matched_param_state = [] + for key, param in self.state_dict().items(): + key_name = key if use_structured_name else param.name + try: + match_res = _check_match(key_name, param) + matched_param_state.append(match_res) + except ValueError as err: + warnings.warn(("Skip loading for {}. ".format(key) + str(err))) + + if in_dygraph_mode(): + for param, state in matched_param_state: + param.set_value(state) + else: - for name, param_or_buffer in inner_state_dict.items(): - key_name = name if use_structured_name else param_or_buffer.name - if key_name in stat_dict: - param_or_buffer.set_value(stat_dict[key_name]) - else: - raise RuntimeError( - "Parameter or persistable buffer not found, Can't find [ {} ] in stat_dict" - "use_structured_name is set to [{}]".format( - key_name, use_structured_name)) - unused_para_list = [] - for k, v in stat_dict.items(): - if k not in inner_state_dict: - unused_para_list.append(k) - if len(unused_para_list) > 0: - warnings.warn( - "Variables [ {} ] are not used, because not included in layers state_dict". 
- format(" ".join(unused_para_list))) + def _set_var(var, ndarray): + t = global_scope().find_var(var.name).get_tensor() + p = t._place() + if p.is_cpu_place(): + place = core.CPUPlace() + elif p.is_cuda_pinned_place(): + place = core.CUDAPinnedPlace() + else: + p = core.Place() + p.set_place(t._place()) + place = core.CUDAPlace(p.gpu_device_id()) + t.set(ndarray, place) + + executor = Executor(_get_device())._default_executor + # restore parameter states + core._create_loaded_parameter( + [param for param, state in matched_param_state], + global_scope(), executor) + for param, state in matched_param_state: + _set_var(param, state) + + # [aliases] Compatible with old method names + set_dict = set_state_dict + load_dict = set_state_dict diff --git a/python/paddle/fluid/dygraph/learning_rate_scheduler.py b/python/paddle/fluid/dygraph/learning_rate_scheduler.py index cce383be7e22cd066199f814db80a75367862b82..cd6af6fd5b575e8188088bde9e8944ab94c7e0f8 100644 --- a/python/paddle/fluid/dygraph/learning_rate_scheduler.py +++ b/python/paddle/fluid/dygraph/learning_rate_scheduler.py @@ -97,7 +97,7 @@ class LearningRateDecay(object): """ self.keys = ['step_num'] - def set_dict(self, state_dict): + def set_state_dict(self, state_dict): """ Loads the schedulers state. """ @@ -114,6 +114,9 @@ class LearningRateDecay(object): "There are some unused values in state_dict. 
Maybe the optimizer have different 'LearningRateDecay' when invoking state_dict and set_dict" ) + # [aliases] Compatible with old method names + set_dict = set_state_dict + def step(self): raise NotImplementedError() diff --git a/python/paddle/fluid/dygraph/math_op_patch.py b/python/paddle/fluid/dygraph/math_op_patch.py index 8c4109674200bf97354444f92f00b13e053152a0..3aa7b9dfc262810686319819f717f3cfd06b5e50 100644 --- a/python/paddle/fluid/dygraph/math_op_patch.py +++ b/python/paddle/fluid/dygraph/math_op_patch.py @@ -19,7 +19,6 @@ from ..framework import Variable, convert_np_dtype_to_dtype_, _varbase_creator from ..layers.layer_function_generator import OpProtoHolder from ..layers import common_methods from . import to_variable, no_grad -import paddle import numpy as np import six @@ -163,26 +162,6 @@ def monkey_patch_math_varbase(): def _scalar_div_(var, value): return _scalar_elementwise_op_(var, 1.0 / value, 0.0) - # TODO(shenliang03): currently, it supports divide, floor_divide, remainder - # for binary operator by using the api to achieve the type promotion - def _binary_method_creator_(op_type, reverse=False): - import paddle - - def __impl__(self, other_var): - import paddle - op = getattr(paddle, op_type) - if reverse: - return op(other_var, self) - else: - return op(self, other_var) - - __impl__.__doc__ = """ - - See paddle.{}""".format(op_type) - __impl__.__name__ = op_type - - return __impl__ - # for binary operator such as elementwise, compare def _binary_creator_(method_name, op_type, @@ -281,20 +260,22 @@ def monkey_patch_math_varbase(): ## a*b == b*a. 
Do not need to reverse explicitly ('__rmul__', _binary_creator_('__rmul__', 'elementwise_mul', False, _scalar_mul_)), + ('__div__', _binary_creator_('__div__', 'elementwise_div', False, + _scalar_div_)), + ('__truediv__', _binary_creator_('__truediv__', 'elementwise_div', + False, _scalar_div_)), + ('__rdiv__', _binary_creator_('__rdiv__', 'elementwise_div', True, + None)), ('__rtruediv__', _binary_creator_('rtruediv__', 'elementwise_div', True, None)), ('__pow__', _binary_creator_('__pow__', 'elementwise_pow', False, None)), ('__rpow__', _binary_creator_('__rpow__', 'elementwise_pow', True, None)), - # These binary use paddle.optype - ('__div__', _binary_method_creator_('divide', False)), - ('__truediv__', _binary_method_creator_('divide', False)), - ('__rtruediv__', _binary_method_creator_('divide', True)), - ('__rdiv__', _binary_method_creator_('divide', True)), - ('__floordiv__', _binary_method_creator_('floor_divide', False)), - ('__rfloordiv__', _binary_method_creator_('floor_divide', True)), - ('__mod__', _binary_method_creator_('remainder', False)), + ('__floordiv__', _binary_creator_('__floordiv__', + 'elementwise_floordiv', False, None)), + ('__mod__', _binary_creator_('__mod__', 'elementwise_mod', False, + None)), ## for logical compare ('__eq__', _binary_creator_('__eq__', 'equal', False, None)), ('__ne__', _binary_creator_('__ne__', 'not_equal', False, None)), @@ -304,7 +285,7 @@ def monkey_patch_math_varbase(): ('__ge__', _binary_creator_('__ge__', 'greater_equal', False, None)), ('__array_ufunc__', None), ('sigmoid', _method_creator_('sigmoid', 'name=None')), - ('logsigmoid', _method_creator_('logsigmoid', 'name=None')), + ('log_sigmoid', _method_creator_('logsigmoid', 'name=None')), ('exp', _method_creator_('exp', 'name=None')), ('tanh', _method_creator_('tanh', 'name=None')), ('atan', _method_creator_('atan', 'name=None')), diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index 
5ecc713ddcace7a6bed05ffa4282d9f5c1041a44..472022bced7e3e2dd11d301501ebaec75e5e412a 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -587,12 +587,13 @@ class DataParallel(layers.Layer): include_sublayers=include_sublayers, structured_name_prefix=structured_name_prefix) - def set_dict(self, - stat_dict, - include_sublayers=True, - use_structured_name=True): + @framework.deprecate_stat_dict + def set_state_dict(self, + state_dict, + include_sublayers=True, + use_structured_name=True): ''' - Set parameters of self._layers from stat_dict. All the parameters of self._layers will be reset by the tensor in the stat_dict + Set parameters of self._layers from state_dict. All the parameters of self._layers will be reset by the tensor in the state_dict Parameters: state_dict(dict) : Dict contains all the parameters @@ -605,62 +606,27 @@ class DataParallel(layers.Layer): Examples: .. code-block:: python - import paddle.fluid as fluid - with fluid.dygraph.guard(): - strategy=fluid.dygraph.prepare_context() - emb = fluid.dygraph.Embedding([10, 10]) - emb = fluid.dygraph.DataParallel(emb, strategy) - - state_dict = emb.state_dict() - fluid.save_dygraph( state_dict, "paddle_dy") - - para_state_dict, _ = fluid.load_dygraph( "paddle_dy") - - emb.set_dict( para_state_dict ) + import paddle - ''' - - self._layers.set_dict( - stat_dict, - include_sublayers=include_sublayers, - use_structured_name=use_structured_name) - - def load_dict(self, - stat_dict, - include_sublayers=True, - use_structured_name=True): - ''' - Set parameters of self._layers from stat_dict. All the parameters of self._layers will be reset by the tensor in the stat_dict - - This api will be Deprecated. Please use set_dict - - Parameters: - state_dict(dict) : Dict contains all the parameters - include_sublayers(bool, optional) : If true, also include the parameters from sublayers. 
Default: True - use_structured_name(bool, optional) : If true, use structured name as key, otherwise, use parameter name as key. - Default: True - Returns: - None + paddle.disable_static() - Examples: - .. code-block:: python + emb = paddle.nn.Embedding([10, 10]) + emb = fluid.dygraph.DataParallel(emb, strategy) - import paddle.fluid as fluid - with fluid.dygraph.guard(): - strategy=fluid.dygraph.prepare_context() - emb = fluid.dygraph.Embedding([10, 10]) - emb = fluid.dygraph.DataParallel(emb, strategy) + state_dict = emb.state_dict() + paddle.save(state_dict, "paddle_dy") - state_dict = emb.state_dict() - fluid.save_dygraph( state_dict, "paddle_dy") - - para_state_dict, _ = fluid.load_dygraph( "paddle_dy") + para_state_dict, _ = paddle.load("paddle_dy") - emb.load_dict( para_state_dict ) + emb.set_state_dict(para_state_dict) ''' - self._layers.load_dict( - stat_dict, + self._layers.set_state_dict( + state_dict, include_sublayers=include_sublayers, use_structured_name=use_structured_name) + + # [aliases] Compatible with old method names + set_dict = set_state_dict + load_dict = set_state_dict diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index fc4e91aad4fff1db325e17828d26ccd94c164c3d..797b32f5d4768af59fa4e6aceb75e4b6d9029d91 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -36,6 +36,7 @@ from . import core from . import unique_name import paddle.version as fluid_version import warnings +import functools __all__ = [ 'Program', @@ -216,7 +217,7 @@ def _dygraph_not_support_(func): def _dygraph_only_(func): def __impl__(*args, **kwargs): assert in_dygraph_mode( - ), "We Only support %s in imperative mode, please use fluid.dygraph.guard() as context to run it in imperative Mode" % func.__name__ + ), "We Only support %s in dynamic mode, please call 'paddle.disable_static()' to enter dynamic mode." 
% func.__name__ return func(*args, **kwargs) return __impl__ @@ -238,6 +239,25 @@ def _fake_interface_only_(func): return __impl__ +# NOTE(chenweihang): There is argument name typo (stat_dict, correct name is state_dict) +# in fluid api Layer.set_dict, Optimizer.load, in order to correct the argument without +# introducing compatibility issues, add this decorator +# NOTE(chenweihang): not using `wrap_decorator` here is because `wrap_decorator` will +# move kwargs to args, which doesn't work in this decorate case +def deprecate_stat_dict(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + if 'stat_dict' in kwargs: + warnings.warn( + "The argument `stat_dict` has deprecated, please change it to `state_dict`.", + DeprecationWarning) + kwargs['state_dict'] = kwargs['stat_dict'] + kwargs.pop('stat_dict') + return func(*args, **kwargs) + + return wrapper + + dygraph_not_support = wrap_decorator(_dygraph_not_support_) dygraph_only = wrap_decorator(_dygraph_only_) fake_interface_only = wrap_decorator(_fake_interface_only_) diff --git a/python/paddle/fluid/incubate/checkpoint/auto_checkpoint.py b/python/paddle/fluid/incubate/checkpoint/auto_checkpoint.py index ad51a043a0a50f89f77811adb7f95759a4f220be..a8c1656b2b03758043aec9058db317e52b45a5af 100644 --- a/python/paddle/fluid/incubate/checkpoint/auto_checkpoint.py +++ b/python/paddle/fluid/incubate/checkpoint/auto_checkpoint.py @@ -98,7 +98,7 @@ class AutoCheckpointChecker(object): self._fs_cache = os.getenv("PADDLE_EDL_FS_CACHE", ".cache") self._save_checkpoint_inter = int( - os.getenv("PADDLE_EDL_SAVE_CHECKPOINT_INTER", "900")) #s + os.getenv("PADDLE_EDL_SAVE_CHECKPOINT_INTER", "900")) # s if not self._ce_test: assert len(self._hdfs_home) > 3 and \ @@ -132,7 +132,7 @@ class AutoCheckpointChecker(object): if in_dygraph_mode(): return False - return self._run_env is not None and \ + return self._run_env is not None and \ self._platform is not None and \ self._job_id is not None and \ self._hdfs_home is not None 
and \ diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py index 216478479a7cfdcffac5f21855d0974309842c89..e348c67ae0461674358fa6d34ee8a73648862a6d 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py @@ -170,22 +170,40 @@ class CompileTimeStrategy(object): return trainer.mode == DistributedMode.ASYNC def get_role_id(self): - return self.role_maker.role_id() + try: + return self.role_maker._role_id() + except Exception: + return self.role_maker.role_id() def get_trainers(self): - return self.role_maker.worker_num() + try: + return self.role_maker._worker_num() + except Exception: + return self.role_maker.worker_num() def get_ps_endpoint(self): - return self.role_maker.get_pserver_endpoints()[self.get_role_id()] + try: + return self.role_maker._get_pserver_endpoints()[self.get_role_id()] + except Exception: + return self.role_maker.get_pserver_endpoints()[self.get_role_id()] def get_ps_endpoints(self): - return self.role_maker.get_pserver_endpoints() + try: + return self.role_maker._get_pserver_endpoints() + except Exception: + return self.role_maker.get_pserver_endpoints() def get_heter_worker_endpoints(self): - return self.role_maker._get_heter_worker_endpoints() + try: + return self.role_maker._get_heter_worker_endpoints() + except Exception: + return self.role_maker.get_heter_worker_endpoints() def get_heter_worker_endpoint(self): - return self.role_maker._get_heter_worker_endpoint() + try: + return self.role_maker._get_heter_worker_endpoint() + except Exception: + return self.role_maker.get_heter_worker_endpoint() def get_origin_programs(self): return self.origin_main_program, self.origin_startup_program diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py index 
5e6b8ca639952730f60e55718b0a8534404ca94c..4543af9820e8c9326098fa254494ca1c896d3b12 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py @@ -1,3 +1,4 @@ +# -*- coding: UTF-8 -*- # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -441,7 +442,23 @@ def find_heter_ops(program, default_device="cpu"): def create_heter_program(program, config, heter_program, heter_ops, block_var_detail, current_device): - # add heter op + + # This function mainly includes the following contents: + # 1. For every heter block: + # a) copy heter device op from origin program + # b) create variables which belong to heter op: + # -> if variable is persistable, clone it in global_scope + # -> if variable is temp, create it in heter block + # c) create communicate related op as follows: + # joint_var.0_1 -> slice -> reshape -> origin_var + # origin_var -> origin_program + # reshape -> concat -> joint_var.1_2 + # d) copy send op from origin program for var@grad which is located in current heter block + # e) re-check every op in current block if its device is not current heter device + # 2. Create send op for step counter in last heter-block + # 3. Create Listen&Serv OP for distributed training + # 4. 
update CompileTimeStrategy for heter_program + optimizer_block = [] grad_to_block_id = [] send_grad_var_list = [] @@ -453,17 +470,10 @@ def create_heter_program(program, config, heter_program, heter_ops, for _, op in enumerate(heter_block_ops): block_append_op(heter_program, program, heter_block, op) - # add relate variables - inputs = _get_input_map_from_op(program.global_block().vars, op) - add_vars_by_op_map(inputs, heter_program) - - outputs = _get_output_map_from_op(program.global_block().vars, op) - add_vars_by_op_map(outputs, heter_program) - entrance_vars = block_var_detail[index]["entrance"] - add_vars_by_var_list(entrance_vars, program, heter_program) + add_vars_by_var_list(entrance_vars, program, heter_program, heter_block) exit_vars = block_var_detail[index]["exit"] - add_vars_by_var_list(exit_vars, program, heter_program) + add_vars_by_var_list(exit_vars, program, heter_program, heter_block) comm_info = get_communicate_var_info(program, index, entrance_vars, exit_vars) @@ -471,13 +481,13 @@ def create_heter_program(program, config, heter_program, heter_ops, grad_to_block_id.append(comm_info["block_input_var_name"] + ":" + str( heter_block.idx)) - # create slice op first_op_index = 0 get_type_var_name = comm_info["input_var_reshape_name"][0].split( ".input_reshape@Heter")[0] - get_type_var = heter_program.global_block().vars[get_type_var_name] + get_type_var = heter_block.vars[get_type_var_name] + # create slice op insert_recv_slice_op( heter_program, heter_block, first_op_index, comm_info["block_input_var_name"], @@ -487,6 +497,13 @@ def create_heter_program(program, config, heter_program, heter_ops, for i in range(len(comm_info["input_var_reshape_dim"])) ]) first_op_index += len(comm_info["input_var_reshape_dim"]) + + heter_program.global_block().create_var( + name=comm_info["block_input_var_name"], + shape=(-1, sum(comm_info["input_var_reshape_dim"])), + dtype=get_type_var.dtype, + type=get_type_var.type) + # create reshape op for i in 
range(len(comm_info["input_var_reshape_name"])): var_name = entrance_vars[i] @@ -514,13 +531,14 @@ def create_heter_program(program, config, heter_program, heter_ops, comm_info["block_output_var_name"], [-1, sum(comm_info["output_var_reshape_dim"])]) check_op_device(heter_block, current_device) + + # add send op send_grad_var_list = send_grad_var_list + add_heter_send_op( program, heter_program, heter_block, block_var_detail[index]) # add step conter send_input_vars = [] dummy_output = [] - trainer_id = config.get_role_id() pserver_endpoints = config.get_ps_endpoints() optimizer_block[-1].append_op( type="send", @@ -555,7 +573,6 @@ def create_heter_program(program, config, heter_program, heter_ops, # append the listen_and_serv op heter_program.global_block().append_op( type="listen_and_serv", inputs={'X': []}, outputs={}, attrs=attrs) - check_heter_compile_time_strategy(program, config, send_grad_var_list) @@ -574,6 +591,16 @@ def check_heter_compile_time_strategy(program, config, send_grad_var_list): def create_trainer_program(program, config, heter_ops, block_var_detail): + # This function mainly includes the following contents: + # 1. For every heter block in origin program + # a) delete heter op and related variables + # b) add send&recv op + # c) add communicate ops as follows: + # origin_var -> reshape -> concat -> joint_var.0_1 + # send&recv op(send joint_var.0_1; recv joint_var.1_2) + # joint_var.1_2 -> slice -> reshape -> origin_var + # d) remove send op which related var@grad is not in trainer program + # 2. 
check every op's device for device in heter_ops.keys(): for heter_block_index in sorted(heter_ops[device]): replace_ops_by_communicate_op(program, config, heter_block_index, @@ -932,19 +959,19 @@ def insert_reshape_op(program, var_name, new_var_name, new_var_shape=None): - input_var = program.global_block().vars[var_name] + input_var = block.vars[var_name] - if new_var_name not in program.global_block().vars: - out = program.global_block().create_var( + if new_var_name not in block.vars: + out = block.create_var( name=new_var_name, shape=new_var_shape, dtype=input_var.dtype, type=input_var.type) else: - out = program.global_block().vars[new_var_name] + out = block.vars[new_var_name] new_var_shape = out.shape - x_shape = program.global_block().create_var( + x_shape = block.create_var( name="{}.xshape@Heter".format(var_name), dtype=input_var.dtype) block._insert_op( index=index, @@ -957,9 +984,7 @@ def insert_reshape_op(program, def insert_send_concat_op(program, block, index, var_name_list, new_var_name, new_var_shape): - input_var_list = [ - program.global_block().vars[var_name] for var_name in var_name_list - ] + input_var_list = [block.vars[var_name] for var_name in var_name_list] out = program.global_block().create_var( name=new_var_name, @@ -987,14 +1012,14 @@ def insert_recv_slice_op(program, block, index, var_name, var_shape, dtype, out_list = [] for i in range(len(new_var_name_list)): - if new_var_name_list[i] not in program.global_block().vars: - out = program.global_block().create_var( + if new_var_name_list[i] not in block.vars: + out = block.create_var( name=new_var_name_list[i], shape=new_var_shape_list[i], dtype=input_var.dtype, type=input_var.type) else: - out = program.global_block().vars[new_var_name_list[i]] + out = block.vars[new_var_name_list[i]] out_list.append(out) start_index = 0 @@ -1037,21 +1062,33 @@ def deleter_trainer_useless_var(program): def block_append_op(program, origin_program, block, op): - inputs = 
_get_input_map_from_op(origin_program.global_block().vars, op) + merge_ordereddict = origin_program.global_block().vars.copy() + merge_ordereddict.update(block.vars) + inputs = _get_input_map_from_op(merge_ordereddict, op) for key, varlist in six.iteritems(inputs): if not isinstance(varlist, list): varlist = [varlist] for var in varlist: - if var.name not in program.global_block().vars: - program.global_block()._clone_variable(var) + if var.name not in program.global_block( + ).vars and var.name not in block.vars: + if var.persistable: + program.global_block()._clone_variable( + var, force_persistable=False) + else: + block._clone_variable(var, force_persistable=False) outputs = _get_output_map_from_op(origin_program.global_block().vars, op) for key, varlist in six.iteritems(outputs): if not isinstance(varlist, list): varlist = [varlist] for var in varlist: - if var.name not in program.global_block().vars: - program.global_block()._clone_variable(var) + if var.name not in program.global_block( + ).vars and var.name not in block.vars: + if var.persistable: + program.global_block()._clone_variable( + var, force_persistable=False) + else: + block._clone_variable(var, force_persistable=False) if "_grad" not in op.type: # for forward op @@ -1076,21 +1113,15 @@ def block_append_op(program, origin_program, block, op): block._sync_with_cpp() -def add_vars_by_op_map(var_map, program): - for key, varlist in six.iteritems(var_map): - if not isinstance(varlist, list): - varlist = [varlist] - for i in range(len(varlist)): - var = varlist[i] - if var.name not in program.global_block().vars: - program.global_block()._clone_variable(var) - - -def add_vars_by_var_list(var_name_list, origin_program, program): +def add_vars_by_var_list(var_name_list, origin_program, program, block): for var_name in var_name_list: if var_name not in program.global_block().vars: var = origin_program.global_block().vars[var_name] - program.global_block()._clone_variable(var) + if var.persistable: + 
program.global_block()._clone_variable( + var, force_persistable=False) + else: + block._clone_variable(var, force_persistable=False) def get_varlist_from_op_map(var_map): diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/vars_metatools.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/vars_metatools.py index c8f3643b25be0780bbdfd1668d849ab00ece355c..c80b4a800bd149cced13c0e25322cd03ef94e468 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/vars_metatools.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/vars_metatools.py @@ -12,9 +12,22 @@ # See the License for the specific language governing permissions and # limitations under the License. from __future__ import print_function +from functools import reduce + from paddle.fluid.framework import Variable from paddle.fluid import core +dtype_to_size = { + core.VarDesc.VarType.FP16: 2, + core.VarDesc.VarType.FP32: 4, + core.VarDesc.VarType.FP64: 8, + core.VarDesc.VarType.INT16: 2, + core.VarDesc.VarType.INT32: 4, + core.VarDesc.VarType.INT64: 8, + core.VarDesc.VarType.BOOL: 1, + core.VarDesc.VarType.UINT8: 1, +} + class VarBlock: def __init__(self, varname, offset, size): @@ -51,11 +64,14 @@ class VarStruct(object): self.type = type self.lod_level = lod_level self.persistable = persistable + self.m_size = 1 + self.m_size = reduce(lambda x, y: x * y, shape) + self.m_size *= dtype_to_size[dtype] def __str__(self): - return "N: {}, S: {}, D: {}, T: {}, LL: {}, P: {}".format( + return "N: {}, S: {}, D: {}, T: {}, LL: {}, P: {}, M: {}".format( self.name, self.shape, self.dtype, self.type, self.lod_level, - self.persistable) + self.persistable, self.m_size) class VarDistributed(object): diff --git a/python/paddle/fluid/incubate/fleet/utils/fleet_util.py b/python/paddle/fluid/incubate/fleet/utils/fleet_util.py index cb1a54ef19899059d1a46d0807ce58bf3b5ab8b5..58313c46c3cf0d42d6e14e10d0ca91f361ce787a 100644 --- a/python/paddle/fluid/incubate/fleet/utils/fleet_util.py 
+++ b/python/paddle/fluid/incubate/fleet/utils/fleet_util.py @@ -26,8 +26,7 @@ import paddle.fluid as fluid from paddle.fluid.log_helper import get_logger from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet as fleet_pslib from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet as fleet_transpiler -from . import hdfs -from .hdfs import * +from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient from . import utils __all__ = ["FleetUtil"] diff --git a/python/paddle/fluid/incubate/fleet/utils/fs.py b/python/paddle/fluid/incubate/fleet/utils/fs.py deleted file mode 100644 index 0ba06ef934a525d3801e233c6e2f124fb0a6df52..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/incubate/fleet/utils/fs.py +++ /dev/null @@ -1,180 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import os -import sys -import subprocess -import multiprocessing -from datetime import datetime - -import re -import copy -import errno -import time -import logging -import abc -from pathlib import PurePosixPath, Path -import shutil - -__all__ = ['FS', 'LocalFS'] - - -class ExecuteError(Exception): - pass - - -class FSFileExistsError(Exception): - pass - - -class FSFileNotExistsError(Exception): - pass - - -class FSTimeOut(Exception): - pass - - -class FSShellCmdAborted(ExecuteError): - pass - - -class FS(object): - @abc.abstractmethod - def ls_dir(self, fs_path): - raise NotImplementedError - - @abc.abstractmethod - def is_file(self, fs_path): - raise NotImplementedError - - @abc.abstractmethod - def is_dir(self, fs_path): - raise NotImplementedError - - @abc.abstractmethod - def is_exist(self, fs_path): - raise NotImplementedError - - @abc.abstractmethod - def upload(self, local_path, fs_path): - raise NotImplementedError - - @abc.abstractmethod - def download(self, fs_path, local_path): - raise NotImplementedError - - @abc.abstractmethod - def mkdirs(self, fs_path): - raise NotImplementedError - - @abc.abstractmethod - def delete(self, fs_path): - raise NotImplementedError - - @abc.abstractmethod - def need_upload_download(self): - raise NotImplementedError - - @abc.abstractmethod - def rename(self, fs_src_path, fs_dst_path): - raise NotImplementedError - - @abc.abstractmethod - def mv(self, fs_src_path, fs_dst_path, overwrite=False, test_exists=False): - raise NotImplementedError - - @abc.abstractmethod - def upload_dir(self, local_dir, dest_dir): - raise NotImplementedError - - @abc.abstractmethod - def list_dirs(self, fs_path): - raise NotImplementedError - - @abc.abstractmethod - def touch(self, fs_path, exist_ok=True): - raise NotImplementedError - - -class LocalFS(FS): - def ls_dir(self, fs_path): - return [f for f in os.listdir(fs_path)] - - def mkdirs(self, fs_path): - assert not os.path.isfile(fs_path), "{} is already a file".format( - fs_path) - 
os.system("mkdir -p {}".format(fs_path)) - - def rename(self, fs_src_path, fs_dst_path): - os.rename(fs_src_path, fs_dst_path) - - def _rmr(self, fs_path): - shutil.rmtree(fs_path) - - def _rm(self, fs_path): - os.remove(fs_path) - - def delete(self, fs_path): - if not self.is_exist(fs_path): - return - - if os.path.isfile(fs_path): - return self._rm(fs_path) - - return self._rmr(fs_path) - - def need_upload_download(self): - return False - - def is_file(self, fs_path): - return os.path.isfile(fs_path) - - def is_dir(self, fs_path): - return os.path.isdir(fs_path) - - def is_exist(self, fs_path): - return os.path.exists(fs_path) - - def touch(self, fs_path, exist_ok=True): - if self.is_exist(fs_path): - if exist_ok: - return - raise FSFileExistsError - - return Path(fs_path).touch(exist_ok=True) - - def mv(self, src_path, dst_path, overwrite=False, test_exists=False): - if not self.is_exist(src_path): - raise FSFileNotExistsError - - if overwrite and self.is_exist(dst_path): - self.delete(dst_path) - - if self.is_exist(dst_path): - raise FSFileExistsError - - return self.rename(src_path, dst_path) - - def list_dirs(self, fs_path): - """ - list directory under fs_path, and only give the pure name, not include the fs_path - """ - if not self.is_exist(fs_path): - return [] - - dirs = [ - f for f in os.listdir(fs_path) if os.path.isdir(fs_path + "/" + f) - ] - - return dirs diff --git a/python/paddle/fluid/inference/__init__.py b/python/paddle/fluid/inference/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3013c1f2aff87fb293ea984c99d8336b418ee080 --- /dev/null +++ b/python/paddle/fluid/inference/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .wrapper import Config, DataType, PlaceType, PrecisionType, Tensor, Predictor + +from ..core import create_predictor, get_version, get_num_bytes_of_data_type, PredictorPool diff --git a/python/paddle/fluid/inference/wrapper.py b/python/paddle/fluid/inference/wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..96885edcc5e822beb5db8332f2b58d12b9c4ff63 --- /dev/null +++ b/python/paddle/fluid/inference/wrapper.py @@ -0,0 +1,23 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from ..core import AnalysisConfig, PaddleDType, PaddlePlace +from ..core import PaddleInferPredictor, PaddleInferTensor + +DataType = PaddleDType +PlaceType = PaddlePlace +PrecisionType = AnalysisConfig.Precision +Config = AnalysisConfig +Tensor = PaddleInferTensor +Predictor = PaddleInferPredictor diff --git a/python/paddle/fluid/install_check.py b/python/paddle/fluid/install_check.py index ef469377acfbc0c2c521de61f8eacc0f7c9f0854..51fa1677b868e59f6c8c027d849d0b6bc45aef0f 100644 --- a/python/paddle/fluid/install_check.py +++ b/python/paddle/fluid/install_check.py @@ -62,6 +62,8 @@ def run_check(): # Your Paddle Fluid works well on MUTIPLE GPU or CPU. # Your Paddle Fluid is installed successfully! Let's start deep Learning with Paddle Fluid now """ + paddle.enable_static() + print("Running Verify Fluid Program ... ") device_list = [] @@ -157,3 +159,5 @@ def run_check(): print( "Your Paddle Fluid is installed successfully ONLY for SINGLE GPU or CPU! " "\n Let's start deep Learning with Paddle Fluid now") + + paddle.disable_static() diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 6e5f7fd035acfeab975f63b0794829d57f9bb239..fe5b683bdeaa3b997cc506ad99f1a74010808f62 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -26,13 +26,13 @@ from functools import reduce import numpy as np import paddle -import paddle.reader -from paddle.reader import * from paddle.fluid import layers from paddle.fluid.executor import Executor, global_scope from paddle.fluid.evaluator import Evaluator from paddle.fluid.framework import Program, Parameter, default_main_program, default_startup_program, Variable, \ program_guard, dygraph_not_support +from paddle.reader import cache, map_readers, buffered, compose, chain, shuffle, \ + ComposeNotAligned, firstn, xmap_readers, multiprocess_reader from .wrapped_decorator import signature_safe_contextmanager from paddle.fluid.compiler import CompiledProgram from paddle.fluid.log_helper import get_logger 
@@ -62,7 +62,7 @@ __all__ = [ 'set_program_state', 'get_program_parameter', 'get_program_persistable_vars', -] + reader.__all__ + paddle.reader.__all__ +] + reader.__all__ _logger = get_logger( __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s') diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index ea6abe2d335e6669b27ba278c0faaca62ca0fdbb..bf87d1fc5a947e48845a3783fd71922641e28819 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -20,7 +20,8 @@ from __future__ import print_function from .layer_function_generator import generate_layer_fn from .layer_function_generator import autodoc, templatedoc from ..layer_helper import LayerHelper -from ..framework import Variable +from ..framework import Variable, in_dygraph_mode +from .. import core from .loss import softmax_with_cross_entropy from . import tensor from . import nn @@ -2893,8 +2894,8 @@ def generate_proposals(scores, nms_thresh=0.5, min_size=0.1, eta=1.0, - name=None, - return_rois_num=False): + return_rois_num=False, + name=None): """ :alias_main: paddle.nn.functional.generate_proposals :alias: paddle.nn.functional.generate_proposals,paddle.nn.functional.vision.generate_proposals @@ -2949,6 +2950,10 @@ def generate_proposals(scores, num of each image in one batch. The N is the image's num. For example, the tensor has values [4,5] that represents the first image has 4 Rois, the second image has 5 Rois. It only used in rcnn model. 'False' by default. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + Returns: tuple: A tuple with format ``(rpn_rois, rpn_roi_probs)``. @@ -2969,6 +2974,14 @@ def generate_proposals(scores, im_info, anchors, variances) """ + if in_dygraph_mode(): + assert return_rois_num, "return_rois_num should be True in dygraph mode." 
+ attrs = ('pre_nms_topN', pre_nms_top_n, 'post_nms_topN', post_nms_top_n, + 'nms_thresh', nms_thresh, 'min_size', min_size, 'eta', eta) + rpn_rois, rpn_roi_probs, rpn_rois_num = core.ops.generate_proposals( + scores, bbox_deltas, im_info, anchors, variances, *attrs) + return rpn_rois, rpn_roi_probs, rpn_rois_num + helper = LayerHelper('generate_proposals', **locals()) check_variable_and_dtype(scores, 'scores', ['float32'], @@ -2986,7 +2999,14 @@ def generate_proposals(scores, dtype=bbox_deltas.dtype) rpn_roi_probs = helper.create_variable_for_type_inference( dtype=scores.dtype) - rpn_rois_lod = helper.create_variable_for_type_inference(dtype='int32') + outputs = { + 'RpnRois': rpn_rois, + 'RpnRoiProbs': rpn_roi_probs, + } + if return_rois_num: + rpn_rois_num = helper.create_variable_for_type_inference(dtype='int32') + rpn_rois_num.stop_gradient = True + outputs['RpnRoisNum'] = rpn_rois_num helper.append_op( type="generate_proposals", @@ -3004,17 +3024,12 @@ def generate_proposals(scores, 'min_size': min_size, 'eta': eta }, - outputs={ - 'RpnRois': rpn_rois, - 'RpnRoiProbs': rpn_roi_probs, - 'RpnRoisLod': rpn_rois_lod - }) + outputs=outputs) rpn_rois.stop_gradient = True rpn_roi_probs.stop_gradient = True - rpn_rois_lod.stop_gradient = True if return_rois_num: - return rpn_rois, rpn_roi_probs, rpn_rois_lod + return rpn_rois, rpn_roi_probs, rpn_rois_num else: return rpn_rois, rpn_roi_probs @@ -3656,6 +3671,7 @@ def distribute_fpn_proposals(fpn_rois, max_level, refer_level, refer_scale, + rois_num=None, name=None): """ :alias_main: paddle.nn.functional.distribute_fpn_proposals @@ -3687,6 +3703,11 @@ def distribute_fpn_proposals(fpn_rois, come from. refer_level(int32): The referring level of FPN layer with specified scale. refer_scale(int32): The referring scale of FPN layer with specified level. + rois_num(Tensor): 1-D Tensor contains the number of RoIs in each image. + The shape is [B] and data type is int32. B is the number of images. 
+ If it is not None then return a list of 1-D Tensor. Each element + is the output RoIs' number of each image on the corresponding level + and the shape is [B]. None by default. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. @@ -3702,6 +3723,10 @@ def distribute_fpn_proposals(fpn_rois, the number of total rois. The data type is int32. It is used to restore the order of fpn_rois. + rois_num_per_level(List): A list of 1-D Tensor and each Tensor is + the RoIs' number in each image on the corresponding level. The shape + is [B] and data type of int32. B is the number of images + Examples: .. code-block:: python @@ -3716,26 +3741,52 @@ def distribute_fpn_proposals(fpn_rois, refer_level=4, refer_scale=224) """ + num_lvl = max_level - min_level + 1 + + if in_dygraph_mode(): + assert rois_num is not None, "rois_num should not be None in dygraph mode." + attrs = ('min_level', min_level, 'max_level', max_level, 'refer_level', + refer_level, 'refer_scale', refer_scale) + multi_rois, restore_ind, rois_num_per_level = core.ops.distribute_fpn_proposals( + fpn_rois, rois_num, num_lvl, num_lvl, *attrs) + return multi_rois, restore_ind, rois_num_per_level + check_variable_and_dtype(fpn_rois, 'fpn_rois', ['float32', 'float64'], 'distribute_fpn_proposals') helper = LayerHelper('distribute_fpn_proposals', **locals()) dtype = helper.input_dtype('fpn_rois') - num_lvl = max_level - min_level + 1 multi_rois = [ helper.create_variable_for_type_inference(dtype) for i in range(num_lvl) ] + restore_ind = helper.create_variable_for_type_inference(dtype='int32') + + inputs = {'FpnRois': fpn_rois} + outputs = { + 'MultiFpnRois': multi_rois, + 'RestoreIndex': restore_ind, + } + + if rois_num is not None: + inputs['RoisNum'] = rois_num + rois_num_per_level = [ + helper.create_variable_for_type_inference(dtype='int32') + for i in range(num_lvl) + ] + outputs['MultiLevelRoIsNum'] = rois_num_per_level + 
helper.append_op( type='distribute_fpn_proposals', - inputs={'FpnRois': fpn_rois}, - outputs={'MultiFpnRois': multi_rois, - 'RestoreIndex': restore_ind}, + inputs=inputs, + outputs=outputs, attrs={ 'min_level': min_level, 'max_level': max_level, 'refer_level': refer_level, 'refer_scale': refer_scale }) + if rois_num is not None: + return multi_rois, restore_ind, rois_num_per_level return multi_rois, restore_ind @@ -3820,6 +3871,7 @@ def collect_fpn_proposals(multi_rois, min_level, max_level, post_nms_top_n, + rois_num_per_level=None, name=None): """ :alias_main: paddle.nn.functional.collect_fpn_proposals @@ -3846,6 +3898,12 @@ def collect_fpn_proposals(multi_rois, min_level(int): The lowest level of FPN layer to collect max_level(int): The highest level of FPN layer to collect post_nms_top_n(int): The number of selected RoIs + rois_num_per_level(list, optional): The List of RoIs' numbers. + Each element is 1-D Tensor which contains the RoIs' number of each + image on each level and the shape is [B] and data type is + int32, B is the number of images. If it is not None then return + a 1-D Tensor contains the output RoIs' number of each image and + the shape is [B]. Default: None name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. @@ -3856,6 +3914,9 @@ def collect_fpn_proposals(multi_rois, fpn_rois(Variable): 2-D LoDTensor with shape [N, 4] and data type is float32 or float64. Selected RoIs. + rois_num(Tensor): 1-D Tensor contains the RoIs's number of each + image. The shape is [B] and data type is int32. B is the number of + images. Examples: .. 
code-block:: python @@ -3879,21 +3940,38 @@ def collect_fpn_proposals(multi_rois, """ check_type(multi_rois, 'multi_rois', list, 'collect_fpn_proposals') check_type(multi_scores, 'multi_scores', list, 'collect_fpn_proposals') + num_lvl = max_level - min_level + 1 + input_rois = multi_rois[:num_lvl] + input_scores = multi_scores[:num_lvl] + + if in_dygraph_mode(): + assert rois_num_per_level is not None, "rois_num_per_level should not be None in dygraph mode." + attrs = ('post_nms_topN', post_nms_top_n) + output_rois, rois_num = core.ops.collect_fpn_proposals( + input_rois, input_scores, rois_num_per_level, *attrs) + helper = LayerHelper('collect_fpn_proposals', **locals()) dtype = helper.input_dtype('multi_rois') check_dtype(dtype, 'multi_rois', ['float32', 'float64'], 'collect_fpn_proposals') - num_lvl = max_level - min_level + 1 - input_rois = multi_rois[:num_lvl] - input_scores = multi_scores[:num_lvl] output_rois = helper.create_variable_for_type_inference(dtype) output_rois.stop_gradient = True + + inputs = { + 'MultiLevelRois': input_rois, + 'MultiLevelScores': input_scores, + } + outputs = {'FpnRois': output_rois} + if rois_num_per_level is not None: + inputs['MultiLevelRoIsNum'] = rois_num_per_level + rois_num = helper.create_variable_for_type_inference(dtype='int32') + rois_num.stop_gradient = True + outputs['RoisNum'] = rois_num helper.append_op( type='collect_fpn_proposals', - inputs={ - 'MultiLevelRois': input_rois, - 'MultiLevelScores': input_scores - }, - outputs={'FpnRois': output_rois}, + inputs=inputs, + outputs=outputs, attrs={'post_nms_topN': post_nms_top_n}) + if rois_num_per_level is not None: + return output_rois, rois_num return output_rois diff --git a/python/paddle/fluid/layers/math_op_patch.py b/python/paddle/fluid/layers/math_op_patch.py index 38fc34472c8bc64338e2468bdf3f4b0bab1370ce..4595f0cf93916d71a3d0ec582af1917500d68f12 100644 --- a/python/paddle/fluid/layers/math_op_patch.py +++ b/python/paddle/fluid/layers/math_op_patch.py @@ -16,7 
+16,6 @@ from __future__ import print_function import warnings import inspect -import paddle from .. import core from ..framework import Variable, unique_name @@ -46,7 +45,6 @@ EXPRESSION_MAP = { "__pow__": "A ** B", "__rpow__": "A **= B", "__floordiv__": "A //B", - "__rfloordiv__": "A //= B", "__mod__": "A % B", "__eq__": "A == B", "__ne__": "A != B", @@ -235,25 +233,6 @@ def monkey_patch_variable(): def _scalar_div_(var, value): return _scalar_op_(var, 1.0 / value, 0.0) - # TODO(shenliang03): currently, it supports divide, floor_divide, remainder - # for binary operator by using the api to achieve the type promotion - def _binary_method_creator_(op_type, reverse=False): - import paddle - - def __impl__(self, other_var): - op = getattr(paddle, op_type) - if reverse: - return op(other_var, self) - else: - return op(self, other_var) - - __impl__.__doc__ = """ - - See paddle.{}""".format(op_type) - __impl__.__name__ = op_type - - return __impl__ - def _binary_creator_(method_name, op_type, reverse=False, @@ -360,18 +339,22 @@ def monkey_patch_variable(): # a*b == b*a. 
Do not need to reverse explicitly ('__rmul__', _binary_creator_('__rmul__', 'elementwise_mul', False, _scalar_mul_)), + ('__div__', _binary_creator_('__div__', 'elementwise_div', False, + _scalar_div_)), + ('__truediv__', _binary_creator_('__truediv__', 'elementwise_div', + False, _scalar_div_)), + ('__rdiv__', _binary_creator_('__rdiv__', 'elementwise_div', True, + None)), + ('__rtruediv__', _binary_creator_('__rtruediv__', 'elementwise_div', + True, None)), ('__pow__', _binary_creator_('__pow__', 'elementwise_pow', False, None)), ('__rpow__', _binary_creator_('__rpow__', 'elementwise_pow', True, None)), - # These binary use paddle.optype - ('__div__', _binary_method_creator_('divide', False)), - ('__rdiv__', _binary_method_creator_('divide', True)), - ('__truediv__', _binary_method_creator_('divide', False)), - ('__rtruediv__', _binary_method_creator_('divide', True)), - ('__floordiv__', _binary_method_creator_('floor_divide', False)), - ('__rfloordiv__', _binary_method_creator_('floor_divide', True)), - ('__mod__', _binary_method_creator_('remainder', False)), + ('__floordiv__', _binary_creator_('__floordiv__', + 'elementwise_floordiv', False, None)), + ('__mod__', _binary_creator_('__mod__', 'elementwise_mod', False, + None)), # for logical compare ('__eq__', _binary_creator_('__eq__', 'equal', False, None)), ('__ne__', _binary_creator_('__ne__', 'not_equal', False, None)), diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index e77f58d31f7c49ebb315523e6a071136090dece2..3e7d10f8d1a02126c3d4bec490fcd2f3194123ee 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -3167,7 +3167,7 @@ def instance_norm(input, param_shape = [channel_num] - if param_attr and bias_attr: + if param_attr != False and bias_attr != False: # create parameter scale = helper.create_parameter( attr=helper.param_attr, @@ -3190,7 +3190,7 @@ def instance_norm(input, instance_norm_out = 
helper.create_variable_for_type_inference(dtype) inputs = {"X": input} - if param_attr and bias_attr: + if param_attr != False and bias_attr != False: inputs["Scale"] = scale inputs["Bias"] = bias @@ -4815,11 +4815,6 @@ def split(input, num_or_sections, dim=-1, name=None): Returns: list(Tensor): The list of segmented Tensors. - Raises: - TypeError: The data type of ``input`` must be one of bool, float16, float32, float64, int32, int64. - TypeError: ``num_or_sections`` is not int, list or tuple. - TypeError: ``dim`` is not int or Tensor. The data type of ``dim`` must be int32 or int64 when it's a Tensor. - Example: .. code-block:: python @@ -6103,11 +6098,6 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None): Returns: Tensor: A reshaped Tensor with the same data type as ``x``. It is a new tensor variable if ``inplace`` is ``False``, otherwise it is ``x``. If ``act`` is None, return the reshaped tensor variable, otherwise return the activated tensor variable. - Raises: - TypeError: If actual_shape is neither Tensor nor None. - ValueError: If more than one elements of ``shape`` is -1. - ValueError: If the element of ``shape`` is 0, the corresponding dimension should be less than or equal to the dimension of ``x``. - ValueError: If the elements in ``shape`` is negative except -1. Examples: .. 
code-block:: python @@ -6316,6 +6306,15 @@ def unsqueeze(input, axes, name=None): """ if in_dygraph_mode(): + if isinstance(axes, int): + axes = [axes] + elif isinstance(axes, Variable): + axes = axes.numpy().tolist() + elif isinstance(axes, (list, tuple)): + axes = [ + item.numpy().item(0) if isinstance(item, Variable) else item + for item in axes + ] out, _ = core.ops.unsqueeze2(input, 'axes', axes) return out @@ -6863,7 +6862,8 @@ def roi_pool(input, pooled_height=1, pooled_width=1, spatial_scale=1.0, - rois_lod=None): + rois_num=None, + name=None): """ :alias_main: paddle.nn.functional.roi_pool :alias: paddle.nn.functional.roi_pool,paddle.nn.functional.vision.roi_pool @@ -6883,10 +6883,14 @@ def roi_pool(input, Args: input (Variable): Input feature, 4D-Tensor with the shape of [N,C,H,W], where N is the batch size, C is the input channel, H is Height, W is weight. The data type is float32 or float64. rois (Variable): ROIs (Regions of Interest) to pool over. 2D-LoDTensor with the shape of [num_rois,4], the lod level is 1. Given as [[x1, y1, x2, y2], ...], (x1, y1) is the top left coordinates, and (x2, y2) is the bottom right coordinates. - rois_lod (Variable): The lod info of rois. Default: None pooled_height (int, optional): The pooled output height, data type is int32. Default: 1 pooled_width (int, optional): The pooled output height, data type is int32. Default: 1 spatial_scale (float, optional): Multiplicative spatial scale factor to translate ROI coords from their input scale to the scale used when pooling. Default: 1.0 + rois_num (Tensor): The number of RoIs in each image. Default: None + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + Returns: Variable: The pooled feature, 4D-Tensor with the shape of [num_rois, C, pooled_height, pooled_width]. 
@@ -6906,11 +6910,11 @@ def roi_pool(input, input_data = np.array([i for i in range(1,17)]).reshape(1,1,4,4).astype(DATATYPE) roi_data =fluid.create_lod_tensor(np.array([[1., 1., 2., 2.], [1.5, 1.5, 3., 3.]]).astype(DATATYPE),[[2]], place) - rois_lod_data = np.array([0, 2]) + rois_num_data = np.array([2]).astype('int32') x = fluid.data(name='input', shape=[None,1,4,4], dtype=DATATYPE) rois = fluid.data(name='roi', shape=[None,4], dtype=DATATYPE) - rois_lod = fluid.data(name='rois_lod', shape=[None], dtype='int64') + rois_num = fluid.data(name='rois_num', shape=[None], dtype='int32') pool_out = fluid.layers.roi_pool( input=x, @@ -6918,24 +6922,36 @@ def roi_pool(input, pooled_height=1, pooled_width=1, spatial_scale=1.0, - rois_lod=rois_lod) + rois_num=rois_num) exe = fluid.Executor(place) - out, = exe.run(feed={'input':input_data ,'roi':roi_data, 'rois_lod': rois_lod_data}, fetch_list=[pool_out.name]) + out, = exe.run(feed={'input':input_data ,'roi':roi_data, 'rois_num': rois_num_data}, fetch_list=[pool_out.name]) print(out) #array([[[[11.]]], [[[16.]]]], dtype=float32) print(np.array(out).shape) # (2, 1, 1, 1) """ + if in_dygraph_mode(): + assert rois_num is not None, "rois_num should not be None in dygraph mode." 
+ pool_out, argmaxes = core.ops.roi_pool( + input, rois, rois_num, "pooled_height", pooled_height, + "pooled_width", pooled_width, "spatial_scale", spatial_scale) + return pool_out, argmaxes + check_variable_and_dtype(input, 'input', ['float32'], 'roi_pool') check_variable_and_dtype(rois, 'rois', ['float32'], 'roi_pool') helper = LayerHelper('roi_pool', **locals()) dtype = helper.input_dtype() pool_out = helper.create_variable_for_type_inference(dtype) argmaxes = helper.create_variable_for_type_inference(dtype='int32') + + inputs = { + "X": input, + "ROIs": rois, + } + if rois_num is not None: + inputs['RoisNum'] = rois_num helper.append_op( type="roi_pool", - inputs={"X": input, - "ROIs": rois, - "RoisLod": rois_lod}, + inputs=inputs, outputs={"Out": pool_out, "Argmax": argmaxes}, attrs={ @@ -6953,8 +6969,8 @@ def roi_align(input, pooled_width=1, spatial_scale=1.0, sampling_ratio=-1, - name=None, - rois_lod=None): + rois_num=None, + name=None): """ :alias_main: paddle.nn.functional.roi_align :alias: paddle.nn.functional.roi_align,paddle.nn.functional.vision.roi_align @@ -6969,11 +6985,11 @@ def roi_align(input, data type is float32 or float64. Given as [[x1, y1, x2, y2], ...], (x1, y1) is the top left coordinates, and (x2, y2) is the bottom right coordinates. - rois_lod (Variable): The lod info of rois. Default: None pooled_height (int32, optional): ${pooled_height_comment} Default: 1 pooled_width (int32, optional): ${pooled_width_comment} Default: 1 spatial_scale (float32, optional): ${spatial_scale_comment} Default: 1.0 sampling_ratio(int32, optional): ${sampling_ratio_comment} Default: -1 + rois_num (Tensor): The number of RoIs in each image. Default: None name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. 
@@ -6992,26 +7008,38 @@ def roi_align(input, name='data', shape=[None, 256, 32, 32], dtype='float32') rois = fluid.data( name='rois', shape=[None, 4], dtype='float32') - rois_lod = fluid.data(name='rois_lod', shape=[None], dtype='int64') + rois_num = fluid.data(name='rois_num', shape=[None], dtype='int32') align_out = fluid.layers.roi_align(input=x, rois=rois, pooled_height=7, pooled_width=7, spatial_scale=0.5, sampling_ratio=-1, - rois_lod=rois_lod) + rois_num=rois_num) """ + if in_dygraph_mode(): + assert rois_num is not None, "rois_num should not be None in dygraph mode." + align_out = core.ops.roi_align( + input, rois, rois_num, "pooled_height", pooled_height, + "pooled_width", pooled_width, "spatial_scale", spatial_scale, + "sampling_ratio", sampling_ratio) + return align_out + check_variable_and_dtype(input, 'input', ['float32', 'float64'], 'roi_align') check_variable_and_dtype(rois, 'rois', ['float32', 'float64'], 'roi_align') helper = LayerHelper('roi_align', **locals()) dtype = helper.input_dtype() align_out = helper.create_variable_for_type_inference(dtype) + inputs = { + "X": input, + "ROIs": rois, + } + if rois_num is not None: + inputs['RoisNum'] = rois_num helper.append_op( type="roi_align", - inputs={"X": input, - "ROIs": rois, - "RoisLod": rois_lod}, + inputs=inputs, outputs={"Out": align_out}, attrs={ "pooled_height": pooled_height, @@ -8256,10 +8284,6 @@ def gather(input, index, overwrite=True): Returns: output (Tensor): The output is a tensor with the same rank as input. - Raises: - TypeError: ``x`` must be a Tensor and the data type of ``x`` must to be one of float16, float32, float64, int32, int64, uint8. - TypeError: ``index`` must be a Tensor and the data type of ``index`` must be int32 or int64. - Examples: .. 
code-block:: python @@ -8349,10 +8373,6 @@ def gather_nd(input, index, name=None): Returns: output (Tensor): A tensor with the shape index.shape[:-1] + input.shape[index.shape[-1]:] - - Raises: - TypeError: ``input`` must be a Tensor and the data type of ``input`` must be one of float32, float64, int32 and int64. - TypeError: ``index`` must be a Tensor and the data type of ``index`` must be one of int32 and int64. Examples: @@ -10018,15 +10038,16 @@ def stack(x, axis=0, name=None): Args: - x (Variable|list(Variable)): Input :code:`x` can be a single Tensor, a :code:`list` of Tensors. - If :code:`x` is a :code:`list`, the shapes of all these Tensors + x (list(Variable)|tuple(Variable)): Input :code:`x` can be a :code:`list` or :code:`tuple` of Tensors, the shapes of all these Tensors must be the same. Supposing input is N dims Tensors :math:`[d_0, d_1, ..., d_{n-1}]`, the output is N+1 dims Tensor :math:`[d_0, d_1, d_{axis-1}, len(x), d_{axis}, ..., d_{n-1}]`. Supported data types: float32, float64, int32, int64. - axis (int, optional): The axis along which all inputs are stacked. ``axis`` range is :math:`[-(R+1), R+1)`. - R is the first tensor of inputs. If ``axis`` < 0, :math:`axis=axis+rank(x[0])+1`. - The default value of axis is 0. + axis (int, optional): The axis along which all inputs are stacked. ``axis`` range is ``[-(R+1), R+1)``, + where ``R`` is the number of dimensions of the first input tensor ``x[0]``. + If ``axis < 0``, ``axis = axis+R+1``. The default value of axis is 0. + name (str, optional): Please refer to :ref:`api_guide_Name`, Default None. + Returns: Variable: The stacked Tensor, has same data type with input Tensors. Output dim is :math:`rank(x[0])+1`. 
@@ -10044,18 +10065,27 @@ def stack(x, axis=0, name=None): data = layers.stack([x1,x2], axis=1) # stack according to axis 1, data.shape=[None, 2, 1, 2] - # stack single Tensor - data = layers.stack(x1) # stack according to axis 0, data.shape=[1, None, 1, 2] """ axis = 0 if axis is None else axis - if not isinstance(x, list) and not isinstance(x, tuple): - x = [x] if in_dygraph_mode(): return core.ops.stack(x, 'axis', axis) + if not isinstance(x, list) and not isinstance(x, tuple): + # NOTE:(zhiqiu) Only support Variable as input if the Variable is a LOD_TENSOR_ARRAY create by create_array, array_write, array_read, etc. + # In that case, Variable is array of tensors indeed. + if isinstance(x, Variable) and x.desc.type( + ) == core.VarDesc.VarType.LOD_TENSOR_ARRAY: + x = [x] + else: + raise TypeError("The type of '%s' in %s must be %s, but received %s" + % ('x', 'stack', + 'list[Tensor], tuple[Tensor] or TensorArray', + type(x))) + helper = LayerHelper('stack', **locals()) + out = helper.create_variable_for_type_inference(x[0].dtype) if x[0].desc.type() == core.VarDesc.VarType.LOD_TENSOR_ARRAY: assert len(x) == 1, "If the elements of 'x' in stack are Variable(LoDTensorArray), " \ @@ -10600,7 +10630,7 @@ def gaussian_random(shape, dtype = convert_np_dtype_to_dtype_(dtype) if in_dygraph_mode(): - shape = utils._convert_shape_to_list(shape) + shape = utils.convert_shape_to_list(shape) return core.ops.gaussian_random('shape', shape, 'mean', float(mean), 'std', float(std), 'seed', seed, 'dtype', @@ -10617,7 +10647,7 @@ def gaussian_random(shape, 'dtype': dtype, 'use_mkldnn': False } - utils._get_shape_tensor_inputs( + utils.get_shape_tensor_inputs( inputs=inputs, attrs=attrs, shape=shape, @@ -10849,8 +10879,7 @@ def slice(input, axes, starts, ends): result = [ [2, 3, 4], ] # result = data[0:1, 1:4] Args: input (Variable): A ``Tensor`` or ``LoDTensor`` . The data type is ``float16``, ``float32``, ``float64``, ``int32`` or ``int64``. 
- axes (list|tuple): The data type is ``int32`` . Axes that `starts` and `ends` apply to. - It's optional. If it is not provides, it will be treated as :math:`[0,1,...,len(starts)-1]`. + axes (list|tuple): The data type is ``int32`` . Axes that `starts` and `ends` apply to . starts (list|tuple|Variable): The data type is ``int32`` . If ``starts`` is a list or tuple, the elements of it should be integers or Tensors with shape [1]. If ``starts`` is an Variable, it should be an 1-D Tensor. It represents starting indices of corresponding axis in ``axes``. @@ -11200,7 +11229,7 @@ def shape(input): input.shape = [3, 2] Args: - input (Variable): The input can be N-D Tensor or SelectedRows with data type float16, float32, float64, int32, int64. + input (Variable): The input can be N-D Tensor or SelectedRows with data type bool, float16, float32, float64, int32, int64. If input variable is type of SelectedRows, returns the shape of it's inner tensor. Returns: @@ -11224,8 +11253,8 @@ def shape(input): print(res) # [array([ 3, 100, 100], dtype=int32)] """ check_variable_and_dtype( - input, 'input', ['float16', 'float32', 'float64', 'int32', 'int64'], - 'shape') + input, 'input', + ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], 'shape') helper = LayerHelper('shape', **locals()) out = helper.create_variable_for_type_inference(dtype='int32') helper.append_op( @@ -12031,6 +12060,8 @@ for func in [ elementwise_floordiv, ]: op_proto = OpProtoHolder.instance().get_op_proto(func.__name__) + + # insert the c++ doc string on top of python doc string func.__doc__ = _generate_doc_string_( op_proto, additional_args_lines=[ @@ -12048,6 +12079,16 @@ for func in [ "mkldnn_data_type", "Scale_x", "Scale_y", "Scale_out" }) + """\n""" + str(func.__doc__) + doc_list = func.__doc__.splitlines() + + for idx, val in enumerate(doc_list): + if val.startswith("Warning: ") and val.endswith( + " instead." + ) and "and will be removed in future versions." 
in val: + doc_list.insert(0, doc_list.pop(idx)) + func.__doc__ = "\n" + "\n".join(i for i in doc_list) + break + for func in []: op_proto = OpProtoHolder.instance().get_op_proto(func.__name__) func.__doc__ = _generate_doc_string_( @@ -12153,13 +12194,10 @@ def logical_and(x, y, out=None, name=None): .. code-block:: python import paddle - import numpy as np paddle.disable_static() - x_data = np.array([True], dtype=np.bool) - y_data = np.array([True, False, True, False], dtype=np.bool) - x = paddle.to_tensor(x_data) - y = paddle.to_tensor(y_data) + x = paddle.to_tensor([True]) + y = paddle.to_tensor([True, False, True, False]) res = paddle.logical_and(x, y) print(res.numpy()) # [True False True False] """ @@ -12272,11 +12310,9 @@ def logical_not(x, out=None, name=None): Examples: .. code-block:: python import paddle - import numpy as np paddle.disable_static() - x_data = np.array([True, False, True, False], dtype=np.bool) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([True, False, True, False]) res = paddle.logical_not(x) print(res.numpy()) # [False True False True] """ @@ -15094,7 +15130,7 @@ def uniform_random(shape, dtype='float32', min=-1.0, max=1.0, seed=0, dtype = convert_np_dtype_to_dtype_(dtype) if in_dygraph_mode(): - shape = utils._convert_shape_to_list(shape) + shape = utils.convert_shape_to_list(shape) return core.ops.uniform_random('shape', shape, 'min', float(min), 'max', float(max), 'seed', seed, 'dtype', dtype) @@ -15104,7 +15140,7 @@ def uniform_random(shape, dtype='float32', min=-1.0, max=1.0, seed=0, inputs = dict() attrs = {'seed': seed, 'min': min, 'max': max, 'dtype': dtype} - utils._get_shape_tensor_inputs( + utils.get_shape_tensor_inputs( inputs=inputs, attrs=attrs, shape=shape, op_type='uniform_random/rand') helper = LayerHelper("uniform_random", **locals()) diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index 84cacea6ba5723f8a06fc87fa9c59d96f802e65a..6cdc617a0dc17ae9f0893083285c404ca73712f7 
100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -20,7 +20,10 @@ from ..framework import convert_np_dtype_to_dtype_, Variable from ..data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype from paddle.utils import deprecated -__deprecated_func_name__ = {'tanh_shrink': 'tanhshrink', } +__deprecated_func_name__ = { + 'tanh_shrink': 'tanhshrink', + 'logsigmoid': 'log_sigmoid' +} __activations_noattr__ = [ 'sigmoid', @@ -86,13 +89,11 @@ add_sample_code(globals()["sigmoid"], r""" Examples: .. code-block:: python - import numpy as np import paddle import paddle.nn.functional as F paddle.disable_static() - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = F.sigmoid(x) print(out.numpy()) # [0.40131234 0.450166 0.52497919 0.57444252] @@ -103,14 +104,12 @@ add_sample_code(globals()["logsigmoid"], r""" Examples: .. code-block:: python - import numpy as np import paddle import paddle.nn.functional as F paddle.disable_static() - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_variable(x_data) - out = F.logsigmoid(x) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = F.log_sigmoid(x) print(out.numpy()) # [-0.91301525 -0.79813887 -0.64439666 -0.55435524] @@ -120,12 +119,10 @@ add_sample_code(globals()["exp"], r""" Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = paddle.exp(x) print(out.numpy()) # [0.67032005 0.81873075 1.10517092 1.34985881] @@ -136,12 +133,10 @@ add_sample_code(globals()["tanh"], r""" Examples: .. 
code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = paddle.tanh(x) print(out.numpy()) # [-0.37994896 -0.19737532 0.09966799 0.29131261] @@ -152,12 +147,10 @@ add_sample_code(globals()["atan"], r""" Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = paddle.atan(x) print(out.numpy()) # [-0.38050638 -0.19739556 0.09966865 0.29145679] @@ -170,11 +163,10 @@ Examples: import paddle import paddle.nn.functional as F - import numpy as np paddle.disable_static() - x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3])) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = F.tanhshrink(x) # [-0.020051, -0.00262468, 0.000332005, 0.00868739] """) @@ -183,12 +175,10 @@ add_sample_code(globals()["sqrt"], r""" Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([0.1, 0.2, 0.3, 0.4]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([0.1, 0.2, 0.3, 0.4]) out = paddle.sqrt(x) print(out.numpy()) # [0.31622777 0.4472136 0.54772256 0.63245553] @@ -199,12 +189,10 @@ add_sample_code(globals()["rsqrt"], r""" Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([0.1, 0.2, 0.3, 0.4]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([0.1, 0.2, 0.3, 0.4]) out = paddle.rsqrt(x) print(out.numpy()) # [3.16227766 2.23606798 1.82574186 1.58113883] @@ -215,12 +203,10 @@ add_sample_code(globals()["abs"], r""" Examples: .. 
code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = paddle.abs(x) print(out.numpy()) # [0.4 0.2 0.1 0.3] @@ -231,12 +217,10 @@ add_sample_code(globals()["ceil"], r""" Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = paddle.ceil(x) print(out.numpy()) # [-0. -0. 1. 1.] @@ -247,12 +231,10 @@ add_sample_code(globals()["floor"], r""" Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = paddle.floor(x) print(out.numpy()) # [-1. -1. 0. 0.] @@ -263,12 +245,10 @@ add_sample_code(globals()["cos"], r""" Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = paddle.cos(x) print(out.numpy()) # [0.92106099 0.98006658 0.99500417 0.95533649] @@ -279,12 +259,10 @@ add_sample_code(globals()["acos"], r""" Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = paddle.acos(x) print(out.numpy()) # [1.98231317 1.77215425 1.47062891 1.26610367] @@ -295,12 +273,10 @@ add_sample_code(globals()["sin"], r""" Examples: .. 
code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = paddle.sin(x) print(out.numpy()) # [-0.38941834 -0.19866933 0.09983342 0.29552021] @@ -311,12 +287,10 @@ add_sample_code(globals()["asin"], r""" Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = paddle.asin(x) print(out.numpy()) # [-0.41151685 -0.20135792 0.10016742 0.30469265] @@ -327,12 +301,10 @@ add_sample_code(globals()["cosh"], r""" Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = paddle.cosh(x) print(out.numpy()) # [1.08107237 1.02006676 1.00500417 1.04533851] @@ -343,12 +315,10 @@ add_sample_code(globals()["sinh"], r""" Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = paddle.sinh(x) print(out.numpy()) # [-0.41075233 -0.201336 0.10016675 0.30452029] @@ -359,12 +329,10 @@ add_sample_code(globals()["round"], r""" Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([-0.5, -0.2, 0.6, 1.5]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([-0.5, -0.2, 0.6, 1.5]) out = paddle.round(x) print(out.numpy()) # [-1. -0. 1. 2.] @@ -375,12 +343,10 @@ add_sample_code(globals()["reciprocal"], r""" Examples: .. 
code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = paddle.reciprocal(x) print(out.numpy()) # [-2.5 -5. 10. 3.33333333] @@ -391,12 +357,10 @@ add_sample_code(globals()["square"], r""" Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_variable(x_data) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = paddle.square(x) print(out.numpy()) # [0.16 0.04 0.01 0.09] @@ -409,11 +373,10 @@ Examples: import paddle import paddle.nn.functional as F - import numpy as np paddle.disable_static() - x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3])) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = F.softplus(x) # [0.513015, 0.598139, 0.744397, 0.854355] """) @@ -424,11 +387,10 @@ Examples: import paddle import paddle.nn.functional as F - import numpy as np paddle.disable_static() - x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3])) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = F.softsign(x) # [-0.285714, -0.166667, 0.0909091, 0.230769] """) @@ -761,11 +723,9 @@ Examples: .. 
code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_tensor(x_data) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = paddle.erf(x) print(out.numpy()) # [-0.42839236 -0.22270259 0.11246292 0.32862676] diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index 77a78eb4a14a0a5ad9be9cff71131ca473106ab8..2fba578ec077f2a74388d433bf3ab5b3098e81ad 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -29,13 +29,14 @@ from ..data_feeder import check_variable_and_dtype, check_type, check_dtype, con from paddle.utils import deprecated import numpy import warnings +from .utils import check_shape __all__ = [ 'create_tensor', 'create_parameter', 'create_global_var', 'cast', 'tensor_array_to_tensor', 'concat', 'sums', 'assign', 'fill_constant_batch_size_like', 'fill_constant', 'argmin', 'argmax', 'argsort', 'ones', 'zeros', 'reverse', 'has_inf', 'has_nan', 'isfinite', - 'range', 'linspace', 'zeros_like', 'ones_like', 'diag', 'eye' + 'range', 'linspace', 'zeros_like', 'ones_like', 'diag', 'eye', 'triu' ] @@ -276,11 +277,6 @@ def concat(input, axis=0, name=None): name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. - Raises: - TypeError: ``input`` must be one of list, tuple or Tensor. - TypeError: The data type of ``input`` must be one of bool, float16, float32, float64, int32 and int64. - TypeError: The ``axis`` must be int or Tensor. The dtype of ``axis`` must be int32 or int64 when it's a Tensor. - TypeError: All the Tensors in ``input`` must have the same data type. Returns: Tensor: A Tensor with the same data type as ``input``. @@ -657,12 +653,6 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None): Returns: Tensor: Tensor which is created according to shape and dtype. 
- Raises: - TypeError: The dtype must be one of bool, float16, float32, float64, int32 and int64 - and the data type of ``out`` must be the same as the ``dtype``. - TypeError: The shape must be one of list, tuple and Tensor, the data type of ``shape`` - must be int32 or int64 when ``shape`` is a Tensor - Examples: .. code-block:: python @@ -690,11 +680,13 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None): if not isinstance(value, Variable): if dtype in ['int64', 'int32']: attrs['str_value'] = str(int(value)) + attrs['value'] = int(value) else: attrs['str_value'] = str(float(value)) + attrs['value'] = float(value) if in_dygraph_mode(): - shape = utils._convert_shape_to_list(shape) + shape = utils.convert_shape_to_list(shape) if out is None: out = _varbase_creator(dtype=dtype) @@ -718,20 +710,18 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None): value = cast(value, dtype) inputs['ValueTensor'] = value + check_shape(shape) check_dtype(dtype, 'dtype', ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], 'fill_constant') check_type(shape, 'shape', (Variable, list, tuple), 'fill_constant') - if isinstance(shape, Variable): - check_dtype(shape.dtype, 'shape', ['int32', 'int64'], 'fill_constant') - if out is not None: check_variable_and_dtype(out, 'out', [convert_dtype(dtype)], 'fill_constant') helper = LayerHelper("fill_constant", **locals()) - utils._get_shape_tensor_inputs( + utils.get_shape_tensor_inputs( inputs=inputs, attrs=attrs, shape=shape, op_type='fill_constant') if out is None: @@ -1050,10 +1040,6 @@ def ones(shape, dtype, force_cpu=False): Returns: Tensor: A tensor of data type :attr:`dtype` with shape :attr:`shape` and all elements set to 1. - Raises: - TypeError: The ``dtype`` must be one of bool, float16, float32, float64, int32, int64. - TypeError: The ``shape`` must be one of list, tuple and Tensor. The data type of ``shape`` must - be int32 or int64 when it's a Tensor. Examples: .. 
code-block:: python @@ -1086,10 +1072,6 @@ def zeros(shape, dtype, force_cpu=False, name=None): Returns: Tensor: A tensor of data type :attr:`dtype` with shape :attr:`shape` and all elements set to 0. - Raises: - TypeError: The ``dtype`` must be one of bool, float16, float32, float64, int32, int64. - TypeError: The ``shape`` must be one of list, tuple and Tensor. The data type of ``shape`` must - be int32 or int64 when it's a Tensor. Examples: .. code-block:: python @@ -1442,7 +1424,7 @@ def linspace(start, stop, num, dtype=None, name=None): stop(int|float|Tensor): The input :attr:`stop` is start variable of range. It is a scalar, \ or a Tensor of shape [1] with input data type int32, int64, float32 or float64. num(int|Tensor): The input :attr:`num` is given num of the sequence. It is an int scalar, \ - or a Tensor of shape [1] with data type int32 or int64. + or a Tensor of shape [1] with data type int32. dtype(np.dtype|str, optional): The data type of output tensor, it could be int32, int64, float32 and float64. Default: if None, the data type is float32. name(str, optional): Normally there is no need for user to set this property. @@ -1453,14 +1435,6 @@ def linspace(start, stop, num, dtype=None, name=None): the data shape of this tensor is :math:`[num]` . If the :attr:`num` is set 1, the output tensor just has \ the value with input :attr:`start`. - Raises: - TypeError: The ``dtype`` must be one of int32, int64, float32 and float64. - TypeError: The type of ``num`` must be int When it's not a Tensor. - TypeError: The data type of ``num`` must be int32 When it's a Tensor. - TypeError: The data type of ``start`` and ``stop`` must be same as ``dtype`` When it's a Tensor. - - - Examples: .. 
code-block:: python @@ -1474,35 +1448,51 @@ def linspace(start, stop, num, dtype=None, name=None): tensor_num = num tensor_start = start tensor_stop = stop + if not isinstance(num, Variable): + check_type(num, 'num', (int), 'linspace') if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) if not isinstance(start, Variable): - tensor_start = fill_constant([1], dtype, start) + with device_guard("cpu"): + tensor_start = fill_constant([1], dtype, start) if not isinstance(stop, Variable): - tensor_stop = fill_constant([1], dtype, stop) + with device_guard("cpu"): + tensor_stop = fill_constant([1], dtype, stop) if not isinstance(num, Variable): - tensor_num = fill_constant([1], 'int32', num) + with device_guard("cpu"): + tensor_num = fill_constant([1], 'int32', num) if in_dygraph_mode(): return core.ops.linspace(tensor_start, tensor_stop, tensor_num, 'dtype', dtype) helper = LayerHelper("linspace", **locals()) + start_dtype = convert_dtype(tensor_start.dtype) + stop_dtype = convert_dtype(tensor_stop.dtype) + out_dtype = convert_dtype(dtype) if isinstance(start, Variable): - check_dtype(start.dtype, 'start', (convert_dtype(dtype)), 'linspace') + check_dtype(start.dtype, 'start', + ['float32', 'float64', 'int32', 'int64'], 'linspace') else: check_type(start, 'start', (int, float), 'linspace') if isinstance(stop, Variable): - check_dtype(stop.dtype, 'stop', (convert_dtype(dtype)), 'linspace') + check_dtype(stop.dtype, 'stop', + ['float32', 'float64', 'int32', 'int64'], 'linspace') else: check_type(stop, 'stop', (int, float), 'linspace') if isinstance(num, Variable): check_dtype(num.dtype, 'num', ['int32'], 'linspace') - else: - check_type(num, 'num', (int), 'linspace') check_dtype(dtype, 'dtype', ['int32', 'int64', 'float32', 'float64'], 'linspace') + if ((stop_dtype == "float64" or start_dtype == "float64") and + out_dtype in ["float32", "int32"]) or ((stop_dtype == "int64" or + start_dtype == "int64") and + out_dtype == "int32"): + 
raise ValueError( + "The dtype of start/stop is {}/{} but the attr(dtype) of linspace is {}, " + "which may cause data type overflows. Please reset attr(dtype) of linspace." + .format(start_dtype, stop_dtype, dtype)) out = helper.create_variable_for_type_inference(dtype=dtype) @@ -1629,9 +1619,6 @@ def eye(num_rows, Returns: Tensor: An identity Tensor or LoDTensor of shape batch_shape + [num_rows, num_columns]. - Raises: - TypeError: The `dtype` must be one of float16, float32, float64, int32 and int64. - TypeError: The `num_columns` must be non-negative int. Examples: .. code-block:: python @@ -1743,3 +1730,9 @@ def ones_like(x, out=None): attrs={'value': 1.0}, outputs={'Out': [out]}) return out + + +@deprecated(since="2.0.0", update_to="paddle.triu") +def triu(input, diagonal=0, name=None): + import paddle + return paddle.tensor.triu(x=input, diagonal=diagonal, name=name) diff --git a/python/paddle/fluid/layers/utils.py b/python/paddle/fluid/layers/utils.py index 0d6965239e14b92d3d4997a9cf8efbe3fa7048b7..2095c9957e75b94396e573eba341f4cfded5dbc8 100644 --- a/python/paddle/fluid/layers/utils.py +++ b/python/paddle/fluid/layers/utils.py @@ -20,6 +20,7 @@ import numpy as np from ..framework import Variable from ..data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype from ..layer_helper import LayerHelper +from sys import version_info def convert_to_list(value, n, name, dtype=np.int): @@ -282,7 +283,7 @@ def _contain_var(list_or_tuple): return False -def _get_shape_tensor_inputs(inputs, attrs, shape, op_type): +def get_shape_tensor_inputs(inputs, attrs, shape, op_type): from .tensor import fill_constant, cast def _get_attr_shape(list_shape): @@ -347,7 +348,7 @@ def _convert_to_tensor_list(old_list, dtype="int32"): return new_list_tensor -def _convert_shape_to_list(shape): +def convert_shape_to_list(shape): """ Convert shape(list, tuple, variable) to list in imperative mode """ @@ -358,3 +359,22 @@ def _convert_shape_to_list(shape): else: 
shape = list(shape.numpy().astype(int)) return shape + + +def check_shape(shape): + """ + Check shape type and shape elements type before passing it to fill_constant + """ + if isinstance(shape, Variable): + check_dtype(shape.dtype, 'shape', ['int32', 'int64'], 'fill_constant') + else: + for ele in shape: + if not isinstance(ele, Variable): + if ele < 0: + raise ValueError( + "All elements in ``shape`` must be positive when it's a list or tuple" + ) + if not isinstance(ele, six.integer_types): + raise TypeError( + "All elements in ``shape`` must be integers when it's a list or tuple" + ) diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py old mode 100644 new mode 100755 index 9e2d77df777d761b6904d8916c7a35fb8e6bfaba..1e7915ed781a6441f32fb86c3c92e6f68ca66b93 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -170,7 +170,7 @@ class Optimizer(object): return state_dict @framework.dygraph_only - def set_dict(self, state_dict): + def set_state_dict(self, state_dict): ''' Load optimizer state dict. For Adam optimizer, contains beta1, beta2, momentum etc. If LearningRateDecay have been used, global_step will be changed. @@ -182,20 +182,22 @@ class Optimizer(object): Examples: .. 
code-block:: python - with fluid.dygraph.guard(): - emb = fluid.dygraph.Embedding([10, 10]) + import paddle + + paddle.disable_static() + + emb = paddle.nn.Embedding([10, 10]) - state_dict = emb.state_dict() - fluid.save_dygraph(state_dict, "paddle_dy") + state_dict = emb.state_dict() + paddle.save(state_dict, "paddle_dy") - adam = fluid.optimizer.Adam(learning_rate=fluid.layers.noam_decay( 100, 10000), + adam = paddle.optimizer.Adam(learning_rate=fluid.layers.noam_decay( 100, 10000), parameter_list=emb.parameters()) - state_dict = adam.state_dict() - fluid.save_dygraph(state_dict, "paddle_dy") + state_dict = adam.state_dict() - para_state_dict, opti_state_dict = fluid.load_dygraph( "paddle_dy") + para_state_dict, opti_state_dict = paddle.load("paddle_dy") - adam.set_dict(opti_state_dict) + adam.set_state_dict(opti_state_dict) ''' from paddle.optimizer.lr_scheduler import _LRScheduler @@ -257,6 +259,9 @@ class Optimizer(object): tensor.set(load_para_np, framework._current_expected_place()) + # [aliases] Compatible with old method names + set_dict = set_state_dict + def get_opti_var_name_list(self): return self._opti_name_list @@ -1599,7 +1604,7 @@ class LarsMomentumOptimizer(Optimizer): & local\_learning\_rate = learning\_rate * lars\_coeff * \\ \\frac{||param||}{||gradient|| + lars\_weight\_decay * ||param||} - & velocity = mu * velocity + local\_learning\_rate * (gradient + lars\_weight\_decay * param) + & velocity = mu * velocity + local\_learning\_rate * (gradient + lars\_weight\_decay * param + epsilon) & param = param - velocity @@ -1623,7 +1628,9 @@ class LarsMomentumOptimizer(Optimizer): :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. name (str, optional): This parameter is used by developers to print debugging information. \ For details, please refer to :ref:`api_guide_Name`. Default is None. 
- + exclude_from_weight_decay (list[str], optional): Name string of layers which will be exclude from lars weight decay. Default is None. + epsilon (float, optional): Epsilon to avoid Division by Zero when calculate local lr. Default is 0. + Examples: .. code-block:: python @@ -1654,7 +1661,9 @@ class LarsMomentumOptimizer(Optimizer): parameter_list=None, regularization=None, grad_clip=None, - name=None): + name=None, + exclude_from_weight_decay=None, + epsilon=0): assert learning_rate is not None assert momentum is not None super(LarsMomentumOptimizer, self).__init__( @@ -1667,6 +1676,11 @@ class LarsMomentumOptimizer(Optimizer): self._momentum = momentum self._lars_coeff = float(lars_coeff) self._lars_weight_decay = float(lars_weight_decay) + self._epsilon = float(epsilon) + if exclude_from_weight_decay is None: + self._exclude_from_weight_decay = [] + else: + self._exclude_from_weight_decay = exclude_from_weight_decay def _create_accumulators(self, block, parameters): assert isinstance(block, framework.Block) @@ -1677,6 +1691,14 @@ class LarsMomentumOptimizer(Optimizer): def _append_optimize_op(self, block, param_and_grad): assert isinstance(block, framework.Block) + _lars_weight_decay = self._lars_weight_decay + param_name = param_and_grad[0].name + if len(self._exclude_from_weight_decay) > 0: + for name in self._exclude_from_weight_decay: + if name in param_name: + _lars_weight_decay = 0.0 + break + velocity_acc = self._get_accumulator(self._velocity_acc_str, param_and_grad[0]) # create the momentum optimize op @@ -1695,7 +1717,8 @@ class LarsMomentumOptimizer(Optimizer): attrs={ "mu": self._momentum, "lars_coeff": self._lars_coeff, - "lars_weight_decay": self._lars_weight_decay + "lars_weight_decay": _lars_weight_decay, + "epsilon": self._epsilon }, stop_gradient=True) @@ -3547,8 +3570,10 @@ class ExponentialMovingAverage(object): # bias correction with layers.control_flow.Switch() as switch: with switch.case(global_step > 0): - layers.assign(output=ema, 
input=ema / (1.0 - decay_pow)) - layers.assign(input=ema, output=param) + layers.assign( + output=param, input=ema / (1.0 - decay_pow)) + with switch.default(): + layers.assign(output=param, input=ema) self.restore_program = Program() block = self.restore_program.global_block() @@ -4595,15 +4620,16 @@ class RecomputeOptimizer(Optimizer): ), "_checkpoints should be a list of Variable or a list of String" self._checkpoints = checkpoints - def load(self, stat_dict): + @framework.deprecate_stat_dict + def load(self, state_dict): """ - :api_attr: Static Graph + :api_attr: Static Graph load function is not supported by Recompute Optimizer for now. :return: None Args: - stat_dict: the dict load by load_persistable method + state_dict: the dict load by load_persistable method Examples: .. code-block:: python @@ -4627,8 +4653,8 @@ class RecomputeOptimizer(Optimizer): sgd = fluid.optimizer.RecomputeOptimizer(sgd) sgd._set_checkpoints([fc_1, pred]) try: - stat_dict = {} - sgd.load(stat_dict) + state_dict = {} + sgd.load(state_dict) except NotImplementedError as e: print(cpt.get_exception_message(e)) """ diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py index 76c95be75d67d60cd59efe13ecba6f01a1c1d614..533222531f98b188f9fe5b47184ff39736488bd6 100644 --- a/python/paddle/fluid/reader.py +++ b/python/paddle/fluid/reader.py @@ -85,6 +85,30 @@ def _convert_places(places): return ret +# NOTE(chenweihang): _reader_process_loop must be top level method to be pickled +def _reader_process_loop(batch_reader, data_queue): + try: + # set signal handler + core._set_process_signal_handler() + + # NOTE: [ mmap files clear ] When the child process exits unexpectedly, + # some shared memory objects may have been applied for but have not yet + # been put into the inter-process Queue. This part of the object needs + # to be cleaned up when the process ends. 
+ CleanupFuncRegistrar.register(_cleanup_mmap) + + for batch in batch_reader(): + tensor_list = core._convert_to_tensor_list(batch) + data_queue.put(tensor_list) + core._remove_tensor_list_mmap_fds(tensor_list) + data_queue.put(None) + except KeyboardInterrupt: + # NOTE: Main process will raise KeyboardInterrupt anyways, ignore it in child process + pass + except: + six.reraise(*sys.exc_info()) + + class DataLoaderBase(object): def __init__(self): self._places = None @@ -811,7 +835,8 @@ class DygraphGeneratorLoader(DataLoaderBase): global multiprocess_queue_set multiprocess_queue_set.add(self._data_queue) self._process = multiprocessing.Process( - target=self._reader_process_loop) + target=_reader_process_loop, + args=(self._batch_reader, self._data_queue)) self._process.daemon = True self._process.start() @@ -867,28 +892,6 @@ class DygraphGeneratorLoader(DataLoaderBase): self._blocking_queue.kill() logging.error("DataLoader reader thread raised an exception!") - def _reader_process_loop(self): - try: - # set signal handler - core._set_process_signal_handler() - - # NOTE: [ mmap files clear ] When the child process exits unexpectedly, - # some shared memory objects may have been applied for but have not yet - # been put into the inter-process Queue. This part of the object needs - # to be cleaned up when the process ends. - CleanupFuncRegistrar.register(_cleanup_mmap) - - for batch in self._batch_reader(): - tensor_list = core._convert_to_tensor_list(batch) - self._data_queue.put(tensor_list) - core._remove_tensor_list_mmap_fds(tensor_list) - self._data_queue.put(None) - except KeyboardInterrupt: - # NOTE: Main process will raise KeyboardInterrupt anyways, ignore it in child process - pass - except: - six.reraise(*sys.exc_info()) - def _reader_thread_loop_for_multiprocess(self): while not self._thread_done_event.is_set(): try: @@ -1723,13 +1726,13 @@ class DatasetLoader(DataLoaderBase): logging.warn('thread_num {} which is set in Dataset is ignored'. 
format(dataset.thread_num)) - dataset.set_thread(thread_num) + dataset._set_thread(thread_num) if isinstance(dataset, paddle.distributed.fleet.dataset. InMemoryDataset) and dataset.queue_num > thread_num: logging.warn("queue_num {} which is set in Dataset is ignored". format(dataset.queue_num)) - dataset.set_queue_num(thread_num) + dataset._set_queue_num(thread_num) self._dataset = dataset use_slots = [ diff --git a/python/paddle/fluid/tests/book/test_fit_a_line.py b/python/paddle/fluid/tests/book/test_fit_a_line.py index a7d5a0305993a637ba2ce7d59f91a0c03b700a69..9a2cc4ab1a1b9071825f92d7ed50d9db6f13a385 100644 --- a/python/paddle/fluid/tests/book/test_fit_a_line.py +++ b/python/paddle/fluid/tests/book/test_fit_a_line.py @@ -23,6 +23,8 @@ import math import sys import os +paddle.enable_static() + def train(use_cuda, save_dirname, is_local): x = fluid.layers.data(name='x', shape=[13], dtype='float32') diff --git a/python/paddle/fluid/tests/book/test_image_classification.py b/python/paddle/fluid/tests/book/test_image_classification.py index 22b74f2922887eb972806eac15904795b5a48ca7..7c2d5c693a9fdcea8f6249eaa8f418f87da1790e 100644 --- a/python/paddle/fluid/tests/book/test_image_classification.py +++ b/python/paddle/fluid/tests/book/test_image_classification.py @@ -24,6 +24,8 @@ import unittest import os import numpy as np +paddle.enable_static() + def resnet_cifar10(input, depth=32): def conv_bn_layer(input, diff --git a/python/paddle/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py index ef14600e6446505228b5cd70c9d9288cdae44a39..568d7518a1e0b161fe6b46c6a845c10681234c4b 100644 --- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py +++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py @@ -25,6 +25,8 @@ import paddle import paddle.dataset.conll05 as conll05 import paddle.fluid as fluid +paddle.enable_static() + word_dict, verb_dict, label_dict = conll05.get_dict() word_dict_len = 
len(word_dict) label_dict_len = len(label_dict) diff --git a/python/paddle/fluid/tests/book/test_machine_translation.py b/python/paddle/fluid/tests/book/test_machine_translation.py index 5e241aaa32727686b84a0354a11d5a92f9576a90..a0056ba3bab06bb90ddc8b0ffe7587cf1a1d59b1 100644 --- a/python/paddle/fluid/tests/book/test_machine_translation.py +++ b/python/paddle/fluid/tests/book/test_machine_translation.py @@ -24,6 +24,8 @@ from paddle.fluid.executor import Executor import unittest import os +paddle.enable_static() + dict_size = 30000 source_dict_dim = target_dict_dim = dict_size hidden_dim = 32 diff --git a/python/paddle/fluid/tests/book/test_recognize_digits.py b/python/paddle/fluid/tests/book/test_recognize_digits.py index 4fbb146752e73358a02a19fd5109e84ad00ecbae..71c57b851600d097ca4c6f13b6ba2050af9c825b 100644 --- a/python/paddle/fluid/tests/book/test_recognize_digits.py +++ b/python/paddle/fluid/tests/book/test_recognize_digits.py @@ -26,6 +26,8 @@ import paddle import paddle.fluid as fluid from paddle.fluid.layers.device import get_places +paddle.enable_static() + BATCH_SIZE = 64 diff --git a/python/paddle/fluid/tests/book/test_recommender_system.py b/python/paddle/fluid/tests/book/test_recommender_system.py index 433b5498de718d46395676b70b0abd0ab9240336..c2ab249f5713d419b95ff848f061568f3d058457 100644 --- a/python/paddle/fluid/tests/book/test_recommender_system.py +++ b/python/paddle/fluid/tests/book/test_recommender_system.py @@ -26,6 +26,8 @@ import paddle.fluid.nets as nets from paddle.fluid.executor import Executor from paddle.fluid.optimizer import SGDOptimizer +paddle.enable_static() + IS_SPARSE = True USE_GPU = False BATCH_SIZE = 256 diff --git a/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py b/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py index 0d65513c122d3ea9effcc391f6049b9c1b462546..3791e386ecfdefde15207926a6b43f0a14d4060e 100644 --- a/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py +++ 
b/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py @@ -25,6 +25,9 @@ import math import sys import unittest from paddle.fluid.executor import Executor +import paddle + +paddle.enable_static() dict_size = 30000 source_dict_dim = target_dict_dim = dict_size diff --git a/python/paddle/fluid/tests/book/test_word2vec.py b/python/paddle/fluid/tests/book/test_word2vec.py index c919584554b1613b6b3b125cf7beaddda931c47f..aae4de70aca19fbbfb9aa303bf2a9049b05854f1 100644 --- a/python/paddle/fluid/tests/book/test_word2vec.py +++ b/python/paddle/fluid/tests/book/test_word2vec.py @@ -23,6 +23,8 @@ import numpy as np import math import sys +paddle.enable_static() + def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True): PASS_NUM = 100 diff --git a/python/paddle/fluid/tests/custom_op/test_custom_op.py b/python/paddle/fluid/tests/custom_op/test_custom_op.py index 0d02da53d66d3a0ad3160f130153f013db92e1c9..c9f7d0b7c966ad1f99160de4b879f09b013bc513 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_op.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_op.py @@ -21,6 +21,8 @@ import contextlib import paddle import paddle.fluid as fluid +paddle.enable_static() + file_dir = os.path.dirname(os.path.abspath(__file__)) fluid.load_op_library(os.path.join(file_dir, 'librelu2_op.so')) diff --git a/python/paddle/fluid/tests/test_beam_search_decoder.py b/python/paddle/fluid/tests/test_beam_search_decoder.py index fe8a9daa3bea4b99bb42edc78538685c5ce11fe3..69f3ff46b3ac9c50f588a64182d02783cbc93aed 100644 --- a/python/paddle/fluid/tests/test_beam_search_decoder.py +++ b/python/paddle/fluid/tests/test_beam_search_decoder.py @@ -29,6 +29,8 @@ from paddle.fluid.contrib.decoder.beam_search_decoder import * import unittest import os +paddle.enable_static() + dict_size = 30000 source_dict_dim = target_dict_dim = dict_size src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size) diff --git a/python/paddle/fluid/tests/test_data_feeder.py 
b/python/paddle/fluid/tests/test_data_feeder.py index 16a33fd3ab3c794494687ba39278e327560686ec..d50c57e670b070238ba67f0a68930841159bc9ed 100644 --- a/python/paddle/fluid/tests/test_data_feeder.py +++ b/python/paddle/fluid/tests/test_data_feeder.py @@ -16,6 +16,9 @@ from __future__ import print_function import paddle.fluid as fluid import unittest +import paddle + +paddle.enable_static() class TestDataFeeder(unittest.TestCase): diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py index a1526934f4aa1415c529e79bfa8dea6c0754bea9..05b9067ec400f8be4da49bad31423767b2e876ea 100644 --- a/python/paddle/fluid/tests/test_detection.py +++ b/python/paddle/fluid/tests/test_detection.py @@ -19,6 +19,60 @@ import paddle.fluid.layers as layers from paddle.fluid.layers import detection from paddle.fluid.framework import Program, program_guard import unittest +import contextlib +import numpy as np +from unittests.test_imperative_base import new_program_scope +from paddle.fluid.dygraph import base +from paddle.fluid import core +import paddle + +paddle.enable_static() + + +class LayerTest(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.seed = 111 + + @classmethod + def tearDownClass(cls): + pass + + def _get_place(self, force_to_use_cpu=False): + # this option for ops that only have cpu kernel + if force_to_use_cpu: + return core.CPUPlace() + else: + if core.is_compiled_with_cuda(): + return core.CUDAPlace(0) + return core.CPUPlace() + + @contextlib.contextmanager + def static_graph(self): + with new_program_scope(): + fluid.default_startup_program().random_seed = self.seed + fluid.default_main_program().random_seed = self.seed + yield + + def get_static_graph_result(self, + feed, + fetch_list, + with_lod=False, + force_to_use_cpu=False): + exe = fluid.Executor(self._get_place(force_to_use_cpu)) + exe.run(fluid.default_startup_program()) + return exe.run(fluid.default_main_program(), + feed=feed, + fetch_list=fetch_list, 
+ return_numpy=(not with_lod)) + + @contextlib.contextmanager + def dynamic_graph(self, force_to_use_cpu=False): + with fluid.dygraph.guard( + self._get_place(force_to_use_cpu=force_to_use_cpu)): + fluid.default_startup_program().random_seed = self.seed + fluid.default_main_program().random_seed = self.seed + yield class TestDetection(unittest.TestCase): @@ -481,45 +535,67 @@ class TestRpnTargetAssign(unittest.TestCase): print(str(program)) -class TestGenerateProposals(unittest.TestCase): +class TestGenerateProposals(LayerTest): def test_generate_proposals(self): - program = Program() - with program_guard(program): - data_shape = [20, 64, 64] - images = fluid.layers.data( - name='images', shape=data_shape, dtype='float32') - im_info = fluid.layers.data( - name='im_info', shape=[3], dtype='float32') - anchors, variances = fluid.layers.anchor_generator( - name='anchor_generator', - input=images, - anchor_sizes=[32, 64], - aspect_ratios=[1.0], - variance=[0.1, 0.1, 0.2, 0.2], - stride=[16.0, 16.0], - offset=0.5) - num_anchors = anchors.shape[2] - scores = fluid.layers.data( - name='scores', shape=[num_anchors, 8, 8], dtype='float32') - bbox_deltas = fluid.layers.data( - name='bbox_deltas', - shape=[num_anchors * 4, 8, 8], - dtype='float32') - rpn_rois, rpn_roi_probs = fluid.layers.generate_proposals( - name='generate_proposals', - scores=scores, - bbox_deltas=bbox_deltas, - im_info=im_info, - anchors=anchors, - variances=variances, - pre_nms_top_n=6000, - post_nms_top_n=1000, - nms_thresh=0.5, - min_size=0.1, - eta=1.0) - self.assertIsNotNone(rpn_rois) - self.assertIsNotNone(rpn_roi_probs) - print(rpn_rois.shape) + scores_np = np.random.rand(2, 3, 4, 4).astype('float32') + bbox_deltas_np = np.random.rand(2, 12, 4, 4).astype('float32') + im_info_np = np.array([[8, 8, 0.5], [6, 6, 0.5]]).astype('float32') + anchors_np = np.reshape(np.arange(4 * 4 * 3 * 4), + [4, 4, 3, 4]).astype('float32') + variances_np = np.ones((4, 4, 3, 4)).astype('float32') + + with 
self.static_graph(): + scores = fluid.data( + name='scores', shape=[2, 3, 4, 4], dtype='float32') + bbox_deltas = fluid.data( + name='bbox_deltas', shape=[2, 12, 4, 4], dtype='float32') + im_info = fluid.data(name='im_info', shape=[2, 3], dtype='float32') + anchors = fluid.data( + name='anchors', shape=[4, 4, 3, 4], dtype='float32') + variances = fluid.data( + name='var', shape=[4, 4, 3, 4], dtype='float32') + rois, roi_probs, rois_num = fluid.layers.generate_proposals( + scores, + bbox_deltas, + im_info, + anchors, + variances, + pre_nms_top_n=10, + post_nms_top_n=5, + return_rois_num=True) + rois_stat, roi_probs_stat, rois_num_stat = self.get_static_graph_result( + feed={ + 'scores': scores_np, + 'bbox_deltas': bbox_deltas_np, + 'im_info': im_info_np, + 'anchors': anchors_np, + 'var': variances_np + }, + fetch_list=[rois, roi_probs, rois_num], + with_lod=True) + + with self.dynamic_graph(): + scores_dy = base.to_variable(scores_np) + bbox_deltas_dy = base.to_variable(bbox_deltas_np) + im_info_dy = base.to_variable(im_info_np) + anchors_dy = base.to_variable(anchors_np) + variances_dy = base.to_variable(variances_np) + rois, roi_probs, rois_num = fluid.layers.generate_proposals( + scores_dy, + bbox_deltas_dy, + im_info_dy, + anchors_dy, + variances_dy, + pre_nms_top_n=10, + post_nms_top_n=5, + return_rois_num=True) + rois_dy = rois.numpy() + roi_probs_dy = roi_probs.numpy() + rois_num_dy = rois_num.numpy() + + self.assertTrue(np.array_equal(np.array(rois_stat), rois_dy)) + self.assertTrue(np.array_equal(np.array(roi_probs_stat), roi_probs_dy)) + self.assertTrue(np.array_equal(np.array(rois_num_stat), rois_num_dy)) class TestYoloDetection(unittest.TestCase): @@ -648,30 +724,81 @@ class TestMulticlassNMS2(unittest.TestCase): self.assertIsNotNone(index) -class TestCollectFpnPropsals(unittest.TestCase): +class TestCollectFpnPropsals(LayerTest): def test_collect_fpn_proposals(self): - program = Program() - with program_guard(program): + multi_bboxes_np = [] + 
multi_scores_np = [] + rois_num_per_level_np = [] + for i in range(4): + bboxes_np = np.random.rand(5, 4).astype('float32') + scores_np = np.random.rand(5, 1).astype('float32') + rois_num = np.array([2, 3]).astype('int32') + multi_bboxes_np.append(bboxes_np) + multi_scores_np.append(scores_np) + rois_num_per_level_np.append(rois_num) + + with self.static_graph(): multi_bboxes = [] multi_scores = [] + rois_num_per_level = [] for i in range(4): - bboxes = layers.data( + bboxes = fluid.data( name='rois' + str(i), - shape=[10, 4], + shape=[5, 4], dtype='float32', - lod_level=1, - append_batch_size=False) - scores = layers.data( + lod_level=1) + scores = fluid.data( name='scores' + str(i), - shape=[10, 1], + shape=[5, 1], dtype='float32', - lod_level=1, - append_batch_size=False) + lod_level=1) + rois_num = fluid.data( + name='rois_num' + str(i), shape=[None], dtype='int32') + multi_bboxes.append(bboxes) multi_scores.append(scores) - fpn_rois = layers.collect_fpn_proposals(multi_bboxes, multi_scores, - 2, 5, 10) - self.assertIsNotNone(fpn_rois) + rois_num_per_level.append(rois_num) + + fpn_rois, rois_num = layers.collect_fpn_proposals( + multi_bboxes, + multi_scores, + 2, + 5, + 10, + rois_num_per_level=rois_num_per_level) + feed = {} + for i in range(4): + feed['rois' + str(i)] = multi_bboxes_np[i] + feed['scores' + str(i)] = multi_scores_np[i] + feed['rois_num' + str(i)] = rois_num_per_level_np[i] + fpn_rois_stat, rois_num_stat = self.get_static_graph_result( + feed=feed, fetch_list=[fpn_rois, rois_num], with_lod=True) + fpn_rois_stat = np.array(fpn_rois_stat) + rois_num_stat = np.array(rois_num_stat) + + with self.dynamic_graph(): + multi_bboxes_dy = [] + multi_scores_dy = [] + rois_num_per_level_dy = [] + for i in range(4): + bboxes_dy = base.to_variable(multi_bboxes_np[i]) + scores_dy = base.to_variable(multi_scores_np[i]) + rois_num_dy = base.to_variable(rois_num_per_level_np[i]) + multi_bboxes_dy.append(bboxes_dy) + multi_scores_dy.append(scores_dy) + 
rois_num_per_level_dy.append(rois_num_dy) + fpn_rois_dy, rois_num_dy = fluid.layers.collect_fpn_proposals( + multi_bboxes_dy, + multi_scores_dy, + 2, + 5, + 10, + rois_num_per_level=rois_num_per_level_dy) + fpn_rois_dy = fpn_rois_dy.numpy() + rois_num_dy = rois_num_dy.numpy() + + self.assertTrue(np.array_equal(fpn_rois_stat, fpn_rois_dy)) + self.assertTrue(np.array_equal(rois_num_stat, rois_num_dy)) def test_collect_fpn_proposals_error(self): def generate_input(bbox_type, score_type, name): @@ -717,20 +844,51 @@ class TestCollectFpnPropsals(unittest.TestCase): post_nms_top_n=2000) -class TestDistributeFpnProposals(unittest.TestCase): +class TestDistributeFpnProposals(LayerTest): def test_distribute_fpn_proposals(self): - program = Program() - with program_guard(program): - fpn_rois = fluid.layers.data( - name='data', shape=[4], dtype='float32', lod_level=1) - multi_rois, restore_ind = layers.distribute_fpn_proposals( - fpn_rois=fpn_rois, + rois_np = np.random.rand(10, 4).astype('float32') + rois_num_np = np.array([4, 6]).astype('int32') + with self.static_graph(): + rois = fluid.data(name='rois', shape=[10, 4], dtype='float32') + rois_num = fluid.data(name='rois_num', shape=[None], dtype='int32') + multi_rois, restore_ind, rois_num_per_level = layers.distribute_fpn_proposals( + fpn_rois=rois, min_level=2, max_level=5, refer_level=4, - refer_scale=224) - self.assertIsNotNone(multi_rois) - self.assertIsNotNone(restore_ind) + refer_scale=224, + rois_num=rois_num) + fetch_list = multi_rois + [restore_ind] + rois_num_per_level + output_stat = self.get_static_graph_result( + feed={'rois': rois_np, + 'rois_num': rois_num_np}, + fetch_list=fetch_list, + with_lod=True) + output_stat_np = [] + for output in output_stat: + output_np = np.array(output) + if len(output_np) > 0: + output_stat_np.append(output_np) + + with self.dynamic_graph(): + rois_dy = base.to_variable(rois_np) + rois_num_dy = base.to_variable(rois_num_np) + multi_rois_dy, restore_ind_dy, 
rois_num_per_level_dy = layers.distribute_fpn_proposals( + fpn_rois=rois_dy, + min_level=2, + max_level=5, + refer_level=4, + refer_scale=224, + rois_num=rois_num_dy) + output_dy = multi_rois_dy + [restore_ind_dy] + rois_num_per_level_dy + output_dy_np = [] + for output in output_dy: + output_np = output.numpy() + if len(output_np) > 0: + output_dy_np.append(output_np) + + for res_stat, res_dy in zip(output_stat_np, output_dy_np): + self.assertTrue(np.array_equal(res_stat, res_dy)) def test_distribute_fpn_proposals_error(self): program = Program() diff --git a/python/paddle/fluid/tests/test_error_clip.py b/python/paddle/fluid/tests/test_error_clip.py index 3c977afc7c813908fbe2dfb7445d9ca183cf2231..7859fca15f643fa00384ae4387ca07074b2ed868 100644 --- a/python/paddle/fluid/tests/test_error_clip.py +++ b/python/paddle/fluid/tests/test_error_clip.py @@ -22,6 +22,7 @@ BATCH_SIZE = 128 CLIP_MAX = 2e-6 CLIP_MIN = -1e-6 +paddle.enable_static() prog = fluid.framework.Program() with fluid.program_guard(main_program=prog): diff --git a/python/paddle/fluid/tests/test_if_else_op.py b/python/paddle/fluid/tests/test_if_else_op.py index 1c992b9d8cd38a3851f99b1fc78ef5639c7f6eef..b7792e5ce27a55c9862d1e9a751fc6599d83dc7e 100644 --- a/python/paddle/fluid/tests/test_if_else_op.py +++ b/python/paddle/fluid/tests/test_if_else_op.py @@ -28,6 +28,8 @@ from paddle.fluid.layers.control_flow import ConditionalBlock import unittest import numpy as np +paddle.enable_static() + class TestMNISTIfElseOp(unittest.TestCase): # FIXME: https://github.com/PaddlePaddle/Paddle/issues/12245#issuecomment-406462379 diff --git a/python/paddle/fluid/tests/test_python_operator_overriding.py b/python/paddle/fluid/tests/test_python_operator_overriding.py index 5f92c437ec726f510d9194d23f1a01a5478827d6..fd9dc961988df750144e089a971938148c21940a 100644 --- a/python/paddle/fluid/tests/test_python_operator_overriding.py +++ b/python/paddle/fluid/tests/test_python_operator_overriding.py @@ -21,6 +21,9 @@ import numpy as 
np import paddle.fluid.layers as layers import paddle.fluid.framework as framework import paddle.fluid as fluid +import paddle + +paddle.enable_static() class TestPythonOperatorOverride(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index b78c597de839ec57ad95dc2953c4d71a653f378a..8d236dca22f2266771a029b7cdbf7db21aefb1fe 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -4,6 +4,7 @@ set(GC_ENVS FLAGS_eager_delete_tensor_gb=0.0 FLAGS_fast_eager_deletion_mode=1 FL set(dist_ENVS http_proxy="" https_proxy="") file(GLOB DIST_TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_dist_*.py") +list(REMOVE_ITEM DIST_TEST_OPS "test_dist_op") if(NOT WITH_NCCL) list(REMOVE_ITEM DIST_TEST_OPS "test_dist_mnist_dgc_nccl") endif() @@ -47,6 +48,8 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleet_dgc_meta_optimizer) list(APPEND MIXED_DIST_TEST_OPS test_fleet_private_function) list(APPEND MIXED_DIST_TEST_OPS test_fleet_graph_executor) list(APPEND MIXED_DIST_TEST_OPS test_fleet_meta_optimizer_base) +list(APPEND MIXED_DIST_TEST_OPS test_fleet_distributed_strategy) +list(APPEND MIXED_DIST_TEST_OPS test_fleet_auto) foreach(TEST_OP ${MIXED_DIST_TEST_OPS}) list(REMOVE_ITEM TEST_OPS ${TEST_OP}) endforeach() @@ -323,7 +326,6 @@ list(REMOVE_ITEM TEST_OPS test_basic_gru_api) list(REMOVE_ITEM TEST_OPS test_basic_gru_unit_op) list(REMOVE_ITEM TEST_OPS test_basic_lstm_api) list(REMOVE_ITEM TEST_OPS test_basic_lstm_unit_op) -list(REMOVE_ITEM TEST_OPS test_imperative_debug_string) list(REMOVE_ITEM TEST_OPS test_fuse_bn_act_pass) list(REMOVE_ITEM TEST_OPS test_imperative_static_runner_mnist) list(REMOVE_ITEM TEST_OPS test_imperative_static_runner_while) @@ -397,23 +399,22 @@ py_test_modules(test_bilinear_interp_op MODULES test_bilinear_interp_op ENVS ${G py_test_modules(test_nearest_interp_op MODULES test_nearest_interp_op ENVS ${GC_ENVS}) 
py_test_modules(test_imperative_resnet MODULES test_imperative_resnet ENVS FLAGS_cudnn_deterministic=1 SERIAL) -set_tests_properties(test_imperative_resnet PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") +set_tests_properties(test_imperative_resnet PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY") py_test_modules(test_imperative_resnet_sorted_gradient MODULES test_imperative_resnet_sorted_gradient ENVS FLAGS_cudnn_deterministic=1 SERIAL) -set_tests_properties(test_imperative_resnet_sorted_gradient PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") +set_tests_properties(test_imperative_resnet_sorted_gradient PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY") py_test_modules(test_imperative_mnist MODULES test_imperative_mnist ENVS FLAGS_cudnn_deterministic=1) py_test_modules(test_imperative_mnist_sorted_gradient MODULES test_imperative_mnist_sorted_gradient ENVS FLAGS_cudnn_deterministic=1) py_test_modules(test_imperative_se_resnext MODULES test_imperative_se_resnext ENVS FLAGS_cudnn_deterministic=1 SERIAL) -set_tests_properties(test_imperative_se_resnext PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") +set_tests_properties(test_imperative_se_resnext PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY") py_test_modules(test_imperative_ocr_attention_model MODULES test_imperative_ocr_attention_model ENVS FLAGS_cudnn_deterministic=1 SERIAL) py_test_modules(test_install_check MODULES test_install_check ENVS FLAGS_cudnn_deterministic=1 SERIAL) set_tests_properties(test_install_check PROPERTIES LABELS "RUN_TYPE=DIST") -py_test_modules(test_imperative_debug_string MODULES test_imperative_debug_string ENVS FLAGS_dygraph_debug=1) py_test_modules(test_imperative_static_runner_mnist MODULES test_imperative_static_runner_mnist ENVS FLAGS_cudnn_deterministic=1) py_test_modules(test_imperative_static_runner_while MODULES test_imperative_static_runner_while ENVS @@ -458,6 +459,8 @@ if(WITH_DISTRIBUTE) py_test_modules(test_fleet_pipeline_meta_optimizer MODULES test_fleet_pipeline_meta_optimizer ENVS ${dist_ENVS}) 
py_test_modules(test_fleet_private_function MODULES test_fleet_private_function ENVS ${dist_ENVS}) py_test_modules(test_fleet_meta_optimizer_base MODULES test_fleet_meta_optimizer_base ENVS ${dist_ENVS}) + py_test_modules(test_fleet_distributed_strategy MODULES test_fleet_distributed_strategy) + #py_test_modules(test_fleet_auto MODULES test_fleet_auto ENVS ${dist_ENVS}) if(NOT WIN32) py_test_modules(test_fleet_localsgd_meta_optimizer MODULES test_fleet_localsgd_meta_optimizer ENVS ${dist_ENVS}) py_test_modules(test_fleet_lars_meta_optimizer MODULES test_fleet_lars_meta_optimizer ENVS ${dist_ENVS}) @@ -555,8 +558,8 @@ endif() set_tests_properties(test_parallel_executor_test_while_train test_parallel_executor_mnist test_parallel_executor_feed_persistable_var test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass - test_data_norm_op test_imperative_using_non_zero_gpu test_fuse_bn_act_pass - test_optimizer_in_control_flow test_dataloader_keep_order + test_data_norm_op + test_dataloader_keep_order test_dataloader_unkeep_order test_parallel_executor_fetch_isolated_var test_parallel_executor_inference_feed_partial_data @@ -585,8 +588,10 @@ endif() # setting timeout value for old unittests # set_tests_properties(test_dist_fleet_sparse_embedding_ctr PROPERTIES TIMEOUT 200) -set_tests_properties(test_fused_elemwise_activation_op PROPERTIES TIMEOUT 150) -set_tests_properties(test_gru_op PROPERTIES TIMEOUT 200) -set_tests_properties(test_layer_norm_op PROPERTIES TIMEOUT 150) -set_tests_properties(test_pool3d_op PROPERTIES TIMEOUT 150) -set_tests_properties(test_regularizer PROPERTIES TIMEOUT 150) +if(NOT WIN32 AND NOT APPLE) + set_tests_properties(test_fused_elemwise_activation_op PROPERTIES TIMEOUT 150) + set_tests_properties(test_gru_op PROPERTIES TIMEOUT 200) + set_tests_properties(test_layer_norm_op PROPERTIES TIMEOUT 150) + set_tests_properties(test_pool3d_op PROPERTIES TIMEOUT 150) + set_tests_properties(test_regularizer PROPERTIES TIMEOUT 150) +endif() diff 
--git a/python/paddle/fluid/tests/unittests/auto_checkpoint_utils.py b/python/paddle/fluid/tests/unittests/auto_checkpoint_utils.py index 529ff4ec45d1fdc6d1d8e765e38cff53d36aade7..2464882d617effb838c1f40d40ec2d89c13e73d2 100644 --- a/python/paddle/fluid/tests/unittests/auto_checkpoint_utils.py +++ b/python/paddle/fluid/tests/unittests/auto_checkpoint_utils.py @@ -20,8 +20,7 @@ from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet import os import sys -from paddle.fluid.incubate.fleet.utils.fs import LocalFS -from paddle.fluid.incubate.fleet.utils.hdfs import HDFSClient +from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient import paddle.fluid.incubate.checkpoint.auto_checkpoint as acp from paddle.fluid.incubate.checkpoint.checkpoint_saver import PaddleModel from paddle.fluid.framework import program_guard diff --git a/python/paddle/fluid/tests/unittests/c_comm_init_op.py b/python/paddle/fluid/tests/unittests/c_comm_init_op.py index db77477cca62d10ff6692013a64a8d2ce5a38ec1..ed6a75230c60d194783cffec117b8d1d2bb9cda0 100644 --- a/python/paddle/fluid/tests/unittests/c_comm_init_op.py +++ b/python/paddle/fluid/tests/unittests/c_comm_init_op.py @@ -19,6 +19,9 @@ import os import paddle.fluid.core as core import paddle.fluid as fluid from paddle.distributed.fleet.base.private_helper_function import wait_server_ready +import paddle + +paddle.enable_static() class TestCCommInitOp(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/check_nan_inf_base.py b/python/paddle/fluid/tests/unittests/check_nan_inf_base.py index 8e75b3c3438c0afb24871838b66e1285da78c592..c682c795019caff14e17f808ceac3fa5a5162562 100644 --- a/python/paddle/fluid/tests/unittests/check_nan_inf_base.py +++ b/python/paddle/fluid/tests/unittests/check_nan_inf_base.py @@ -28,6 +28,8 @@ import paddle import paddle.fluid as fluid import paddle.compat as cpt +paddle.enable_static() + np.random.seed(0) diff --git 
a/python/paddle/fluid/tests/unittests/collective_allgather_api.py b/python/paddle/fluid/tests/unittests/collective_allgather_api.py index bdf4ca07ae9b57e083137945f58aaabb571e20ec..63d7f52c11a8ad3ad041ad82e30b8124a899fd61 100644 --- a/python/paddle/fluid/tests/unittests/collective_allgather_api.py +++ b/python/paddle/fluid/tests/unittests/collective_allgather_api.py @@ -35,6 +35,8 @@ import paddle.fluid.layers as layers from functools import reduce from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main +paddle.enable_static() + class TestCollectiveAllgatherAPI(TestCollectiveAPIRunnerBase): def __init__(self): diff --git a/python/paddle/fluid/tests/unittests/collective_allgather_op.py b/python/paddle/fluid/tests/unittests/collective_allgather_op.py index 349996547687657453497956007d2431b11ea45f..f77a97aa915f6fd63a4d5ed0d95752c6ca022eb1 100644 --- a/python/paddle/fluid/tests/unittests/collective_allgather_op.py +++ b/python/paddle/fluid/tests/unittests/collective_allgather_op.py @@ -34,6 +34,8 @@ import paddle.fluid.layers as layers from functools import reduce from test_collective_base import TestCollectiveRunnerBase, runtime_main +paddle.enable_static() + class TestCollectiveAllGather(TestCollectiveRunnerBase): def __init__(self): diff --git a/python/paddle/fluid/tests/unittests/collective_allreduce_api.py b/python/paddle/fluid/tests/unittests/collective_allreduce_api.py index aea429ae5e3e622ee1b584796ef87edc1d4c8d72..67242b274fcb154273127ef020fc14896af6ad8e 100644 --- a/python/paddle/fluid/tests/unittests/collective_allreduce_api.py +++ b/python/paddle/fluid/tests/unittests/collective_allreduce_api.py @@ -35,6 +35,8 @@ import paddle.fluid.layers as layers from functools import reduce from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main +paddle.enable_static() + class TestCollectiveAllreduceAPI(TestCollectiveAPIRunnerBase): def __init__(self): diff --git 
a/python/paddle/fluid/tests/unittests/collective_allreduce_op.py b/python/paddle/fluid/tests/unittests/collective_allreduce_op.py index 9aef8879cab15ade735195ab173d9386764fb690..eef59ee3dde92c6ceaecbe15e997fb958b1fab19 100644 --- a/python/paddle/fluid/tests/unittests/collective_allreduce_op.py +++ b/python/paddle/fluid/tests/unittests/collective_allreduce_op.py @@ -35,6 +35,8 @@ import paddle.fluid.layers as layers from functools import reduce from test_collective_base import TestCollectiveRunnerBase, runtime_main +paddle.enable_static() + class TestCollectiveAllreduce(TestCollectiveRunnerBase): def __init__(self): diff --git a/python/paddle/fluid/tests/unittests/collective_barrier_api.py b/python/paddle/fluid/tests/unittests/collective_barrier_api.py index 09b3c27126d926ac7175f6045f385adf4d530b44..dbcc70d540bd6acff89342f2a44a751757c39494 100644 --- a/python/paddle/fluid/tests/unittests/collective_barrier_api.py +++ b/python/paddle/fluid/tests/unittests/collective_barrier_api.py @@ -35,6 +35,8 @@ import paddle.fluid.layers as layers from functools import reduce from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main +paddle.enable_static() + class TestCollectiveBarrierAPI(TestCollectiveAPIRunnerBase): def __init__(self): diff --git a/python/paddle/fluid/tests/unittests/collective_broadcast_api.py b/python/paddle/fluid/tests/unittests/collective_broadcast_api.py index a879a027b50688234c8efb8468e6eac660d8a145..08a3d948906a8bb40299bd0aed645b8425f1e7ae 100644 --- a/python/paddle/fluid/tests/unittests/collective_broadcast_api.py +++ b/python/paddle/fluid/tests/unittests/collective_broadcast_api.py @@ -35,6 +35,8 @@ import paddle.fluid.layers as layers from functools import reduce from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main +paddle.enable_static() + class TestCollectiveBroadcastAPI(TestCollectiveAPIRunnerBase): def __init__(self): diff --git a/python/paddle/fluid/tests/unittests/collective_broadcast_op.py 
b/python/paddle/fluid/tests/unittests/collective_broadcast_op.py index 18f0485f923e4f72f76be3b0b34ebeb1d89c926c..127f48be61851a8264b2a6d4db57fcbd984f1d53 100644 --- a/python/paddle/fluid/tests/unittests/collective_broadcast_op.py +++ b/python/paddle/fluid/tests/unittests/collective_broadcast_op.py @@ -35,6 +35,8 @@ import paddle.fluid.layers as layers from functools import reduce from test_collective_base import TestCollectiveRunnerBase, runtime_main +paddle.enable_static() + class TestCollectiveBroadcast(TestCollectiveRunnerBase): def __init__(self): diff --git a/python/paddle/fluid/tests/unittests/collective_reduce_api.py b/python/paddle/fluid/tests/unittests/collective_reduce_api.py index 3e89b1cb3ee8550d3dbb4e1a055f092e57126c7f..41e31146a22297fc9328ebc804638c729bd423f0 100644 --- a/python/paddle/fluid/tests/unittests/collective_reduce_api.py +++ b/python/paddle/fluid/tests/unittests/collective_reduce_api.py @@ -35,6 +35,8 @@ import paddle.fluid.layers as layers from functools import reduce from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main +paddle.enable_static() + class TestCollectiveReduceAPI(TestCollectiveAPIRunnerBase): def __init__(self): diff --git a/python/paddle/fluid/tests/unittests/collective_reduce_op.py b/python/paddle/fluid/tests/unittests/collective_reduce_op.py index da61284344b58d44c5ba02af5ed42c553f857c94..0448c66d1323405abe3fb468583073caf260bb6e 100644 --- a/python/paddle/fluid/tests/unittests/collective_reduce_op.py +++ b/python/paddle/fluid/tests/unittests/collective_reduce_op.py @@ -35,6 +35,8 @@ import paddle.fluid.layers as layers from functools import reduce from test_collective_base import TestCollectiveRunnerBase, runtime_main +paddle.enable_static() + class TestCollectiveReduce(TestCollectiveRunnerBase): def __init__(self): diff --git a/python/paddle/fluid/tests/unittests/collective_reduce_op_calc_stream.py b/python/paddle/fluid/tests/unittests/collective_reduce_op_calc_stream.py index 
7e6904286234364e7ae84a5c21b9826885f99dc4..7a9e0b148d55667622470a4ad117991fc7ad4c0a 100644 --- a/python/paddle/fluid/tests/unittests/collective_reduce_op_calc_stream.py +++ b/python/paddle/fluid/tests/unittests/collective_reduce_op_calc_stream.py @@ -35,6 +35,8 @@ import paddle.fluid.layers as layers from functools import reduce from test_collective_base import TestCollectiveRunnerBase, runtime_main +paddle.enable_static() + class TestCollectiveReduce(TestCollectiveRunnerBase): def __init__(self): diff --git a/python/paddle/fluid/tests/unittests/collective_reducescatter.py b/python/paddle/fluid/tests/unittests/collective_reducescatter.py index 2f14277ae1e549b0b8dc075694752c18b395d230..8b989c73d4deb69e85b821ef0b2091ef0af7a0c4 100644 --- a/python/paddle/fluid/tests/unittests/collective_reducescatter.py +++ b/python/paddle/fluid/tests/unittests/collective_reducescatter.py @@ -34,6 +34,8 @@ import paddle.fluid.layers as layers from functools import reduce from test_collective_base import TestCollectiveRunnerBase, runtime_main +paddle.enable_static() + class TestCollectiveReduceScatter(TestCollectiveRunnerBase): def __init__(self): diff --git a/python/paddle/fluid/tests/unittests/collective_reducescatter_op.py b/python/paddle/fluid/tests/unittests/collective_reducescatter_op.py index 3e286d7f43db6e9cd290b88a0be5a4ae1215737a..91712e2b50f230b68743a4fcd3a7cba767c1f304 100644 --- a/python/paddle/fluid/tests/unittests/collective_reducescatter_op.py +++ b/python/paddle/fluid/tests/unittests/collective_reducescatter_op.py @@ -35,6 +35,8 @@ import paddle.fluid.layers as layers from functools import reduce from test_collective_base import TestCollectiveRunnerBase, runtime_main +paddle.enable_static() + class TestCollectiveReduceScatter(TestCollectiveRunnerBase): def __init__(self): diff --git a/python/paddle/fluid/tests/unittests/collective_scatter_api.py b/python/paddle/fluid/tests/unittests/collective_scatter_api.py index 
f68929ad3b36d5a0bf145a93b30172f0422dc9f9..ca36c8c83a5e26c74de88a701cc9421ddf0d81d2 100644 --- a/python/paddle/fluid/tests/unittests/collective_scatter_api.py +++ b/python/paddle/fluid/tests/unittests/collective_scatter_api.py @@ -35,6 +35,8 @@ import paddle.fluid.layers as layers from functools import reduce from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main +paddle.enable_static() + class TestCollectiveScatterAPI(TestCollectiveAPIRunnerBase): def __init__(self): diff --git a/python/paddle/fluid/tests/unittests/collective_scatter_op.py b/python/paddle/fluid/tests/unittests/collective_scatter_op.py index efe5e17bcce1ecddf859edbb3543876fe5fc9f89..7afa4aec63990372d69f1d16c133e6698aef4dc9 100644 --- a/python/paddle/fluid/tests/unittests/collective_scatter_op.py +++ b/python/paddle/fluid/tests/unittests/collective_scatter_op.py @@ -35,6 +35,8 @@ import paddle.fluid.layers as layers from functools import reduce from test_collective_base import TestCollectiveRunnerBase, runtime_main +paddle.enable_static() + class TestCollectiveScatter(TestCollectiveRunnerBase): def __init__(self): diff --git a/python/paddle/fluid/tests/unittests/ctr_dataset_reader.py b/python/paddle/fluid/tests/unittests/ctr_dataset_reader.py index 863c001f226f86384e2820cb6877ded48cffa119..15e98481c26b20de4e9fa493fa022380ba1fcd63 100644 --- a/python/paddle/fluid/tests/unittests/ctr_dataset_reader.py +++ b/python/paddle/fluid/tests/unittests/ctr_dataset_reader.py @@ -153,7 +153,7 @@ def gen_fake_line(dnn_data_num=7, return line -def prepare_fake_data(file_nums=8, file_lines=1000): +def prepare_fake_data(file_nums=9, file_lines=1000): """ Create fake data with same type as avazu_ctr_data """ diff --git a/python/paddle/fluid/tests/unittests/dist_allreduce_op.py b/python/paddle/fluid/tests/unittests/dist_allreduce_op.py index 88a3cd14c43334f2abed9c8b435b64d47a65dc85..de52072d4a8388aaf7d90428a2704e984360b7ba 100644 --- a/python/paddle/fluid/tests/unittests/dist_allreduce_op.py +++ 
b/python/paddle/fluid/tests/unittests/dist_allreduce_op.py @@ -30,6 +30,8 @@ import signal from functools import reduce from test_dist_base import TestDistRunnerBase, runtime_main +paddle.enable_static() + DTYPE = "float32" paddle.dataset.mnist.fetch() diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py index dc39472d7aed8f52ee3bb0f85a5e503db9093070..8277499fcce341207fa75a74dfda0a2ccc2e3b63 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py @@ -30,6 +30,8 @@ import ctr_dataset_reader from test_dist_fleet_base import runtime_main, FleetDistRunnerBase from paddle.distributed.fleet.base.util_factory import fleet_util +paddle.enable_static() + # Fix seed for test fluid.default_startup_program().random_seed = 1 fluid.default_main_program().random_seed = 1 @@ -208,14 +210,16 @@ class TestDistCTR2x2(FleetDistRunnerBase): filelist = train_file_list # config dataset - dataset = paddle.distributed.fleet.DatasetFactory().create_dataset() - dataset.set_batch_size(batch_size) - dataset.set_use_var(self.feeds) + dataset = paddle.distributed.QueueDataset() pipe_command = 'python ctr_dataset_reader.py' - dataset.set_pipe_command(pipe_command) + + dataset.init( + batch_size=batch_size, + use_var=self.feeds, + pipe_command=pipe_command, + thread_num=thread_num) dataset.set_filelist(filelist) - dataset.set_thread(thread_num) for epoch_id in range(1): pass_start = time.time() diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_ctr_ps_gpu.py b/python/paddle/fluid/tests/unittests/dist_fleet_ctr_ps_gpu.py index 03d0fa447daf3e3a502e7d77491045f92695496c..0e3c80992771424e4216a79b991de1c62884c757 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_ctr_ps_gpu.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_ctr_ps_gpu.py @@ -114,14 +114,14 @@ class TestDistGpuPsCTR2x2(TestDistCTR2x2): filelist.append(train_file_path) # config 
dataset - dataset = paddle.fleet.DatasetFactory().create_dataset() - dataset.set_batch_size(batch_size) - dataset.set_use_var(self.feeds) + dataset = paddle.distributed.QueueDataset() + dataset._set_batch_size(batch_size) + dataset._set_use_var(self.feeds) pipe_command = 'python ctr_dataset_reader.py' - dataset.set_pipe_command(pipe_command) + dataset._set_pipe_command(pipe_command) dataset.set_filelist(filelist) - dataset.set_thread(thread_num) + dataset._set_thread(thread_num) for epoch_id in range(1): pass_start = time.time() diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py index 0de898d6dde217ec6d5cdf53611f986f7b04863f..2f938a813d8a7598e49023066759a490eab53263 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py @@ -31,6 +31,8 @@ from test_dist_fleet_heter_base import runtime_main, FleetDistHeterRunnerBase from dist_fleet_ctr import TestDistCTR2x2, fake_ctr_reader from paddle.distributed.fleet.base.util_factory import fleet_util +paddle.enable_static() + # Fix seed for test fluid.default_startup_program().random_seed = 1 fluid.default_main_program().random_seed = 1 @@ -177,20 +179,20 @@ class TestHeterPsCTR2x2(FleetDistHeterRunnerBase): fleet.init_worker() exe.run(fluid.default_startup_program()) - thread_num = 1 + thread_num = int(os.getenv("CPU_NUM", 2)) batch_size = 128 filelist = fleet_util.get_file_shard(train_file_list) print("filelist: {}".format(filelist)) # config dataset - dataset = paddle.distributed.fleet.DatasetFactory().create_dataset() - dataset.set_batch_size(batch_size) - dataset.set_use_var(self.feeds) + dataset = paddle.distributed.QueueDataset() + dataset._set_batch_size(batch_size) + dataset._set_use_var(self.feeds) pipe_command = 'python ctr_dataset_reader.py' - dataset.set_pipe_command(pipe_command) + dataset._set_pipe_command(pipe_command) 
dataset.set_filelist(filelist) - dataset.set_thread(thread_num) + dataset._set_thread(thread_num) for epoch_id in range(1): pass_start = time.time() diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py b/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py index 7d5ca4fc6e3916eab29942c85e88664f60cbf032..2ea69e1b6763087bb2b278b59a8a59b4331847da 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_simnet_bow.py @@ -34,6 +34,8 @@ from functools import reduce from test_dist_fleet_base import runtime_main, FleetDistRunnerBase from paddle.distributed.fleet.base.util_factory import fleet_util +paddle.enable_static() + DTYPE = "int64" DATA_URL = 'http://paddle-dist-ce-data.bj.bcebos.com/simnet.train.1000' DATA_MD5 = '24e49366eb0611c552667989de2f57d5' diff --git a/python/paddle/fluid/tests/unittests/dist_mnist.py b/python/paddle/fluid/tests/unittests/dist_mnist.py index 20e89bd46c67b557cd6ab1ad0fd531a6b22f947d..f63139464e7552ed82c74171717b1b32f33caa09 100644 --- a/python/paddle/fluid/tests/unittests/dist_mnist.py +++ b/python/paddle/fluid/tests/unittests/dist_mnist.py @@ -31,6 +31,8 @@ from functools import reduce from test_dist_base import TestDistRunnerBase, runtime_main from paddle.fluid.incubate.fleet.collective import fleet, DistributedStrategy +paddle.enable_static() + DTYPE = "float32" paddle.dataset.mnist.fetch() diff --git a/python/paddle/fluid/tests/unittests/dist_se_resnext.py b/python/paddle/fluid/tests/unittests/dist_se_resnext.py index a2fd61e2387ee362946c15788d76cba4dec46055..5ba40c7c8388c45810852946f5e790bc1213767d 100644 --- a/python/paddle/fluid/tests/unittests/dist_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/dist_se_resnext.py @@ -30,6 +30,8 @@ import sys import signal from test_dist_base import TestDistRunnerBase, runtime_main +paddle.enable_static() + # Fix seed for test fluid.default_startup_program().random_seed = 1 
fluid.default_main_program().random_seed = 1 diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/predictor_utils.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/predictor_utils.py index 75bff108dd43665df0fc1c8b166a935946b4fbc7..ba0adaf32e15db71162aed71c042100a0cd50e26 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/predictor_utils.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/predictor_utils.py @@ -81,7 +81,7 @@ class PredictorTools(object): tensor.set_lod(feed_data.lod()) # ensure no diff in multiple repeat times - repeat_time = 10 + repeat_time = 2 for i in range(repeat_time): predictor.zero_copy_run() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py index 949286f63efb3357325f25b02f60e938eebd28e8..450ef7557bc1574c31a00d05154aead19083c1bc 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py @@ -19,7 +19,7 @@ import paddle import paddle.fluid as fluid from paddle.static import InputSpec from paddle.fluid.dygraph import to_variable, declarative, ProgramTranslator, Layer, jit -from paddle.fluid.dygraph.dygraph_to_static.program_translator import ConcreteProgram +from paddle.fluid.dygraph.dygraph_to_static.program_translator import ConcreteProgram, StaticLayer from test_basic_api_transformation import dyfunc_to_variable @@ -47,8 +47,8 @@ class SimpleNet(Layer): return z @declarative(input_spec=[[InputSpec([None, 10]), InputSpec([None, 10])]]) - def func_with_list(self, l): - x, y, int_val = l + def func_with_list(self, l, int_val=1): + x, y = l z = x + y z = z + int_val return z @@ -60,10 +60,7 @@ class SimpleNet(Layer): def func_with_dict(self, d): x = d['x'] y = d['y'] - int_val = d['int_val'] - z = x + y - z = z + int_val return z @@ -84,6 +81,23 @@ class SimpleNet(Layer): return 
z +class TestStaticLayerInstance(unittest.TestCase): + def test_instance_same_class(self): + with fluid.dygraph.guard(fluid.CPUPlace()): + net_1 = SimpleNet() + net_2 = SimpleNet() + + self.assertTrue(isinstance(net_1.forward, StaticLayer)) + self.assertTrue(isinstance(net_2.forward, StaticLayer)) + self.assertNotEqual(net_1.forward, net_2.forward) + + # convert layer into static progam of net_1 + net_1.forward.concrete_program + self.assertTrue(len(net_1.forward.program_cache) == 1) + # check no conversion applid with net_2 + self.assertTrue(len(net_2.forward.program_cache) == 0) + + class TestInputSpec(unittest.TestCase): def setUp(self): pass @@ -114,10 +128,10 @@ class TestInputSpec(unittest.TestCase): self.assertTrue(len(net.add_func.program_cache) == 1) # 5. test input with list - out = net.func_with_list([x, y, int_val]) + out = net.func_with_list([x, y], int_val) # 6. test input with dict - out = net.func_with_dict({'x': x, 'y': y, 'int_val': int_val}) + out = net.func_with_dict({'x': x, 'y': y}) # 7. test input with lits contains dict int_np = np.ones([1]).astype('float32') @@ -224,7 +238,6 @@ class TestDifferentInputSpecCacheProgram(unittest.TestCase): # 1. specific InputSpec for `x`/`y` concrete_program_1 = foo.get_concrete_program( InputSpec([None, 10]), InputSpec([10])) - print(concrete_program_1) self.assertTrue(len(foo.program_cache) == 1) # 2. 
specific `c`/`d` explicitly with same default value @@ -277,6 +290,30 @@ class TestDifferentInputSpecCacheProgram(unittest.TestCase): foo_3.concrete_program +class TestInputDefaultName(unittest.TestCase): + def setUp(self): + paddle.disable_static() + self.net = SimpleNet() + + def assert_default_name(self, func_name, input_names): + decorated_func = getattr(self.net, func_name) + + spec_names = [x.name for x in decorated_func.inputs] + self.assertListEqual(spec_names, input_names) + + def test_common_input(self): + self.assert_default_name('forward', ['x']) + + def test_list_input(self): + self.assert_default_name('func_with_list', ['l_0', 'l_1']) + + def test_dict_input(self): + self.assert_default_name('func_with_dict', ['x', 'y']) + + def test_nest_input(self): + self.assert_default_name('func_with_list_dict', ['dl_0', 'x', 'y']) + + class TestDeclarativeAPI(unittest.TestCase): def test_error(self): func = declarative(dyfunc_to_variable) @@ -295,5 +332,50 @@ class TestDeclarativeAPI(unittest.TestCase): func(np.ones(5).astype("int32")) +class TestDecorateModelDirectly(unittest.TestCase): + def setUp(self): + paddle.disable_static() + program_trans.enable(True) + self.x = to_variable(np.ones([4, 10]).astype('float32')) + + def test_fake_input(self): + net = SimpleNet() + net = declarative(net) + y = net(self.x) + self.assertTrue(len(net.forward.program_cache) == 1) + + def test_input_spec(self): + net = SimpleNet() + net = declarative(net, input_spec=[InputSpec([None, 8, 10])]) + self.assertTrue(len(net.forward.inputs) == 1) + self.assertTrue(len(net.forward.program_cache) == 1) + input_shape = net.forward.inputs[0].shape + self.assertListEqual(list(input_shape), [-1, 8, 10]) + + # redecorate + net = declarative(net, input_spec=[InputSpec([None, 16, 10])]) + input_shape = net.forward.inputs[0].shape + self.assertListEqual(list(input_shape), [-1, 16, 10]) + + +class TestErrorWithInitFromStaticMode(unittest.TestCase): + def test_raise_error(self): + # disable 
imperative + paddle.enable_static() + + net = SimpleNet() + with self.assertRaisesRegexp(RuntimeError, + "only available in dynamic mode"): + net.forward.concrete_program + + with self.assertRaisesRegexp(RuntimeError, + "only available in dynamic mode"): + net.forward.inputs + + with self.assertRaisesRegexp(RuntimeError, + "only available in dynamic mode"): + net.forward.outputs + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py index 586020d434519b12c6fff4cbba812a013cf45c3d..2998ba85757e7677d5f9ab39ff81682a8b315072 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py @@ -14,15 +14,15 @@ from __future__ import print_function +import os import inspect import unittest - import numpy as np +import paddle import paddle.fluid as fluid from paddle.fluid.core import EnforceNotMet -from paddle.fluid.dygraph.dygraph_to_static.error import ERROR_DATA, ErrorData +from paddle.fluid.dygraph.dygraph_to_static import error from paddle.fluid.dygraph.dygraph_to_static.origin_info import unwrap -from paddle.fluid.dygraph.jit import declarative def inner_func(): @@ -30,7 +30,7 @@ def inner_func(): return -@declarative +@paddle.jit.to_static def func_error_in_compile_time(x): x = fluid.dygraph.to_variable(x) inner_func() @@ -41,14 +41,14 @@ def func_error_in_compile_time(x): return x_v -@declarative +@paddle.jit.to_static def func_error_in_compile_time_2(x): x = fluid.dygraph.to_variable(x) x = fluid.layers.reshape(x, shape=[1, 2]) return x -@declarative +@paddle.jit.to_static def func_error_in_runtime(x, iter_num=3): x = fluid.dygraph.to_variable(x) two = fluid.layers.fill_constant(shape=[1], value=2, dtype="int32") @@ -61,6 +61,9 @@ class TestErrorInCompileTime(unittest.TestCase): self.set_func() self.set_input() self.set_exception_type() 
+ self.prog_trans = paddle.jit.ProgramTranslator() + self.simplify_error = 1 + self.disable_error = 0 def set_func(self): self.func = func_error_in_compile_time @@ -88,14 +91,38 @@ class TestErrorInCompileTime(unittest.TestCase): for m in self.expected_message: self.assertIn(m, error_message) - def test(self): - with fluid.dygraph.guard(): - with self.assertRaises(self.exception_type) as cm: - self.func(self.input) - exception = cm.exception - error_data = getattr(exception, ERROR_DATA) - self.assertIsInstance(error_data, ErrorData) - self._test_create_message(error_data) + def _test_attach_and_raise_new_exception(self, func_call): + paddle.disable_static() + with self.assertRaises(self.exception_type) as cm: + func_call() + exception = cm.exception + + error_data = getattr(exception, error.ERROR_DATA, None) + + self.assertIsInstance(error_data, error.ErrorData) + self._test_create_message(error_data) + + def test_static_layer_call(self): + # NOTE: self.func(self.input) is the StaticLayer().__call__(self.input) + call_dy2static = lambda: self.func(self.input) + + self.set_flags(0) + self._test_attach_and_raise_new_exception(call_dy2static) + + def test_program_translator_get_output(self): + call_dy2static = lambda : self.prog_trans.get_output(unwrap(self.func), self.input) + + self.set_flags(0) + self._test_attach_and_raise_new_exception(call_dy2static) + + def set_flags(self, disable_error=0, simplify_error=1): + os.environ[error.DISABLE_ERROR_ENV_NAME] = str(disable_error) + self.disable_error = int(os.getenv(error.DISABLE_ERROR_ENV_NAME, 0)) + self.assertEqual(self.disable_error, disable_error) + + os.environ[error.SIMPLIFY_ERROR_ENV_NAME] = str(simplify_error) + self.simplify_error = int(os.getenv(error.SIMPLIFY_ERROR_ENV_NAME, 1)) + self.assertEqual(self.simplify_error, simplify_error) class TestErrorInCompileTime2(TestErrorInCompileTime): @@ -143,5 +170,28 @@ class TestErrorInRuntime(TestErrorInCompileTime): self.assertIn(m, error_message) +@unwrap 
+@paddle.jit.to_static() +def func_decorated_by_other_1(): + return 1 + + +@paddle.jit.to_static() +@unwrap +def func_decorated_by_other_2(): + return 1 + + +class TestErrorInOther(unittest.TestCase): + def test(self): + paddle.disable_static() + prog_trans = paddle.jit.ProgramTranslator() + with self.assertRaises(NotImplementedError): + prog_trans.get_output(func_decorated_by_other_1) + + with self.assertRaises(NotImplementedError): + func_decorated_by_other_2() + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logging_utils.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logging_utils.py index 510b615654751500c33dc3311353ba7e2f8baf40..b8a18179742df108d44dbf527adba819eefe91cd 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logging_utils.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logging_utils.py @@ -56,8 +56,30 @@ class TestLoggingUtils(unittest.TestCase): with self.assertRaises(TypeError): paddle.jit.set_verbosity(3.3) - def test_code_level(self): + def test_also_to_stdout(self): + logging_utils._TRANSLATOR_LOGGER.need_to_echo_log_to_stdout = None + self.assertEqual( + logging_utils._TRANSLATOR_LOGGER.need_to_echo_log_to_stdout, False) + paddle.jit.set_verbosity(also_to_stdout=False) + self.assertEqual( + logging_utils._TRANSLATOR_LOGGER.need_to_echo_log_to_stdout, False) + + logging_utils._TRANSLATOR_LOGGER.need_to_echo_node_to_stdout = None + self.assertEqual( + logging_utils._TRANSLATOR_LOGGER.need_to_echo_code_to_stdout, False) + + paddle.jit.set_code_level(also_to_stdout=True) + self.assertEqual( + logging_utils._TRANSLATOR_LOGGER.need_to_echo_code_to_stdout, True) + + with self.assertRaises(AssertionError): + paddle.jit.set_verbosity(also_to_stdout=1) + + with self.assertRaises(AssertionError): + paddle.jit.set_code_level(also_to_stdout=1) + + def test_set_code_level(self): paddle.jit.set_code_level(None) 
os.environ[logging_utils.CODE_LEVEL_ENV_NAME] = '2' self.assertEqual(logging_utils.get_code_level(), 2) @@ -71,7 +93,25 @@ class TestLoggingUtils(unittest.TestCase): with self.assertRaises(TypeError): paddle.jit.set_code_level(3.3) - def test_log(self): + def test_log_api(self): + # test api for CI Converage + logging_utils.set_verbosity(1, True) + + logging_utils.warn("warn") + logging_utils.error("error") + + logging_utils.log(1, "log level 1") + logging_utils.log(2, "log level 2") + + source_code = "x = 3" + ast_code = gast.parse(source_code) + logging_utils.set_code_level(1, True) + logging_utils.log_transformed_code(1, ast_code, "TestTransformer") + logging_utils.set_code_level(logging_utils.LOG_AllTransformer, True) + logging_utils.log_transformed_code(logging_utils.LOG_AllTransformer, + ast_code, "TestTransformer") + + def test_log_message(self): stream = io.BytesIO() if six.PY2 else io.StringIO() log = self.translator_logger.logger stdout_handler = logging.StreamHandler(stream) @@ -84,13 +124,14 @@ class TestLoggingUtils(unittest.TestCase): if six.PY3: with mock.patch.object(sys, 'stdout', stream): + logging_utils.set_verbosity(1, False) logging_utils.warn(warn_msg) logging_utils.error(error_msg) - self.translator_logger.verbosity_level = 1 logging_utils.log(1, log_msg_1) logging_utils.log(2, log_msg_2) - result_msg = '\n'.join([warn_msg, error_msg, log_msg_1, ""]) + result_msg = '\n'.join( + [warn_msg, error_msg, "(Level 1) " + log_msg_1, ""]) self.assertEqual(result_msg, stream.getvalue()) def test_log_transformed_code(self): diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_origin_info.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_origin_info.py index b03777b6ebc7f3cceb73cd32e6fdfea11755320e..3f77e9ade285e2c3d8452ea2171505442ee52fb0 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_origin_info.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_origin_info.py @@ -14,6 +14,7 @@ 
from __future__ import print_function +import sys import unittest from paddle.fluid.dygraph.dygraph_to_static.ast_transformer import DygraphToStaticAst @@ -177,8 +178,20 @@ class TestOriginInfoWithDecoratedFunc(TestOriginInfo): def set_dygraph_info(self): self.line_num = 2 - self.line_index_list = [0, 2] - self.dy_rel_lineno_list = [0, 2] + + # NOTE(liym27): + # There are differences in ast_node.lineno between PY3.8+ and PY3.8-. + # If the first gast.FunctionDef has decorator, the lineno of gast.FunctionDef is differs. + # 1. < PY3.8 + # its lineno equals to the lineno of the first decorator node, which is not right. + # 2. >= PY3.8 + # its lineno is the actual lineno, which is right. + if sys.version_info >= (3, 8): + self.line_index_list = [1, 2] + self.dy_rel_lineno_list = [1, 2] + else: + self.line_index_list = [0, 2] + self.dy_rel_lineno_list = [0, 2] self.dy_abs_col_offset = [0, 4] self.dy_func_name = [self.dygraph_func.__name__] * self.line_num @@ -199,8 +212,13 @@ class TestOriginInfoWithDecoratedFunc2(TestOriginInfo): def set_dygraph_info(self): self.line_num = 2 - self.line_index_list = [0, 3] - self.dy_rel_lineno_list = [0, 3] + + if sys.version_info >= (3, 8): + self.line_index_list = [2, 3] + self.dy_rel_lineno_list = [2, 3] + else: + self.line_index_list = [0, 3] + self.dy_rel_lineno_list = [0, 3] self.dy_abs_col_offset = [0, 4] self.dy_func_name = [self.dygraph_func.__name__] * self.line_num diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py index f0fbe54f9dbbf93121655e784601467c13b3a70d..91067f360995e1661c200df923a698f3f146b71e 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_partial_program.py @@ -14,6 +14,7 @@ from __future__ import print_function import numpy as np +import paddle import paddle.fluid as fluid from 
paddle.fluid.layers.utils import flatten from paddle.fluid.dygraph import declarative, ProgramTranslator @@ -151,5 +152,33 @@ class TestWithTrainAndEval(unittest.TestCase): partial_layer._train_program) +class GPT2LMHeadModel(fluid.dygraph.Layer): + def __init__(self): + super(GPT2LMHeadModel, self).__init__() + self.embedding0 = paddle.nn.Embedding(20, 16) + self.embedding1 = paddle.nn.Embedding(20, 32) + self.lm_head_weight = paddle.to_tensor( + np.random.rand(2, 3).astype('float32')) + + @declarative + def forward(self, x): + x = fluid.layers.reshape(x, shape=[-1, 6]) + x1, x2, x3 = fluid.layers.split(input=x, dim=1, num_or_sections=3) + return x1 + + +class TestPruneUnusedParamInProgram(unittest.TestCase): + def test_prune(self): + input_ids = np.array([[15, 11, 6, 3, 18, 13]]).astype("float32") + + place = fluid.CPUPlace() + with fluid.dygraph.guard(place): + model = GPT2LMHeadModel() + model.eval() + input_ids = paddle.to_tensor(input_ids) + out = model(input_ids) + self.assertTrue(np.array_equal(out.numpy(), [[15, 11]])) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py index 873d9ecb53549e9d6a3982ca4528e63526bd3a0d..b0ab55758ee7d9eeb5a9bd747934e6f7a1992f7b 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py @@ -21,6 +21,7 @@ import numpy as np import textwrap import unittest +import paddle import paddle.fluid as fluid from paddle.fluid.dygraph.dygraph_to_static import ProgramTranslator from paddle.fluid.dygraph.jit import declarative @@ -279,5 +280,33 @@ class TestEnableDeclarative(unittest.TestCase): static_output.numpy(), dygraph_output.numpy(), atol=1e-4)) +class Net(fluid.dygraph.layers.Layer): + def __init__(self): + super(Net, self).__init__() + + def 
forward(self, x): + return x + 1 + + +class TestErrorWithInitFromStaticMode(unittest.TestCase): + def setUp(self): + self.program_translator = ProgramTranslator() + self.x = np.random.randn(10, 32).astype('float32') + + def test_raise_error(self): + # disable imperative + paddle.enable_static() + net = Net() + + self.program_translator.enable(True) + with self.assertRaisesRegexp(RuntimeError, + "only available in dynamic mode"): + self.program_translator.get_output(net.forward, self.x) + + with self.assertRaisesRegexp(RuntimeError, + "only available in dynamic mode"): + self.program_translator.get_program(net.forward, self.x) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_inference_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_inference_model.py index 6cf59c030c00384b225d5d13160f68a3558084b9..cf7708c675aa9c1fb8faf5f8585b458be88b6c83 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_inference_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_save_inference_model.py @@ -133,7 +133,7 @@ class TestPartialProgramRaiseError(unittest.TestCase): x = fluid.dygraph.to_variable(x_data) out = net(x) - program_cache = SimpleFcLayer.forward.program_cache + program_cache = net.forward.program_cache _, (concrete_program, _) = program_cache.last() params = concrete_program.parameters diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_yolov3.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_yolov3.py index 4453dff892fcaacd65ed5f1bdf81817db51c6fe1..6aa9156a0d4cb8e737f395d04521257ccb95559e 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_yolov3.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_yolov3.py @@ -17,12 +17,14 @@ import random import time import unittest +import paddle import paddle.fluid as fluid from paddle.fluid.dygraph import 
ProgramTranslator from paddle.fluid.dygraph import to_variable from yolov3 import cfg, YOLOv3 +paddle.enable_static() random.seed(0) np.random.seed(0) diff --git a/python/paddle/fluid/tests/unittests/hdfs_test_utils.py b/python/paddle/fluid/tests/unittests/hdfs_test_utils.py index 6a752bc3053d7d0672bd0002250252c3bbbfa1e1..766dcc39af1a55614b0c179a686dbaab5af1e349 100644 --- a/python/paddle/fluid/tests/unittests/hdfs_test_utils.py +++ b/python/paddle/fluid/tests/unittests/hdfs_test_utils.py @@ -19,7 +19,7 @@ from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet import os import sys -from paddle.distributed.fleet.utils import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError +from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError java_home = os.environ["JAVA_HOME"] diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_affine_channel_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_affine_channel_fuse_pass.py new file mode 100644 index 0000000000000000000000000000000000000000..ec0bd52e9261017335f0bf424d32f26d4c465029 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_affine_channel_fuse_pass.py @@ -0,0 +1,228 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import numpy as np +from inference_pass_test import InferencePassTest +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.core import PassVersionChecker + + +class ConvAffineChannelFusePassExplicitPaddingTest(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=[-1, 3, 64, 64], dtype="float32") + conv_out = fluid.layers.conv2d( + input=data, + num_filters=3, + filter_size=3, + groups=3, + padding=[1, 1, 1, 1], + bias_attr=False, + act=None) + input_scale = fluid.layers.create_parameter( + shape=[3], dtype="float32") + input_bias = fluid.layers.create_parameter( + shape=[3], dtype="float32") + ac_out = fluid.layers.affine_channel( + x=conv_out, scale=input_scale, bias=input_bias) + + self.feeds = { + "data": np.random.random([1, 3, 64, 64]).astype("float32"), + } + self.fetch_list = [ac_out] + + def test_check_output(self): + self.check_output() + + self.assertTrue( + PassVersionChecker.IsCompatible('conv_affine_channel_fuse_pass')) + + +class ConvAffineChannelFusePassValidPaddingTest(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=[-1, 3, 64, 64], dtype="float32") + conv_out = fluid.layers.conv2d( + input=data, + num_filters=3, + filter_size=3, + groups=3, + padding='VALID', + bias_attr=False, + act=None) + input_scale = fluid.layers.create_parameter( + shape=[3], dtype="float32") + input_bias = fluid.layers.create_parameter( + shape=[3], dtype="float32") + ac_out = fluid.layers.affine_channel( + x=conv_out, scale=input_scale, bias=input_bias) + + self.feeds = { + "data": np.random.random([1, 3, 64, 64]).astype("float32"), + } + self.fetch_list = [ac_out] + + def test_check_output(self): + self.check_output() + + self.assertTrue( + PassVersionChecker.IsCompatible('conv_affine_channel_fuse_pass')) + + +class 
ConvAffineChannelFusePassSamePaddingTest(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=[-1, 3, 64, 64], dtype="float32") + conv_out = fluid.layers.conv2d( + input=data, + num_filters=3, + filter_size=3, + groups=3, + padding='SAME', + bias_attr=False, + act=None) + input_scale = fluid.layers.create_parameter( + shape=[3], dtype="float32") + input_bias = fluid.layers.create_parameter( + shape=[3], dtype="float32") + ac_out = fluid.layers.affine_channel( + x=conv_out, scale=input_scale, bias=input_bias) + + self.feeds = { + "data": np.random.random([1, 3, 64, 64]).astype("float32"), + } + self.fetch_list = [ac_out] + + def test_check_output(self): + self.check_output() + + self.assertTrue( + PassVersionChecker.IsCompatible('conv_affine_channel_fuse_pass')) + + +class ConvEltwiseAddAffineChannelFusePassExplicitPaddingTest(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=[-1, 3, 64, 64], dtype="float32") + param_attr = fluid.ParamAttr( + initializer=fluid.initializer.Xavier(uniform=False), + learning_rate=0.001) + conv_out = fluid.layers.conv2d( + input=data, + num_filters=3, + filter_size=3, + groups=3, + padding=[1, 1, 1, 1], + bias_attr=param_attr, + act=None) + input_scale = fluid.layers.create_parameter( + shape=[3], dtype="float32") + input_bias = fluid.layers.create_parameter( + shape=[3], dtype="float32") + ac_out = fluid.layers.affine_channel( + x=conv_out, scale=input_scale, bias=input_bias) + + self.feeds = { + "data": np.random.random([1, 3, 64, 64]).astype("float32"), + } + self.fetch_list = [ac_out] + + def test_check_output(self): + self.check_output() + + self.assertTrue( + PassVersionChecker.IsCompatible( + 'conv_eltwiseadd_affine_channel_fuse_pass')) + + +class ConvEltwiseAddAffineChannelFusePassValidPaddingTest(InferencePassTest): + def setUp(self): 
+ with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=[-1, 3, 64, 64], dtype="float32") + param_attr = fluid.ParamAttr( + initializer=fluid.initializer.Xavier(uniform=False), + learning_rate=0.001) + conv_out = fluid.layers.conv2d( + input=data, + num_filters=3, + filter_size=3, + groups=3, + padding='VALID', + bias_attr=param_attr, + act=None) + input_scale = fluid.layers.create_parameter( + shape=[3], dtype="float32") + input_bias = fluid.layers.create_parameter( + shape=[3], dtype="float32") + ac_out = fluid.layers.affine_channel( + x=conv_out, scale=input_scale, bias=input_bias) + + self.feeds = { + "data": np.random.random([1, 3, 64, 64]).astype("float32"), + } + self.fetch_list = [ac_out] + + def test_check_output(self): + self.check_output() + + self.assertTrue( + PassVersionChecker.IsCompatible( + 'conv_eltwiseadd_affine_channel_fuse_pass')) + + +class ConvEltwiseAddAffineChannelFusePassSamePaddingTest(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=[-1, 3, 64, 64], dtype="float32") + param_attr = fluid.ParamAttr( + initializer=fluid.initializer.Xavier(uniform=False), + learning_rate=0.001) + conv_out = fluid.layers.conv2d( + input=data, + num_filters=3, + filter_size=3, + groups=3, + padding='Same', + bias_attr=param_attr, + act=None) + input_scale = fluid.layers.create_parameter( + shape=[3], dtype="float32") + input_bias = fluid.layers.create_parameter( + shape=[3], dtype="float32") + ac_out = fluid.layers.affine_channel( + x=conv_out, scale=input_scale, bias=input_bias) + + self.feeds = { + "data": np.random.random([1, 3, 64, 64]).astype("float32"), + } + self.fetch_list = [ac_out] + + def test_check_output(self): + self.check_output() + + self.assertTrue( + PassVersionChecker.IsCompatible( + 'conv_eltwiseadd_affine_channel_fuse_pass')) + + +if __name__ == "__main__": + unittest.main() diff --git 
a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bias_mkldnn_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bias_mkldnn_fuse_pass.py new file mode 100644 index 0000000000000000000000000000000000000000..5eb397b5a95b240dcaff9dee3758646b35ab5022 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bias_mkldnn_fuse_pass.py @@ -0,0 +1,171 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +from inference_pass_test import InferencePassTest +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.core import AnalysisConfig +"""Test for fusion of conv and bias.""" + + +#padding SAME +class ConvBiasMkldnnFusePassTest(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=[-1, 3, 100, 100], dtype="float32") + param_attr = fluid.ParamAttr( + initializer=fluid.initializer.Xavier(uniform=False), + learning_rate=0.001) + conv_out = fluid.layers.conv2d( + input=data, + num_filters=3, + filter_size=3, + padding="SAME", + bias_attr=param_attr) + + self.feeds = { + "data": np.random.random((1, 3, 100, 100)).astype("float32") + } + self.fetch_list = [conv_out] + self.enable_mkldnn = True + + def test_check_output(self): + use_gpu = False + self.check_output_with_option(use_gpu) + + +#padding VALID +class ConvBiasMkldnnFusePassTest1(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=[-1, 3, 100, 100], dtype="float32") + param_attr = fluid.ParamAttr( + initializer=fluid.initializer.Xavier(uniform=False), + learning_rate=0.001) + conv_out = fluid.layers.conv2d( + input=data, + num_filters=3, + filter_size=3, + padding="VALID", + bias_attr=param_attr) + + self.feeds = { + "data": np.random.random((1, 3, 100, 100)).astype("float32") + } + self.fetch_list = [conv_out] + self.enable_mkldnn = True + + def test_check_output(self): + use_gpu = False + self.check_output_with_option(use_gpu) + + +#padding number +class ConvBiasMkldnnFusePassTest2(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=[-1, 3, 100, 100], dtype="float32") + param_attr = fluid.ParamAttr( + 
initializer=fluid.initializer.Xavier(uniform=False), + learning_rate=0.001) + conv_out = fluid.layers.conv2d( + input=data, + num_filters=3, + filter_size=3, + padding=[2, 4, 6, 8], + bias_attr=param_attr) + + self.feeds = { + "data": np.random.random((1, 3, 100, 100)).astype("float32") + } + self.fetch_list = [conv_out] + self.enable_mkldnn = True + + def test_check_output(self): + use_gpu = False + self.check_output_with_option(use_gpu) + + +#dilation not supported yet, just print warning log and does not fuse +class ConvBiasMkldnnFusePassTest3(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=[-1, 3, 100, 100], dtype="float32") + param_attr = fluid.ParamAttr( + initializer=fluid.initializer.Xavier(uniform=False), + learning_rate=0.001) + conv_out = fluid.layers.conv2d( + input=data, + num_filters=3, + filter_size=3, + padding="VALID", + dilation=2, + groups=3, + bias_attr=param_attr, + use_cudnn=False, + act="softmax", + data_format="NCHW") + + self.feeds = { + "data": np.random.random((1, 3, 100, 100)).astype("float32") + } + self.fetch_list = [conv_out] + self.enable_mkldnn = True + + def test_check_output(self): + use_gpu = False + self.check_output_with_option(use_gpu) + + +#all conv params except for dilation +class ConvBiasMkldnnFusePassTest4(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=[-1, 3, 100, 100], dtype="float32") + param_attr = fluid.ParamAttr( + initializer=fluid.initializer.Xavier(uniform=False), + learning_rate=0.001) + conv_out = fluid.layers.conv2d( + input=data, + num_filters=3, + filter_size=3, + padding="VALID", + groups=3, + bias_attr=param_attr, + use_cudnn=False, + act="softmax", + data_format="NCHW") + + self.feeds = { + "data": np.random.random((1, 3, 100, 100)).astype("float32") + } + self.fetch_list = [conv_out] + 
self.enable_mkldnn = True + + def test_check_output(self): + use_gpu = False + self.check_output_with_option(use_gpu) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bn_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bn_fuse_pass.py new file mode 100644 index 0000000000000000000000000000000000000000..ffe177e59d86e4451b3da1e40fd62e1d398b03b2 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bn_fuse_pass.py @@ -0,0 +1,177 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import numpy as np +from inference_pass_test import InferencePassTest +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.core import PassVersionChecker + + +class ConvBnFusePassExplicitPaddingTest(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=[-1, 3, 64, 64], dtype="float32") + conv_out = fluid.layers.conv2d( + input=data, + num_filters=6, + filter_size=6, + groups=3, + padding=[1, 1, 1, 1], + bias_attr=False, + act=None) + bn_out = fluid.layers.batch_norm(conv_out, is_test=True) + + self.feeds = { + "data": np.random.random([1, 3, 64, 64]).astype("float32"), + } + self.fetch_list = [bn_out] + + def test_check_output(self): + self.check_output() + self.assertTrue(PassVersionChecker.IsCompatible('conv_bn_fuse_pass')) + + +class ConvBnFusePassValidPaddingTest(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=[-1, 3, 64, 64], dtype="float32") + conv_out = fluid.layers.conv2d( + input=data, + num_filters=6, + filter_size=6, + groups=3, + padding='VALID', + bias_attr=False, + act=None) + bn_out = fluid.layers.batch_norm(conv_out, is_test=True) + + self.feeds = { + "data": np.random.random([1, 3, 64, 64]).astype("float32"), + } + self.fetch_list = [bn_out] + + def test_check_output(self): + self.check_output() + self.assertTrue(PassVersionChecker.IsCompatible('conv_bn_fuse_pass')) + + +class ConvBnFusePassSamePaddingTest(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=[-1, 3, 64, 64], dtype="float32") + conv_out = fluid.layers.conv2d( + input=data, + num_filters=6, + filter_size=6, + groups=3, + padding='SAME', + bias_attr=False, + act=None) + bn_out = fluid.layers.batch_norm(conv_out, is_test=True) + + self.feeds = { + 
"data": np.random.random([1, 3, 64, 64]).astype("float32"), + } + self.fetch_list = [bn_out] + + def test_check_output(self): + self.check_output() + self.assertTrue(PassVersionChecker.IsCompatible('conv_bn_fuse_pass')) + + +class ConvEltwiseAddBnFuseExplicitPaddingPass(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=[-1, 3, 64, 64], dtype="float32") + conv_out = fluid.layers.conv2d( + input=data, + num_filters=6, + filter_size=6, + groups=3, + padding=[1, 1, 1, 1], + bias_attr=None, + act=None) + bn_out = fluid.layers.batch_norm(conv_out, is_test=True) + + self.feeds = { + "data": np.random.random([1, 3, 64, 64]).astype("float32"), + } + self.fetch_list = [bn_out] + + def test_check_output(self): + self.check_output() + self.assertTrue( + PassVersionChecker.IsCompatible('conv_eltwiseadd_bn_fuse_pass')) + + +class ConvEltwiseAddBnFuseValidPaddingPass(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=[-1, 3, 64, 64], dtype="float32") + conv_out = fluid.layers.conv2d( + input=data, + num_filters=6, + filter_size=6, + groups=3, + padding='VALID', + bias_attr=None, + act=None) + bn_out = fluid.layers.batch_norm(conv_out, is_test=True) + + self.feeds = { + "data": np.random.random([1, 3, 64, 64]).astype("float32"), + } + self.fetch_list = [bn_out] + + def test_check_output(self): + self.check_output() + self.assertTrue( + PassVersionChecker.IsCompatible('conv_eltwiseadd_bn_fuse_pass')) + + +class ConvEltwiseAddBnFuseSamePaddingPass(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=[-1, 3, 64, 64], dtype="float32") + conv_out = fluid.layers.conv2d( + input=data, + num_filters=6, + filter_size=6, + groups=3, + padding='SAME', + bias_attr=None, + act=None) + bn_out = 
fluid.layers.batch_norm(conv_out, is_test=True) + + self.feeds = { + "data": np.random.random([1, 3, 64, 64]).astype("float32"), + } + self.fetch_list = [bn_out] + + def test_check_output(self): + self.check_output() + self.assertTrue( + PassVersionChecker.IsCompatible('conv_eltwiseadd_bn_fuse_pass')) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add2_act_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add2_act_fuse_pass.py index d6dbd397b90368d5cac27c3c5d92b7a7dce9dcf5..6907b6a7eb50eb90224af678c0ae6d053e05ac9b 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add2_act_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add2_act_fuse_pass.py @@ -19,6 +19,7 @@ import numpy as np from inference_pass_test import InferencePassTest import paddle.fluid as fluid import paddle.fluid.core as core +from paddle.fluid.core import PassVersionChecker from paddle.fluid.core import AnalysisConfig """Test for fusion of conv, elementwise_add and 2 act.""" @@ -46,6 +47,9 @@ class ConvElementwiseAdd2ActFusePassTest(InferencePassTest): if core.is_compiled_with_cuda(): use_gpu = True self.check_output_with_option(use_gpu) + self.assertTrue( + PassVersionChecker.IsCompatible( + 'conv_elementwise_add2_act_fuse_pass')) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_act_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_act_fuse_pass.py index 2e9035420d7ee45dd69de8d3cd8acc9bb1590c72..6ff60aa6debf8968e1f674e3c7aff822dfde075c 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_act_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_act_fuse_pass.py @@ -19,6 +19,7 @@ import numpy as np from inference_pass_test import 
InferencePassTest import paddle.fluid as fluid import paddle.fluid.core as core +from paddle.fluid.core import PassVersionChecker from paddle.fluid.core import AnalysisConfig """Test for fusion of conv, elementwise_add and act.""" @@ -48,6 +49,9 @@ class ConvElementwiseAddActFusePassTest(InferencePassTest): if core.is_compiled_with_cuda(): use_gpu = True self.check_output_with_option(use_gpu) + self.assertTrue( + PassVersionChecker.IsCompatible( + 'conv_elementwise_add_act_fuse_pass')) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_fuse_pass.py index 7c4e0d6e76ec45c4b75ba91522306a4dd0abc7c5..96b046edaec49038d6c8e137494c13bd7484cb7f 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_elementwise_add_fuse_pass.py @@ -19,6 +19,7 @@ import numpy as np from inference_pass_test import InferencePassTest import paddle.fluid as fluid import paddle.fluid.core as core +from paddle.fluid.core import PassVersionChecker from paddle.fluid.core import AnalysisConfig """Test for fusion of conv and elementwise_add.""" @@ -44,6 +45,8 @@ class ConvElementwiseAddFusePassTest(InferencePassTest): if core.is_compiled_with_cuda(): use_gpu = True self.check_output_with_option(use_gpu) + self.assertTrue( + PassVersionChecker.IsCompatible('conv_elementwise_add_fuse_pass')) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_fc_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_fc_fuse_pass.py new file mode 100644 index 0000000000000000000000000000000000000000..a62adcea3f94379aa81643e26a7df53ab92fe676 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_fc_fuse_pass.py @@ -0,0 +1,54 @@ +# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from inference_pass_test import InferencePassTest +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.core import AnalysisConfig +from paddle.fluid.core import PassVersionChecker + + +class FcFusePassTest(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=[-1, 128, 768], dtype="float32") + data_y = fluid.data(name="y", shape=[-1, 128, 768], dtype="float32") + fc_out1 = fluid.layers.fc(input=data, + size=3072, + num_flatten_dims=2, + act="relu") + fc_out2 = fluid.layers.fc(input=fc_out1, + size=768, + num_flatten_dims=2) + + self.feeds = {"data": np.random.random((4, 128, 768)).astype("float32")} + self.fetch_list = [fc_out2] + + def test_check_output(self): + use_gpu = [False] + if core.is_compiled_with_cuda(): + use_gpu.append(True) + for i in range(len(use_gpu)): + self.check_output_with_option(use_gpu[i]) + + self.assertTrue(PassVersionChecker.IsCompatible('fc_fuse_pass')) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_fc_gru_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_fc_gru_fuse_pass.py new file mode 100644 index 0000000000000000000000000000000000000000..f7b43470d402f8671091365db04237797a012e78 
--- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_fc_gru_fuse_pass.py @@ -0,0 +1,86 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +from inference_pass_test import InferencePassTest +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.core import PassVersionChecker + + +class FcGruFusePassTest(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + dict_dim, emb_dim = 128, 64 + data = fluid.data( + name='step_data', shape=[None], dtype='int64', lod_level=1) + emb = fluid.embedding(input=data, size=[dict_dim, emb_dim]) + hidden_dim = 512 + x = fluid.layers.fc(input=emb, size=hidden_dim * 3) + hidden = fluid.layers.dynamic_gru( + input=x, + size=hidden_dim, + bias_attr=True, + origin_mode=False, + is_reverse=True) + + batch = 16 + lod_tensor = fluid.LoDTensor() + lod_tensor.set(np.random.randint( + 0, dict_dim, size=[batch]).astype("int64"), + fluid.CPUPlace()) + lod_tensor.set_lod([[0, batch]]) + self.feeds = {"step_data": lod_tensor} + self.fetch_list = [hidden] + + def test_check_output(self): + use_gpu = False + self.check_output_with_option(use_gpu) + self.assertTrue(PassVersionChecker.IsCompatible('fc_gru_fuse_pass')) + + +class MulGruFusePassTest(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): 
+ dict_dim, emb_dim = 128, 64 + data = fluid.data( + name='step_data', shape=[None], dtype='int64', lod_level=1) + emb = fluid.embedding(input=data, size=[dict_dim, emb_dim]) + hidden_dim = 512 + x = fluid.layers.fc(input=emb, size=hidden_dim * 3, bias_attr=False) + hidden = fluid.layers.dynamic_gru( + input=x, + size=hidden_dim, + bias_attr=True, + origin_mode=False, + is_reverse=True) + + batch = 16 + lod_tensor = fluid.LoDTensor() + lod_tensor.set(np.random.randint( + 0, dict_dim, size=[batch]).astype("int64"), + fluid.CPUPlace()) + lod_tensor.set_lod([[0, batch]]) + self.feeds = {"step_data": lod_tensor} + self.fetch_list = [hidden] + + def test_check_output(self): + use_gpu = False + self.check_output_with_option(use_gpu) + self.assertTrue(PassVersionChecker.IsCompatible('mul_gru_fuse_pass')) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_fc_lstm_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_fc_lstm_fuse_pass.py new file mode 100644 index 0000000000000000000000000000000000000000..fbb4373dae2c44148a5ac6b65c11a3d47adfd1a1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_fc_lstm_fuse_pass.py @@ -0,0 +1,52 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import numpy as np +from inference_pass_test import InferencePassTest +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.core import PassVersionChecker + + +class MulLstmFusePassTest(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + dict_dim, emb_dim = 128, 64 + hidden_dim = 512 + + data = fluid.data( + name='data', shape=[1], dtype='int64', lod_level=1) + emb = fluid.embedding(input=data, size=[dict_dim, emb_dim]) + x = fluid.layers.fc(input=emb, size=hidden_dim * 4, bias_attr=False) + forward, cell = fluid.layers.dynamic_lstm( + input=x, size=hidden_dim * 4) + + batch = 16 + lod_tensor = fluid.LoDTensor() + lod_tensor.set(np.random.randint( + 0, dict_dim, size=[batch]).astype("int64"), + fluid.CPUPlace()) + lod_tensor.set_lod([[0, batch]]) + self.feeds = {"data": lod_tensor} + self.fetch_list = [forward, cell] + + def test_check_output(self): + use_gpu = False + self.check_output_with_option(use_gpu) + self.assertTrue(PassVersionChecker.IsCompatible('mul_lstm_fuse_pass')) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_repeated_fc_relu_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_repeated_fc_relu_fuse_pass.py new file mode 100644 index 0000000000000000000000000000000000000000..c78884480dab38446a20c456aed060eff847a9ed --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_repeated_fc_relu_fuse_pass.py @@ -0,0 +1,94 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +from inference_pass_test import InferencePassTest +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.core import PassVersionChecker + + +class RepeatedFcReluFusePass3Test(InferencePassTest): + def setUp(self): + fc_num = 3 + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=[-1, 3, 64, 64], dtype="float32") + param_attr = fluid.ParamAttr( + initializer=fluid.initializer.Xavier(uniform=False), + learning_rate=0.001) + conv_out = fluid.layers.conv2d( + input=data, + num_filters=3, + filter_size=3, + bias_attr=param_attr, + act=None) + fc_outs = [] + fc_outs.append( + fluid.layers.fc(input=[conv_out], act="relu", size=1000)) + for i in range(1, fc_num): + fc_outs.append( + fluid.layers.fc( + input=[fc_outs[i - 1]], act="relu", size=1000)) + self.feeds = { + "data": np.random.random([1, 3, 64, 64]).astype("float32"), + } + self.fetch_list = [fc_outs[fc_num - 1]] + + def test_check_output(self): + use_gpu = False + self.check_output_with_option(use_gpu) + + self.assertTrue( + PassVersionChecker.IsCompatible('repeated_fc_relu_fuse_pass')) + + +class RepeatedFcReluFusePass9Test(InferencePassTest): + def setUp(self): + fc_num = 9 + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=[-1, 3, 64, 64], dtype="float32") + param_attr = fluid.ParamAttr( + initializer=fluid.initializer.Xavier(uniform=False), + learning_rate=0.001) + conv_out = fluid.layers.conv2d( + input=data, + 
num_filters=3, + filter_size=3, + bias_attr=param_attr, + act=None) + fc_outs = [] + fc_outs.append( + fluid.layers.fc(input=[conv_out], act="relu", size=1000)) + for i in range(1, fc_num): + fc_outs.append( + fluid.layers.fc( + input=[fc_outs[i - 1]], act="relu", size=1000)) + self.feeds = { + "data": np.random.random([1, 3, 64, 64]).astype("float32"), + } + self.fetch_list = [fc_outs[fc_num - 1]] + + def test_check_output(self): + use_gpu = False + self.check_output_with_option(use_gpu) + + self.assertTrue( + PassVersionChecker.IsCompatible('repeated_fc_relu_fuse_pass')) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_seqconv_eltadd_relu_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_seqconv_eltadd_relu_fuse_pass.py new file mode 100644 index 0000000000000000000000000000000000000000..eadda5ba06a79f061bcf87f9b0bf2c0770c763f5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_seqconv_eltadd_relu_fuse_pass.py @@ -0,0 +1,140 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +from inference_pass_test import InferencePassTest +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.core import AnalysisConfig +from paddle.fluid.core import PassVersionChecker + + +class SeqconvEltaddReluFusePassTest(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data(name="data", shape=[100, 100], dtype="float32") + param_attr = fluid.ParamAttr( + initializer=fluid.initializer.Xavier(uniform=False), + learning_rate=0.001) + conv_out = fluid.layers.sequence_conv( + input=data, + num_filters=16, + filter_size=4, + padding_start=0, + act="relu", + bias_attr=param_attr) + + np_data = np.random.random((80, 100)).astype('float32') + x_lod_tensor = fluid.create_lod_tensor(np_data, [[10, 20, 30, 20]], + fluid.CPUPlace()) + self.feeds = {"data": x_lod_tensor} + self.fetch_list = [conv_out] + self.enable_mkldnn = True + + def test_check_output(self): + self.check_output() + self.assertTrue( + PassVersionChecker.IsCompatible('seqconv_eltadd_relu_fuse_pass')) + + +class SeqconvEltaddReluFusePassTestPaddingStartPositive(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data(name="data", shape=[-1, 4], dtype="float32") + param_attr = fluid.ParamAttr( + initializer=fluid.initializer.Xavier(uniform=False), + learning_rate=0.001) + conv_out = fluid.layers.sequence_conv( + input=data, + num_filters=16, + filter_size=3, + padding_start=2, + act="relu", + bias_attr=param_attr) + + np_data = np.array([[1, 1, 1, 1], [2, 2, 2, 2], [3, 3, 3, 3], + [4, 4, 4, 4], [5, 5, 5, 5], [6, 6, 6, 6], + [7, 7, 7, 7]]).astype('float32') + x_lod_tensor = fluid.create_lod_tensor(np_data, [[5, 2]], + fluid.CPUPlace()) + self.feeds = {"data": x_lod_tensor} + self.fetch_list = [conv_out] + self.enable_mkldnn = True + + def test_check_output(self): 
+ self.check_output() + self.assertTrue( + PassVersionChecker.IsCompatible('seqconv_eltadd_relu_fuse_pass')) + + +class SeqconvEltaddReluFusePassTestPaddingStartNegative(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data(name="data", shape=[100, 100], dtype="float32") + param_attr = fluid.ParamAttr( + initializer=fluid.initializer.Xavier(uniform=False), + learning_rate=0.001) + conv_out = fluid.layers.sequence_conv( + input=data, + num_filters=16, + filter_size=4, + padding_start=-1, + act="relu", + bias_attr=param_attr) + + np_data = np.random.random((80, 100)).astype('float32') + x_lod_tensor = fluid.create_lod_tensor(np_data, [[10, 20, 30, 20]], + fluid.CPUPlace()) + self.feeds = {"data": x_lod_tensor} + self.fetch_list = [conv_out] + self.enable_mkldnn = True + + def test_check_output(self): + self.check_output() + self.assertTrue( + PassVersionChecker.IsCompatible('seqconv_eltadd_relu_fuse_pass')) + + +class SeqconvEltaddReluFusePassTestPaddingStartNone(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data(name="data", shape=[100, 100], dtype="float32") + param_attr = fluid.ParamAttr( + initializer=fluid.initializer.Xavier(uniform=False), + learning_rate=0.001) + conv_out = fluid.layers.sequence_conv( + input=data, + num_filters=16, + filter_size=4, + act="relu", + bias_attr=param_attr) + + np_data = np.random.random((80, 100)).astype('float32') + x_lod_tensor = fluid.create_lod_tensor(np_data, [[10, 20, 30, 20]], + fluid.CPUPlace()) + self.feeds = {"data": x_lod_tensor} + self.fetch_list = [conv_out] + self.enable_mkldnn = True + + def test_check_output(self): + self.check_output() + self.assertTrue( + PassVersionChecker.IsCompatible('seqconv_eltadd_relu_fuse_pass')) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_squared_mat_sub_fuse_pass.py 
b/python/paddle/fluid/tests/unittests/ir/inference/test_squared_mat_sub_fuse_pass.py new file mode 100644 index 0000000000000000000000000000000000000000..5fa242df4e412fb9c2f3af08b3a186c3e086f2d6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_squared_mat_sub_fuse_pass.py @@ -0,0 +1,63 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from inference_pass_test import InferencePassTest +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.core import AnalysisConfig +from paddle.fluid.core import PassVersionChecker + + +class SquaredMatSubFusePassTest(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data_a = fluid.data(name="data_a", shape=[128, 1], dtype="float32") + data_b = fluid.data(name="data_b", shape=[256, 1], dtype="float32") + + fc_a = fluid.layers.fc(data_a, size=256) + fc_b = fluid.layers.fc(data_b, size=64) + + data_a_square = paddle.square(fc_a) + data_b_square = paddle.square(fc_b) + + matmul_ab = paddle.matmul(fc_a, fc_b) + matmul_ab_square = paddle.square(matmul_ab) + matmul_square_ab = paddle.matmul(data_a_square, data_b_square) + + scale = paddle.fill_constant(shape=[1], value=0.5, dtype='float32') + + sub_val = paddle.elementwise_sub(matmul_ab_square, matmul_square_ab) + 
squared_mat_sub_out = fluid.layers.elementwise_mul(sub_val, scale) + + self.feeds = { + "data_a": np.random.random((128, 1)).astype("float32"), + "data_b": np.random.random((256, 1)).astype("float32") + } + self.fetch_list = [squared_mat_sub_out] + + def test_check_output(self): + use_gpu = False + self.check_output_with_option(use_gpu) + + self.assertTrue( + PassVersionChecker.IsCompatible('squared_mat_sub_fuse_pass')) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py index dfcd1758db2b22b211f84be528739aa71132ab8a..83d4b7091cb3276ba8e2c1ff9fd7dca9b1692c63 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py @@ -17,6 +17,7 @@ import numpy as np from inference_pass_test import InferencePassTest import paddle.fluid as fluid import paddle.fluid.core as core +from paddle.fluid.core import PassVersionChecker class TransposeFlattenConcatFusePassTest(InferencePassTest): @@ -45,6 +46,39 @@ class TransposeFlattenConcatFusePassTest(InferencePassTest): use_gpu = True self.check_output_with_option(use_gpu) + PassVersionChecker.IsCompatible('transpose_flatten_concat_fuse_pass') + + +class TransposeFlattenConcatFusePassWithAxisTest(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data1 = fluid.data(name="data1", shape=[5, 5, 5], dtype="float32") + data2 = fluid.data(name="data2", shape=[5, 5, 5], dtype="float32") + trans1 = fluid.layers.transpose(data1, perm=[2, 1, 0]) + trans2 = fluid.layers.transpose(data2, perm=[2, 1, 0]) + flatt1 = fluid.layers.flatten(trans1, axis=2) + flatt2 = fluid.layers.flatten(trans2, axis=2) + concat_out = fluid.layers.concat([flatt1, flatt2], 
axis=1) + # There is no parameters for above structure. + # Hence, append a batch_norm to avoid failure caused by load_combined. + out = fluid.layers.batch_norm(concat_out, is_test=True) + + self.feeds = { + "data1": np.random.random([5, 5, 5]).astype("float32"), + "data2": np.random.random([5, 5, 5]).astype("float32") + } + self.fetch_list = [out] + + def test_check_output(self): + # There is no cpu pass for transpose_flatten_concat_fuse + if core.is_compiled_with_cuda(): + use_gpu = True + self.check_output_with_option(use_gpu) + + self.assertTrue( + PassVersionChecker.IsCompatible( + 'transpose_flatten_concat_fuse_pass')) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pad_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pad_op.py new file mode 100644 index 0000000000000000000000000000000000000000..060f6c6c5f0446661e886390637714ad7dfc300d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_pad_op.py @@ -0,0 +1,53 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +from inference_pass_test import InferencePassTest +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.core import AnalysisConfig + + +class PadOpTRTTest(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=[1, 3, 128, 128], dtype="float32") + pad_out = fluid.layers.pad(x=data, + paddings=[0, 0, 0, 0, 0, 1, 1, 2], + pad_value=0.0) + out = fluid.layers.batch_norm(pad_out, is_test=True) + + self.feeds = { + "data": np.random.random((1, 3, 128, 128)).astype("float32") + } + self.enable_trt = True + self.trt_parameters = PadOpTRTTest.TensorRTParam( + 1 << 30, 32, 1, AnalysisConfig.Precision.Float32, False, False) + self.fetch_list = [out] + + def test_check_output(self): + use_gpu = [False] + if core.is_compiled_with_cuda(): + use_gpu.append(True) + + for i in range(len(use_gpu)): + self.check_output_with_option(use_gpu[i]) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_shuffle_channel_detect_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_shuffle_channel_detect_pass.py new file mode 100644 index 0000000000000000000000000000000000000000..e9c304496afcc694e47924b9cf0fbd168d08c665 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_shuffle_channel_detect_pass.py @@ -0,0 +1,51 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +from inference_pass_test import InferencePassTest +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.core import PassVersionChecker +from paddle.fluid.core import AnalysisConfig + + +class ShuffleChannelFuseTRTPassTest(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=[-1, 6, 64, 64], dtype="float32") + reshape1 = fluid.layers.reshape(x=data, shape=[-1, 2, 3, 64, 64]) + trans = fluid.layers.transpose(x=reshape1, perm=[0, 2, 1, 3, 4]) + reshape2 = fluid.layers.reshape(x=trans, shape=[-1, 6, 64, 64]) + out = fluid.layers.batch_norm(reshape2, is_test=True) + + self.feeds = { + "data": np.random.random([1, 6, 64, 64]).astype("float32"), + } + self.enable_trt = True + self.trt_parameters = ShuffleChannelFuseTRTPassTest.TensorRTParam( + 1 << 30, 32, 1, AnalysisConfig.Precision.Float32, False, False) + self.fetch_list = [out] + + def test_check_output(self): + + self.check_output() + + self.assertTrue( + PassVersionChecker.IsCompatible('shuffle_channel_detect_pass')) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_slice_plugin.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_slice_plugin.py new file mode 100644 index 0000000000000000000000000000000000000000..660a9c93e66715f41e4a972ff571c0c00f31316f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_slice_plugin.py @@ -0,0 +1,150 @@ 
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from inference_pass_test import InferencePassTest +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.core import AnalysisConfig + + +#normal starts && ends +class SlicePluginTRTTest1(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data(name="data", shape=[3, 3, 3, 3], dtype="float32") + axes = [1, 3] + starts = [0, 1] + ends = [2, 3] + slice_out = fluid.layers.slice( + data, axes=axes, starts=starts, ends=ends) + out = fluid.layers.batch_norm(slice_out, is_test=True) + + self.feeds = { + "data": np.random.random((3, 3, 3, 3)).astype("float32"), + } + # Diff occurred between GPU and TRT. + # In order to provide TRT CI ASAP, this test for trt part + # is disabled temporarily. 
+ self.enable_trt = True + self.trt_parameters = SlicePluginTRTTest1.TensorRTParam( + 1 << 30, 32, 1, AnalysisConfig.Precision.Float32, False, False) + self.fetch_list = [out] + + def test_check_output(self): + use_gpu = [False] + if core.is_compiled_with_cuda(): + use_gpu.append(True) + for i in range(len(use_gpu)): + self.check_output_with_option(use_gpu[i]) + + +#negative starts && ends +class SlicePluginTRTTest2(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data(name="data", shape=[3, 3, 3, 3], dtype="float32") + axes = [2, 3] + starts = [-3, -2] + ends = [-1, 3] + slice_out = fluid.layers.slice( + data, axes=axes, starts=starts, ends=ends) + out = fluid.layers.batch_norm(slice_out, is_test=True) + + self.feeds = { + "data": np.random.random((3, 3, 3, 3)).astype("float32"), + } + # Diff occurred between GPU and TRT. + # In order to provide TRT CI ASAP, this test for trt part + # is disabled temporarily. + self.enable_trt = True + self.trt_parameters = SlicePluginTRTTest2.TensorRTParam( + 1 << 30, 32, 1, AnalysisConfig.Precision.Float32, False, False) + self.fetch_list = [out] + + def test_check_output(self): + use_gpu = [False] + if core.is_compiled_with_cuda(): + use_gpu.append(True) + for i in range(len(use_gpu)): + self.check_output_with_option(use_gpu[i]) + + +#exceeded bound starts && ends +class SlicePluginTRTTest3(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data(name="data", shape=[3, 3, 3, 3], dtype="float32") + axes = [2, 3] + starts = [-5, -2] + ends = [-1, 8] + slice_out = fluid.layers.slice( + data, axes=axes, starts=starts, ends=ends) + out = fluid.layers.batch_norm(slice_out, is_test=True) + + self.feeds = { + "data": np.random.random((3, 3, 3, 3)).astype("float32"), + } + # Diff occurred between GPU and TRT. 
+ # In order to provide TRT CI ASAP, this test for trt part + # is disabled temporarily. + self.enable_trt = True + self.trt_parameters = SlicePluginTRTTest3.TensorRTParam( + 1 << 30, 32, 1, AnalysisConfig.Precision.Float32, False, False) + self.fetch_list = [out] + + def test_check_output(self): + use_gpu = [False] + if core.is_compiled_with_cuda(): + use_gpu.append(True) + for i in range(len(use_gpu)): + self.check_output_with_option(use_gpu[i]) + + +#fp16 +class SlicePluginTRTTest4(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data(name="data", shape=[3, 3, 3, 3], dtype="float32") + axes = [2, 3] + starts = [-5, -2] + ends = [-1, 8] + slice_out = fluid.layers.slice( + data, axes=axes, starts=starts, ends=ends) + out = fluid.layers.batch_norm(slice_out, is_test=True) + + self.feeds = { + "data": np.random.random((3, 3, 3, 3)).astype("float32"), + } + # Diff occurred between GPU and TRT. + # In order to provide TRT CI ASAP, this test for trt part + # is disabled temporarily. 
+ self.enable_trt = True + self.trt_parameters = SlicePluginTRTTest3.TensorRTParam( + 1 << 30, 32, 1, AnalysisConfig.Precision.Half, False, False) + self.fetch_list = [out] + + def test_check_output(self): + use_gpu = [False] + if core.is_compiled_with_cuda(): + use_gpu.append(True) + for i in range(len(use_gpu)): + self.check_output_with_option(use_gpu[i]) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/launch_function_helper.py b/python/paddle/fluid/tests/unittests/launch_function_helper.py index 13041827ffeabd3d6b79e4f34a67bd09624e54f6..046268444018799ca4d7f5530cbb6b1c707e062f 100644 --- a/python/paddle/fluid/tests/unittests/launch_function_helper.py +++ b/python/paddle/fluid/tests/unittests/launch_function_helper.py @@ -15,7 +15,8 @@ from multiprocessing import Pool, Process import os import socket from contextlib import closing -import psutil +import time +import sys def launch_func(func, env_dict): @@ -25,19 +26,36 @@ def launch_func(func, env_dict): return proc -def wait(procs, timeout=None): - # wait - decents = [] +def wait(procs, timeout=30): + error = False + begin = time.time() + while True: + alive = False + for p in procs: + p.join(timeout=10) + if p.exitcode is None: + alive = True + continue + elif p.exitcode != 0: + error = True + break + + if not alive: + break + + if error: + break + + if timeout is not None and time.time() - begin >= timeout: + error = True + break + for p in procs: - for child in psutil.Process(p.pid).children(recursive=True): - decents.append(child) - - gone, alive = psutil.wait_procs(decents, timeout=timeout) - for p in alive: - p.kill() - for p in gone: - if p.returncode != 0: - sys.exit(1) + if p.is_alive(): + p.terminate() + + if error: + sys.exit(1) def _find_free_port(port_set): diff --git a/python/paddle/fluid/tests/unittests/mkldnn/check_flags_use_mkldnn.py b/python/paddle/fluid/tests/unittests/mkldnn/check_flags_use_mkldnn.py new file mode 100644 index 
0000000000000000000000000000000000000000..8f5715a0d0afcf59ebbe1cc95a6b06dead64c6e2 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/check_flags_use_mkldnn.py @@ -0,0 +1,48 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import unicode_literals +from __future__ import print_function + +import numpy as np +import paddle.fluid as fluid +import os +from paddle.fluid.layer_helper import LayerHelper + + +def check(): + print("check: fluid.core.globals()['FLAGS_use_mkldnn']=", + fluid.core.globals()["FLAGS_use_mkldnn"]) + print("check: fluid.get_flags('FLAGS_use_mkldnn')=", + fluid.get_flags(['FLAGS_use_mkldnn'])) + print("check: DNNL_VERBOSE=", os.environ['DNNL_VERBOSE']) + a_np = np.random.uniform(-2, 2, (10, 20, 30)).astype(np.float32) + helper = LayerHelper(fluid.unique_name.generate(str("test")), act="relu") + func = helper.append_activation + with fluid.dygraph.guard(fluid.core.CPUPlace()): + a = fluid.dygraph.to_variable(a_np) + res1 = func(a) + res2 = np.maximum(a_np, 0) + assert (np.array_equal(res1.numpy(), res2)) + + +if __name__ == '__main__': + try: + check() + for k, v in sorted(os.environ.items()): + print(k + ':', v) + print('\n') + except Exception as e: + print(e) + print(type(e)) diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_flags_use_mkldnn.py b/python/paddle/fluid/tests/unittests/mkldnn/test_flags_use_mkldnn.py new file mode 100644 index 
0000000000000000000000000000000000000000..69676d0d70bdd523652c30c4cf066dc6982c46d4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_flags_use_mkldnn.py @@ -0,0 +1,58 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import unicode_literals +from __future__ import print_function + +import unittest +import os +import sys +import subprocess + + +class TestFlagsUseMkldnn(unittest.TestCase): + def setUp(self): + self._python_interp = sys.executable + self._python_interp += " check_flags_use_mkldnn.py" + + self.env = os.environ.copy() + self.env[str("GLOG_v")] = str("3") + self.env[str("DNNL_VERBOSE")] = str("1") + self.env[str("FLAGS_use_mkldnn")] = str("1") + + def test_flags_use_mkl_dnn(self): + cmd = self._python_interp + + proc = subprocess.Popen( + cmd.split(" "), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + env=self.env) + + out, err = proc.communicate() + returncode = proc.returncode + + print('out', out) + print('err', err) + + assert returncode == 0 + # in python3, type(out) is 'bytes', need use encode + assert out.find( + "dnnl_verbose,exec,cpu,eltwise,jit:avx512_common,forward_training," + "data_f32::blocked:abc:f0 diff_undef::undef::f0,,alg:eltwise_relu". 
+ encode()) != -1 + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_int8_mkldnn_op.py new file mode 100644 index 0000000000000000000000000000000000000000..ff4531f0e250e325f39ef69161c8d1ee751a2336 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_int8_mkldnn_op.py @@ -0,0 +1,145 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import numpy as np +from paddle.fluid.tests.unittests.op_test import OpTest +from paddle.fluid.tests.unittests.test_fusion_gru_op import fusion_gru +from paddle.fluid.tests.unittests.test_fusion_lstm_op import fc, ACTIVATION + + +class TestFusionGRUINT8MKLDNNOp(OpTest): + def set_confs(self): + pass + + def setUp(self): + self.op_type = "fusion_gru" + self.lod = [[2, 4, 3]] + self.IC = 3 + self.OC = 5 + self.is_reverse = False + self.with_h0 = False + self.with_bias = True + self.act_state = 'tanh' + self.act_gate = 'sigmoid' + self.origin_mode = True + self.use_mkldnn = True + self.force_fp32_output = True + self.error_margin = 1e-5 + self.set_confs() + + # RNN dimensions + T = sum(self.lod[0]) + N = len(self.lod[0]) + + # Input data + x_f32 = np.random.rand(T, self.IC).astype('float32') * 2 - 1 + scale_data = 63 + shift_data = 64 + x_u8 = (x_f32 * scale_data + shift_data).astype(np.uint8) + + # WeightX/WeightH data + wx = np.random.rand(self.IC, 3 * self.OC).astype('float32') * 2 - 1 + wh = np.random.rand(self.OC, 3 * self.OC).astype('float32') * 2 - 1 + + # Calculating weight scales + # scales = 63 / max(abs(channel_wise(weightsX + weightsH))) + # WeightX data shape in PP: [IC, 3 * OC] + # WeightH data shape in PP: [OC, 2 * OC] + [OC, OC] + # Scales shape in oneDNN: [3, OC] + scale_ur = 63 / np.max(np.abs( + np.concatenate( + [ + wx[:, :2 * self.OC], wh.flatten()[:2 * self.OC * self.OC] + .reshape(self.OC, 2 * self.OC) + ], + axis=0)), + axis=0) + scale_o = 63 / np.max(np.abs( + np.concatenate( + [ + wx[:, 2 * self.OC:], wh.flatten()[2 * self.OC * self.OC:] + .reshape(self.OC, self.OC) + ], + axis=0)), + axis=0) + + scale_weights = np.concatenate([scale_ur, scale_o]).astype('float') + + bias = np.random.rand( + 1, 3 * self.OC).astype('float32') if self.with_bias else np.zeros( + (1, 3 * self.OC), dtype='float32') + h0 = np.random.rand( + N, self.OC).astype('float32') if self.with_h0 else np.zeros( + (N, self.OC), dtype='float32') + + _, _, _, 
hidden_f32 = fusion_gru(x_f32, self.lod, h0, wx, wh, bias, + self.is_reverse, self.origin_mode, + ACTIVATION[self.act_state], + ACTIVATION[self.act_gate]) + + self.inputs = {'X': (x_u8, self.lod), 'WeightX': wx, 'WeightH': wh} + + if self.with_bias: + self.inputs['Bias'] = bias + + if self.with_h0: + self.inputs['H0'] = h0 + + if self.force_fp32_output: + self.error_margin = 1e-1 + self.outputs = {'Hidden': (hidden_f32, self.lod)} + else: + self.error_margin = 1 + hidden_u8 = (hidden_f32 * scale_data + shift_data).astype(np.uint8) + self.outputs = {'Hidden': (hidden_u8, self.lod)} + + self.attrs = { + 'activation': self.act_state, + 'gate_activation': self.act_gate, + 'is_reverse': self.is_reverse, + 'origin_mode': self.origin_mode, + 'use_mkldnn': self.use_mkldnn, + 'force_fp32_output': self.force_fp32_output, + 'Scale_data': scale_data, + 'Shift_data': shift_data, + 'Scale_weights': scale_weights + } + + def test_check_output(self): + self.check_output(check_dygraph=False, atol=self.error_margin) + + +class TestFusionGRUINT8MKLDNNOp2(TestFusionGRUINT8MKLDNNOp): + def set_confs(self): + self.force_fp32_output = False + + +class TestFusionGRUINT8MKLDNNOp3(TestFusionGRUINT8MKLDNNOp): + def set_confs(self): + self.origin_mode = False + + +class TestFusionGRUINT8MKLDNNOp4(TestFusionGRUINT8MKLDNNOp): + def set_confs(self): + self.with_bias = False + + +class TestFusionGRUINT8MKLDNNOp5(TestFusionGRUINT8MKLDNNOp): + def set_confs(self): + self.with_h0 = False + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py index aff13f0b555299d1c7b453b61be79f5a356a5416..b083e76897cd96cea93d7b90898541de1226ac15 100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py @@ -114,8 +114,8 @@ class TestMnist(TestParallelDyGraphRunnerBase): model = MNIST() train_reader = 
paddle.batch( paddle.dataset.mnist.train(), batch_size=2, drop_last=True) - opt = fluid.optimizer.Adam( - learning_rate=1e-3, parameter_list=model.parameters()) + opt = paddle.optimizer.Adam( + learning_rate=1e-3, parameters=model.parameters()) return model, train_reader, opt def run_one_loop(self, model, opt, data): diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index ab61a5b3cfccb0e885debe9786ae91a9754e9345..f6ba03194aa909279aa2cd884fc575041b01a4cd 100755 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -128,7 +128,7 @@ class TestLogSigmoid(TestActivation): class TestLogSigmoidAPI(unittest.TestCase): - # test paddle.nn.LogSigmoid, paddle.nn.functional.logsigmoid + # test paddle.nn.LogSigmoid, paddle.nn.functional.log_sigmoid def setUp(self): self.x_np = np.random.uniform(-1, 1, [11, 17]).astype('float32') self.place=paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \ @@ -137,36 +137,45 @@ class TestLogSigmoidAPI(unittest.TestCase): def test_static_api(self): with paddle.static.program_guard(paddle.static.Program()): x = paddle.data('X', [11, 17]) - out1 = F.logsigmoid(x) + out1 = F.log_sigmoid(x) m = paddle.nn.LogSigmoid() out2 = m(x) exe = paddle.static.Executor(self.place) res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2]) out_ref = np.log(1 / (1 + np.exp(-self.x_np))) for r in res: - self.assertEqual(np.allclose(out_ref, r), True) + self.assertTrue(np.allclose(out_ref, r)) def test_dygraph_api(self): paddle.disable_static(self.place) x = paddle.to_tensor(self.x_np) - out1 = F.logsigmoid(x) + out1 = F.log_sigmoid(x) m = paddle.nn.LogSigmoid() out2 = m(x) out_ref = np.log(1 / (1 + np.exp(-self.x_np))) for r in [out1, out2]: - self.assertEqual(np.allclose(out_ref, r.numpy()), True) + self.assertTrue(np.allclose(out_ref, r.numpy())) paddle.enable_static() + def test_fluid_api(self): + 
with paddle.static.program_guard(paddle.static.Program()): + x = paddle.data('X', [11, 17]) + out = paddle.fluid.layers.logsigmoid(x) + exe = paddle.static.Executor(self.place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out]) + out_ref = np.log(1 / (1 + np.exp(-self.x_np))) + self.assertTrue(np.allclose(out_ref, res[0])) + def test_errors(self): with paddle.static.program_guard(paddle.static.Program()): # The input type must be Variable. - self.assertRaises(TypeError, F.logsigmoid, 1) + self.assertRaises(TypeError, F.log_sigmoid, 1) # The input dtype must be float16, float32, float64. x_int32 = paddle.data(name='x_int32', shape=[11, 17], dtype='int32') - self.assertRaises(TypeError, F.logsigmoid, x_int32) + self.assertRaises(TypeError, F.log_sigmoid, x_int32) # support the input dtype is float16 x_fp16 = paddle.data(name='x_fp16', shape=[11, 17], dtype='float16') - F.logsigmoid(x_fp16) + F.log_sigmoid(x_fp16) class TestTanh(TestActivation, TestParameter): diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py index 14e83fccd655527d8f3012365e4757d23236a445..47bf8f49e39b6451ee480d461e83324b89cacee2 100644 --- a/python/paddle/fluid/tests/unittests/test_adam_op.py +++ b/python/paddle/fluid/tests/unittests/test_adam_op.py @@ -448,7 +448,6 @@ class TestAdamOpV2(unittest.TestCase): def test_adam_op_with_state_dict(self): - import paddle paddle.disable_static() emb = paddle.nn.Embedding(10, 10) @@ -517,6 +516,20 @@ class TestAdamOpV2(unittest.TestCase): adam = paddle.optimizer.Adam( 0.1, epsilon=-1, parameters=linear.parameters()) + def test_adam_op_with_sparse_input_and_weight_decay(self): + + paddle.disable_static() + x_data = np.arange(0, 10).reshape((10, 1)).astype(np.int64) + x = paddle.to_tensor(x_data, stop_gradient=False) + emb = paddle.nn.Embedding(10, 10, sparse=True) + adam = paddle.optimizer.Adam( + 0.001, parameters=emb.parameters(), weight_decay=0.01) + + with 
self.assertRaises(RuntimeError): + out = emb(x) + out.backward() + adam.step() + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool1d.py b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool1d.py index 5a135cea52903a0d896df2d446b58d99e5a18993..424406c15bb18bade54a9b11bfdd96862d4df85c 100644 --- a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool1d.py +++ b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool1d.py @@ -94,6 +94,10 @@ class TestPool1d_API(unittest.TestCase): result = ada_max_pool1d_dg(input) self.assertTrue(np.allclose(result.numpy(), result_np)) + result = paddle.nn.functional.common.interpolate( + input, mode="area", size=16) + self.assertTrue(np.allclose(result.numpy(), result_np)) + def check_adaptive_avg_static_results(self, place): with fluid.program_guard(fluid.Program(), fluid.Program()): input = fluid.data(name="input", shape=[2, 3, 32], dtype="float32") diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool2d.py b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool2d.py index 55c30e3d2ade0725e6debcdd0a69ca4eee622aec..e3c70884ebcf116feb4f5b0aa808c71e4b7f8c4e 100644 --- a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool2d.py +++ b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool2d.py @@ -163,6 +163,9 @@ class TestAdaptiveAvgPool2dAPI(unittest.TestCase): out_5 = paddle.nn.functional.adaptive_avg_pool2d( x=x, output_size=[None, 3]) + out_6 = paddle.nn.functional.interpolate( + x=x, mode="area", size=[2, 5]) + assert np.allclose(out_1.numpy(), self.res_1_np) assert np.allclose(out_2.numpy(), self.res_2_np) @@ -173,6 +176,8 @@ class TestAdaptiveAvgPool2dAPI(unittest.TestCase): assert np.allclose(out_5.numpy(), self.res_5_np) + assert np.allclose(out_6.numpy(), self.res_3_np) + class TestAdaptiveAvgPool2dClassAPI(unittest.TestCase): def setUp(self): diff --git 
a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool3d.py b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool3d.py index c04ee660667edaff01d7029e83b912c05429a15f..a3c9dd91a69ea83b08c3f817403620460333b5e9 100755 --- a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool3d.py +++ b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool3d.py @@ -178,6 +178,9 @@ class TestAdaptiveAvgPool3dAPI(unittest.TestCase): out_5 = paddle.nn.functional.adaptive_avg_pool3d( x=x, output_size=[None, 3, None]) + out_6 = paddle.nn.functional.interpolate( + x=x, mode="area", size=[2, 3, 5]) + assert np.allclose(out_1.numpy(), self.res_1_np) assert np.allclose(out_2.numpy(), self.res_2_np) @@ -188,6 +191,8 @@ class TestAdaptiveAvgPool3dAPI(unittest.TestCase): assert np.allclose(out_5.numpy(), self.res_5_np) + assert np.allclose(out_6.numpy(), self.res_3_np) + class TestAdaptiveAvgPool3dClassAPI(unittest.TestCase): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/test_affine_grid_op.py b/python/paddle/fluid/tests/unittests/test_affine_grid_op.py index 55612d71a17a7ae9801535bf5a35c83b100aab30..d3e990ca13eb2911ea04ed546b91f58e2db4e440 100644 --- a/python/paddle/fluid/tests/unittests/test_affine_grid_op.py +++ b/python/paddle/fluid/tests/unittests/test_affine_grid_op.py @@ -49,7 +49,6 @@ class TestAffineGridOp(OpTest): self.initTestCase() self.op_type = "affine_grid" theta = np.random.randint(1, 3, self.theta_shape).astype("float32") - theta = np.ones(self.theta_shape).astype("float32") self.inputs = {'Theta': theta} self.attrs = { "use_cudnn": self.use_cudnn, diff --git a/python/paddle/fluid/tests/unittests/test_allgather.py b/python/paddle/fluid/tests/unittests/test_allgather.py index 877ae6f6e16c2269d7674c38b1ec30ad02f453c0..9bb34d3db4388d5a4f109ef20d2199ee7431dae8 100644 --- a/python/paddle/fluid/tests/unittests/test_allgather.py +++ b/python/paddle/fluid/tests/unittests/test_allgather.py @@ -15,9 +15,12 @@ from __future__ import print_function 
import unittest import numpy as np +import paddle from test_collective_base import TestDistBase +paddle.enable_static() + class TestAllGatherOp(TestDistBase): def _setup_config(self): diff --git a/python/paddle/fluid/tests/unittests/test_allreduce.py b/python/paddle/fluid/tests/unittests/test_allreduce.py index e0b6422a67b408840be9b96210b6003165dcb3a8..660f559535cd8f7f81499d1ea7244b033c12f08c 100644 --- a/python/paddle/fluid/tests/unittests/test_allreduce.py +++ b/python/paddle/fluid/tests/unittests/test_allreduce.py @@ -15,9 +15,12 @@ from __future__ import print_function import unittest import numpy as np +import paddle from test_collective_base import TestDistBase +paddle.enable_static() + class TestAllReduceOp(TestDistBase): def _setup_config(self): diff --git a/python/paddle/fluid/tests/unittests/test_amp_check_finite_and_scale_op.py b/python/paddle/fluid/tests/unittests/test_amp_check_finite_and_scale_op.py index 70863d3857c43c84a583f0ccf7b9bd733fdb4fd0..fbacaa3d5ce10bdad6dd87fdfc04c1173aff18ff 100644 --- a/python/paddle/fluid/tests/unittests/test_amp_check_finite_and_scale_op.py +++ b/python/paddle/fluid/tests/unittests/test_amp_check_finite_and_scale_op.py @@ -18,9 +18,9 @@ from op_test import OpTest, skip_check_grad_ci import paddle.fluid as fluid -class TestAmpCheckFiniteAndScaleOp(OpTest): +class TestCheckFiniteAndUnscaleOp(OpTest): def setUp(self): - self.op_type = "amp_check_finite_and_scale" + self.op_type = "check_finite_and_unscale" self.init_dtype() x = np.random.random((1024, 1024)).astype(self.dtype) scale = np.random.random((1)).astype(self.dtype) @@ -28,7 +28,7 @@ class TestAmpCheckFiniteAndScaleOp(OpTest): self.inputs = {'X': [('x0', x)], 'Scale': scale} self.outputs = { 'FoundInfinite': np.array([0]), - 'Out': [('out0', x * scale)], + 'Out': [('out0', x / scale)], } def init_dtype(self): @@ -38,9 +38,9 @@ class TestAmpCheckFiniteAndScaleOp(OpTest): self.check_output() -class TestAmpCheckFiniteAndScaleOpWithNan(OpTest): +class 
TestCheckFiniteAndUnscaleOpWithNan(OpTest): def setUp(self): - self.op_type = "amp_check_finite_and_scale" + self.op_type = "check_finite_and_unscale" self.init_dtype() x = np.random.random((1024, 1024)).astype(self.dtype) x[128][128] = np.nan @@ -61,9 +61,9 @@ class TestAmpCheckFiniteAndScaleOpWithNan(OpTest): self.check_output(no_check_set=['Out']) -class TestAmpCheckFiniteAndScaleOpWithInf(OpTest): +class TestCheckFiniteAndUnscaleOpWithInf(OpTest): def setUp(self): - self.op_type = "amp_check_finite_and_scale" + self.op_type = "check_finite_and_unscale" self.init_dtype() x = np.random.random((1024, 1024)).astype(self.dtype) x[128][128] = np.inf diff --git a/python/paddle/fluid/tests/unittests/test_arg_min_max_v2_op.py b/python/paddle/fluid/tests/unittests/test_arg_min_max_v2_op.py index 0fd9863948aedb64052e8fa0668f03600ae3197c..74f76030a29d2c9ce27278b61548c8877c1467ad 100644 --- a/python/paddle/fluid/tests/unittests/test_arg_min_max_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_arg_min_max_v2_op.py @@ -322,6 +322,20 @@ class TestArgMinMaxOpError(unittest.TestCase): self.assertRaises(TypeError, test_argmin_axis_type) + def test_argmax_dtype_type(): + data = paddle.static.data( + name="test_argmax", shape=[10], dtype="float32") + output = paddle.argmax(x=data, dtype=None) + + self.assertRaises(ValueError, test_argmax_dtype_type) + + def test_argmin_dtype_type(): + data = paddle.static.data( + name="test_argmin", shape=[10], dtype="float32") + output = paddle.argmin(x=data, dtype=None) + + self.assertRaises(ValueError, test_argmin_dtype_type) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_argsort_op.py b/python/paddle/fluid/tests/unittests/test_argsort_op.py index 2a8e0e6c7f0bcf4a779b4c098cd4af816e976205..e324f0ec3d37f6ea1cf257cac9c7e72969cd8971 100644 --- a/python/paddle/fluid/tests/unittests/test_argsort_op.py +++ b/python/paddle/fluid/tests/unittests/test_argsort_op.py @@ -348,57 +348,99 @@ class 
TestArgsortErrorOnGPU(TestArgsortErrorOnCPU): class TestArgsort(unittest.TestCase): + def init(self): + self.input_shape = [10000, ] + self.axis = 0 + def setUp(self): + self.init() if core.is_compiled_with_cuda(): self.place = core.CUDAPlace(0) else: self.place = core.CPUPlace() - self.data = np.random.rand(2, 3, 4).astype("float32") + self.data = np.random.rand(*self.input_shape) - def test_api_0(self): + def test_api(self): with fluid.program_guard(fluid.Program()): - input = fluid.data(name="input", shape=[2, 3, 4], dtype="float32") - output = paddle.argsort(x=input) - exe = fluid.Executor(self.place) - result, = exe.run(feed={'input': self.data}, fetch_list=[output]) - np_result = np.argsort(self.data) - self.assertEqual((result == np_result).all(), True) + input = fluid.data( + name="input", shape=self.input_shape, dtype="float64") + + output = paddle.argsort(input, axis=self.axis) + output2 = paddle.argsort(input, axis=self.axis, descending=True) - def test_api_1(self): - with fluid.program_guard(fluid.Program()): - input = fluid.data(name="input", shape=[2, 3, 4], dtype="float32") - output = paddle.argsort(x=input, axis=1) exe = fluid.Executor(self.place) - result, = exe.run(feed={'input': self.data}, fetch_list=[output]) - np_result = np.argsort(self.data, axis=1) + result, result2 = exe.run(feed={'input': self.data}, + fetch_list=[output, output2]) + + np_result = np.argsort(self.data, axis=self.axis) self.assertEqual((result == np_result).all(), True) + np_result2 = np.argsort(-self.data, axis=self.axis) + self.assertEqual((result2 == np_result2).all(), True) + + +class TestArgsort2(TestArgsort): + def init(self): + self.input_shape = [10000, 1] + self.axis = 0 + + +class TestArgsort3(TestArgsort): + def init(self): + self.input_shape = [1, 10000] + self.axis = 1 + + +class TestArgsort4(TestArgsort): + def init(self): + self.input_shape = [2, 3, 4] + self.axis = 1 + + +class TestArgsortImperative(unittest.TestCase): + def init(self): + self.input_shape = 
[10000, ] + self.axis = 0 -class TestArgsortDygraph(unittest.TestCase): def setUp(self): - self.input_data = np.random.rand(10, 10) + self.init() + self.input_data = np.random.rand(*self.input_shape) if core.is_compiled_with_cuda(): self.place = core.CUDAPlace(0) else: self.place = core.CPUPlace() - def test_api_0(self): + def test_api(self): paddle.disable_static(self.place) - var_x = paddle.to_variable(self.input_data) - out = paddle.argsort(var_x) - self.assertEqual((np.argsort(self.input_data) == out.numpy()).all(), - True) - paddle.enable_static() + var_x = paddle.to_tensor(self.input_data) + out = paddle.argsort(var_x, axis=self.axis) + expect = np.argsort(self.input_data, axis=self.axis) + self.assertEqual((expect == out.numpy()).all(), True) + + out2 = paddle.argsort(var_x, axis=self.axis, descending=True) + expect2 = np.argsort(-self.input_data, axis=self.axis) + self.assertEqual((expect2 == out2.numpy()).all(), True) - def test_api_1(self): - paddle.disable_static(self.place) - var_x = paddle.to_variable(self.input_data) - out = paddle.argsort(var_x, axis=-1) - self.assertEqual( - (np.argsort( - self.input_data, axis=-1) == out.numpy()).all(), True) paddle.enable_static() +class TestArgsortImperative2(TestArgsortImperative): + def init(self): + self.input_shape = [10000, 1] + self.axis = 0 + + +class TestArgsortImperative3(TestArgsortImperative): + def init(self): + self.input_shape = [1, 10000] + self.axis = 1 + + +class TestArgsortImperative2(TestArgsortImperative): + def init(self): + self.input_shape = [2, 3, 4] + self.axis = 1 + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_auto_checkpoint.py b/python/paddle/fluid/tests/unittests/test_auto_checkpoint.py index fd009db5fd00133c5bad7c8c52662002ebd03fa8..3f33120d1f79f089d7511621611141683f0a03cd 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_checkpoint.py +++ b/python/paddle/fluid/tests/unittests/test_auto_checkpoint.py @@ -31,6 +31,7 @@ 
from paddle.io import Dataset, BatchSampler, DataLoader from paddle.fluid.tests.unittests.auto_checkpoint_utils import AutoCheckpointBase, get_logger +paddle.enable_static() logger = get_logger() diff --git a/python/paddle/fluid/tests/unittests/test_auto_checkpoint1.py b/python/paddle/fluid/tests/unittests/test_auto_checkpoint1.py index 55173325f621f7333a7c3ca32a9c55becee72e5a..fca1baf85e56e1f531dc3c5f64a7af0bda18836c 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_checkpoint1.py +++ b/python/paddle/fluid/tests/unittests/test_auto_checkpoint1.py @@ -32,6 +32,7 @@ from paddle.io import Dataset, BatchSampler, DataLoader from paddle.fluid.tests.unittests.auto_checkpoint_utils import AutoCheckpointBase, get_logger from paddle.fluid.tests.unittests.test_auto_checkpoint import AutoCheckPointACLBase +paddle.enable_static() logger = get_logger() diff --git a/python/paddle/fluid/tests/unittests/test_auto_checkpoint2.py b/python/paddle/fluid/tests/unittests/test_auto_checkpoint2.py index 5d72fa01008af55a83d7b9a19747a8d96fb74b2b..0c17807a689e6793af6d81467d73a5727d546698 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_checkpoint2.py +++ b/python/paddle/fluid/tests/unittests/test_auto_checkpoint2.py @@ -32,6 +32,7 @@ from paddle.io import Dataset, BatchSampler, DataLoader from paddle.fluid.tests.unittests.auto_checkpoint_utils import AutoCheckpointBase, get_logger from paddle.fluid.tests.unittests.test_auto_checkpoint import AutoCheckPointACLBase +paddle.enable_static() logger = get_logger() diff --git a/python/paddle/fluid/tests/unittests/test_auto_checkpoint3.py b/python/paddle/fluid/tests/unittests/test_auto_checkpoint3.py index 5382f7e328ed1afa2d7516cd0d8db2db659aadd7..ca103be59b96714fe6762e517a665c298082334f 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_checkpoint3.py +++ b/python/paddle/fluid/tests/unittests/test_auto_checkpoint3.py @@ -32,6 +32,7 @@ from paddle.io import Dataset, BatchSampler, DataLoader from 
paddle.fluid.tests.unittests.auto_checkpoint_utils import AutoCheckpointBase, get_logger from paddle.fluid.tests.unittests.test_auto_checkpoint import AutoCheckPointACLBase +paddle.enable_static() logger = get_logger() diff --git a/python/paddle/fluid/tests/unittests/test_auto_checkpoint_dist_basic.py b/python/paddle/fluid/tests/unittests/test_auto_checkpoint_dist_basic.py index 90db9595d92ef602c03fa7dd104484a4f6101a87..3eeff91ff2d830f6dcedbae291342f9a6ecf4878 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_checkpoint_dist_basic.py +++ b/python/paddle/fluid/tests/unittests/test_auto_checkpoint_dist_basic.py @@ -32,6 +32,7 @@ from paddle.io import Dataset, BatchSampler, DataLoader from paddle.fluid.tests.unittests.auto_checkpoint_utils import AutoCheckpointBase, get_logger from paddle.fluid.tests.unittests.test_auto_checkpoint import AutoCheckPointACLBase +paddle.enable_static() logger = get_logger() @@ -67,13 +68,13 @@ class AutoCheckpointTestDist(AutoCheckPointACLBase): save_dir = "./run_save_0" fs.delete(save_dir) - #basic + # basic exe, main_prog, startup_prog = self._generate() compiled, data_loader, optimizer, loss, image, label = \ self._init_env(exe, main_prog, startup_prog, minimize=False) - #fleet + # fleet os.environ["TRAINING_ROLE"] = "TRAINER" os.environ["PADDLE_TRAINER_ID"] = "0" os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:6070" diff --git a/python/paddle/fluid/tests/unittests/test_auto_checkpoint_multiple.py b/python/paddle/fluid/tests/unittests/test_auto_checkpoint_multiple.py index 8c10cd0e9922859bf3bad2015587fc0a6b2ba5da..f8c12f8905112cd5f768ea04cae21b19c90f46f6 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_checkpoint_multiple.py +++ b/python/paddle/fluid/tests/unittests/test_auto_checkpoint_multiple.py @@ -32,6 +32,7 @@ from paddle.io import Dataset, BatchSampler, DataLoader from paddle.fluid.tests.unittests.auto_checkpoint_utils import AutoCheckpointBase, get_logger from 
paddle.fluid.tests.unittests.test_auto_checkpoint import AutoCheckPointACLBase +paddle.enable_static() logger = get_logger() diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py index 5c705378e515eec4c950f6996e2789df603fcda3..2af0b31d6fc26c59803f29dcdc54979491767dd2 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py +++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py @@ -43,6 +43,21 @@ class TestBatchNorm(unittest.TestCase): x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32') x_data_3 = np.random.random(size=(2, 1, 3)).astype('float32') + def error1d_dataformat(): + x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32') + batch_norm1d = paddle.nn.BatchNorm1d(1, data_format='NCDHW') + batch_norm1d(fluid.dygraph.to_variable(x_data_4)) + + def error2d_dataformat(): + x_data_3 = np.random.random(size=(2, 1, 3)).astype('float32') + batch_norm2d = paddle.nn.BatchNorm2d(1, data_format='NCDHW') + batch_norm2d(fluid.dygraph.to_variable(x_data_3)) + + def error3d_dataformat(): + x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32') + batch_norm3d = paddle.nn.BatchNorm3d(1, data_format='NCL') + batch_norm3d(fluid.dygraph.to_variable(x_data_4)) + def error1d(): x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32') batch_norm1d = paddle.nn.BatchNorm1d(1) @@ -62,6 +77,9 @@ class TestBatchNorm(unittest.TestCase): self.assertRaises(ValueError, error1d) self.assertRaises(ValueError, error2d) self.assertRaises(ValueError, error3d) + self.assertRaises(ValueError, error1d_dataformat) + self.assertRaises(ValueError, error2d_dataformat) + self.assertRaises(ValueError, error3d_dataformat) def test_dygraph(self): places = [fluid.CPUPlace()] diff --git a/python/paddle/fluid/tests/unittests/test_bicubic_interp_v2_op.py b/python/paddle/fluid/tests/unittests/test_bicubic_interp_v2_op.py index 
01daea32167d28edbb46d6854872976aed79494e..b1ec74411987a73cf2e6a7d60aecce6c87ed598e 100644 --- a/python/paddle/fluid/tests/unittests/test_bicubic_interp_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_bicubic_interp_v2_op.py @@ -53,6 +53,8 @@ def value_bound(input, w, h, x, y): def bicubic_interp_np(input, out_h, out_w, + scale_h=0, + scale_w=0, out_size=None, actual_shape=None, align_corners=True, @@ -73,13 +75,19 @@ def bicubic_interp_np(input, if (align_corners): ratio_h = (in_h - 1.0) / (out_h - 1.0) else: - ratio_h = 1.0 * in_h / out_h + if scale_h > 0: + ratio_h = 1.0 / scale_h + else: + ratio_h = 1.0 * in_h / out_h if out_w > 1: if (align_corners): ratio_w = (in_w - 1.0) / (out_w - 1.0) else: - ratio_w = 1.0 * in_w / out_w + if scale_w > 0: + ratio_w = 1.0 / scale_w + else: + ratio_w = 1.0 * in_w / out_w out = np.zeros((batch_size, channel, out_h, out_w)) @@ -128,7 +136,8 @@ class TestBicubicInterpOp(OpTest): self.init_test_case() self.op_type = "bicubic_interp_v2" input_np = np.random.random(self.input_shape).astype("float64") - + scale_h = 0 + scale_w = 0 if self.data_layout == "NCHW": in_h = self.input_shape[2] in_w = self.input_shape[3] @@ -151,9 +160,9 @@ class TestBicubicInterpOp(OpTest): out_h = self.out_h out_w = self.out_w - output_np = bicubic_interp_np(input_np, out_h, out_w, self.out_size, - self.actual_shape, self.align_corners, - self.data_layout) + output_np = bicubic_interp_np(input_np, out_h, out_w, scale_h, scale_w, + self.out_size, self.actual_shape, + self.align_corners, self.data_layout) self.inputs = {'X': input_np} if self.out_size is not None: self.inputs['OutSize'] = self.out_size @@ -480,10 +489,34 @@ class TestBicubicOpError(unittest.TestCase): out = interpolate( x, size=None, - mode='trilinear', + mode='bicubic', align_corners=False, scale_factor=[1, 2, 2]) + def test_size_and_scale(): + x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32") + out = interpolate( + x, + size=None, + mode='bicubic', + 
align_corners=False, + scale_factor=None) + + def test_size_and_scale2(): + x = fluid.data( + name="input", shape=[2, 3, 6, 9, 4], dtype="float32") + out = interpolate( + x, + size=[2, 2, 2], + mode='trilinear', + align_corners=False, + scale_factor=2.0) + + def test_size_type(): + x = fluid.data(name="x", shape=[2, 3, 6, 6], dtype="float32") + out = interpolate( + x, size={2, 2}, mode='bicubic', align_corners=False) + self.assertRaises(ValueError, test_mode_type) self.assertRaises(ValueError, test_input_shape) self.assertRaises(TypeError, test_align_corcers) @@ -498,6 +531,9 @@ class TestBicubicOpError(unittest.TestCase): self.assertRaises(ValueError, test_align_corners_and_nearest) self.assertRaises(ValueError, test_scale_shape) self.assertRaises(ValueError, test_scale_value) + self.assertRaises(ValueError, test_size_and_scale) + self.assertRaises(ValueError, test_size_and_scale2) + self.assertRaises(TypeError, test_size_type) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_bilinear_interp_v2_op.py b/python/paddle/fluid/tests/unittests/test_bilinear_interp_v2_op.py index d139a53c7e2ccc68964457f3142b4ed890d339f2..9fc4971fec23923a40080613612d3a1843a86d2e 100755 --- a/python/paddle/fluid/tests/unittests/test_bilinear_interp_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_bilinear_interp_v2_op.py @@ -26,6 +26,8 @@ import paddle def bilinear_interp_np(input, out_h, out_w, + scale_w=0, + scale_h=0, out_size=None, actual_shape=None, align_corners=True, @@ -47,12 +49,18 @@ def bilinear_interp_np(input, if (align_corners): ratio_h = (in_h - 1.0) / (out_h - 1.0) else: - ratio_h = 1.0 * in_h / out_h + if scale_h > 0: + ratio_h = 1.0 / scale_h + else: + ratio_h = 1.0 * in_h / out_h if out_w > 1: if (align_corners): ratio_w = (in_w - 1.0) / (out_w - 1.0) else: - ratio_w = 1.0 * in_w / out_w + if scale_w > 0: + ratio_w = 1.0 / scale_w + else: + ratio_w = 1.0 * in_w / out_w out = np.zeros((batch_size, channel, out_h, out_w)) @@ -110,7 
+118,8 @@ class TestBilinearInterpOp(OpTest): else: in_h = self.input_shape[1] in_w = self.input_shape[2] - + scale_h = 0 + scale_w = 0 if self.scale: if isinstance(self.scale, float) or isinstance(self.scale, int): if self.scale > 0.: @@ -126,9 +135,9 @@ class TestBilinearInterpOp(OpTest): out_h = self.out_h out_w = self.out_w - output_np = bilinear_interp_np(input_np, out_h, out_w, self.out_size, - self.actual_shape, self.align_corners, - self.align_mode, self.data_layout) + output_np = bilinear_interp_np( + input_np, out_h, out_w, 0, 0, self.out_size, self.actual_shape, + self.align_corners, self.align_mode, self.data_layout) self.inputs = {'X': input_np} if self.out_size is not None: self.inputs['OutSize'] = self.out_size @@ -238,6 +247,17 @@ class TestBilinearInterpCase6(TestBilinearInterpOp): self.align_mode = 1 +class TestBilinearInterpCase7(TestBilinearInterpOp): + def init_test_case(self): + self.interp_method = 'bilinear' + self.input_shape = [1, 1, 32, 64] + self.out_h = 64 + self.out_w = 32 + self.scale = [2.0, 0.5] + self.align_corners = False + self.align_mode = 1 + + class TestBilinearInterpSame(TestBilinearInterpOp): def init_test_case(self): self.interp_method = 'bilinear' @@ -298,9 +318,9 @@ class TestBilinearInterpOpUint8(OpTest): out_h = self.out_h out_w = self.out_w - output_np = bilinear_interp_np(input_np, out_h, out_w, self.out_size, - self.actual_shape, self.align_corners, - self.align_mode) + output_np = bilinear_interp_np(input_np, out_h, out_w, 0, 0, + self.out_size, self.actual_shape, + self.align_corners, self.align_mode) self.inputs = {'X': input_np} if self.out_size is not None: self.inputs['OutSize'] = self.out_size @@ -481,8 +501,9 @@ class TestBilinearInterpOp_attr_tensor(OpTest): if isinstance(self.scale, list) and len(self.scale) == 1: self.scale = [self.scale[0], self.scale[0]] self.attrs['scale'] = self.scale - output_np = bilinear_interp_np(input_np, out_h, out_w, self.out_size, - self.actual_shape, self.align_corners) + 
output_np = bilinear_interp_np(input_np, out_h, out_w, 0, 0, + self.out_size, self.actual_shape, + self.align_corners) self.outputs = {'Out': output_np} def test_check_output(self): diff --git a/python/paddle/fluid/tests/unittests/test_broadcast.py b/python/paddle/fluid/tests/unittests/test_broadcast.py index 029e881d6f69ec0781c1d8ad8e66a9b6fd48cec1..8b8cdb1235ce3830277cfe661bad84aba423e24b 100644 --- a/python/paddle/fluid/tests/unittests/test_broadcast.py +++ b/python/paddle/fluid/tests/unittests/test_broadcast.py @@ -15,9 +15,12 @@ from __future__ import print_function import unittest import numpy as np +import paddle from test_collective_base import TestDistBase +paddle.enable_static() + class TestCBroadcastOp(TestDistBase): def _setup_config(self): diff --git a/python/paddle/fluid/tests/unittests/test_broadcast_error.py b/python/paddle/fluid/tests/unittests/test_broadcast_error.py new file mode 100644 index 0000000000000000000000000000000000000000..517de67fd6dddf1d0a74df6ffed659720862b20c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_broadcast_error.py @@ -0,0 +1,38 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import numpy as np +from op_test import OpTest +import paddle.fluid.core as core + + +class TestBroadcastOpCpu(OpTest): + def setUp(self): + self.op_type = "broadcast" + input = np.random.random((100, 2)).astype("float32") + np_out = input[:] + self.inputs = {"X": input} + self.attrs = {"sync_mode": False, "root": 0} + self.outputs = {"Out": np_out} + + def test_check_output_cpu(self): + try: + self.check_output_with_place(place=core.CPUPlace()) + except: + print("do not support cpu test, skip") + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py b/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py index 43d485a0a6d24be6e8db32f16fe96a70bb229858..2c9168df472f493a16c19ad1b121ec0d126b6306 100644 --- a/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py +++ b/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py @@ -36,6 +36,7 @@ class InplaceTestBase(unittest.TestCase): self.fuse_all_optimizer_ops = False def setUp(self): + paddle.enable_static() self.initParameter() if self.use_cuda and fluid.core.is_compiled_with_cuda(): self.device_count = fluid.core.get_cuda_device_count() diff --git a/python/paddle/fluid/tests/unittests/test_calc_gradient.py b/python/paddle/fluid/tests/unittests/test_calc_gradient.py index 3e8c449d8995ca90401861e93f2fb987d1c6967d..fdfaf6a3113bbb9a50a79de7ef4ac4c3251d5759 100644 --- a/python/paddle/fluid/tests/unittests/test_calc_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_calc_gradient.py @@ -15,7 +15,7 @@ from __future__ import print_function import unittest - +import numpy as np import paddle.fluid as fluid import paddle.fluid.layers as layers from paddle.fluid.backward import calc_gradient @@ -81,5 +81,22 @@ class TestDoubleGrad(unittest.TestCase): self.assertEqual(12, out[0]) +class TestGradientWithPrune(unittest.TestCase): + def test_prune(self): + x = 
fluid.data(name='x', shape=[3], dtype='float32') + x.stop_gradient = False + x1, x2, x3 = fluid.layers.split(x, dim=0, num_or_sections=3) + y = x1 * 2 + x1_grad = fluid.gradients(y, x) + + exe = fluid.Executor(fluid.CPUPlace()) + main = fluid.default_main_program() + exe.run(fluid.default_startup_program()) + out = exe.run(main, + feed={'x': np.ones([3]).astype('float32')}, + fetch_list=[x1_grad]) + self.assertTrue(np.array_equal(out[0], [2., 0., 0.])) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_checkpoint_saver.py b/python/paddle/fluid/tests/unittests/test_checkpoint_saver.py index ad75f2aa8bc06d2af24f5d22eb126a3558cd6f74..4c1b1e0f0bf9034263d2c36747cdae301da24215 100644 --- a/python/paddle/fluid/tests/unittests/test_checkpoint_saver.py +++ b/python/paddle/fluid/tests/unittests/test_checkpoint_saver.py @@ -21,8 +21,7 @@ from paddle.fluid.incubate.checkpoint.checkpoint_saver import CheckpointSaver import os import sys -from paddle.fluid.incubate.fleet.utils.fs import LocalFS -from paddle.fluid.incubate.fleet.utils.hdfs import HDFSClient +from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient from paddle.fluid.incubate.checkpoint.checkpoint_saver import CheckpointSaver diff --git a/python/paddle/fluid/tests/unittests/test_clip_op.py b/python/paddle/fluid/tests/unittests/test_clip_op.py index 2e1f9d41747e3a99b4b4a0650a52973459b85c7b..b56d9f6668e8bcbd37443fb88b1f5f4dd40a2511 100644 --- a/python/paddle/fluid/tests/unittests/test_clip_op.py +++ b/python/paddle/fluid/tests/unittests/test_clip_op.py @@ -138,8 +138,9 @@ class TestClipAPI(unittest.TestCase): out_6 = paddle.clip(images, max=max) out_7 = paddle.clip(images, max=-1.) 
out_8 = paddle.clip(images) + out_9 = paddle.clip(paddle.cast(images, 'float64'), min=0.2, max=0.9) - res1, res2, res3, res4, res5, res6, res7, res8 = exe.run( + res1, res2, res3, res4, res5, res6, res7, res8, res9 = exe.run( fluid.default_main_program(), feed={ "image": data, @@ -147,7 +148,7 @@ class TestClipAPI(unittest.TestCase): "max": np.array([0.8]).astype('float32') }, fetch_list=[ - out_1, out_2, out_3, out_4, out_5, out_6, out_7, out_8 + out_1, out_2, out_3, out_4, out_5, out_6, out_7, out_8, out_9 ]) self.assertTrue(np.allclose(res1, data.clip(0.2, 0.8))) @@ -158,6 +159,8 @@ class TestClipAPI(unittest.TestCase): self.assertTrue(np.allclose(res6, data.clip(max=0.8))) self.assertTrue(np.allclose(res7, data.clip(max=-1))) self.assertTrue(np.allclose(res8, data)) + self.assertTrue( + np.allclose(res9, data.astype(np.float64).clip(0.2, 0.9))) def test_clip_dygraph(self): place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( diff --git a/python/paddle/fluid/tests/unittests/test_collect_fpn_proposals_op.py b/python/paddle/fluid/tests/unittests/test_collect_fpn_proposals_op.py index 034bb7f8dc7e00a321b6c6a5a4776fa4f7398ab5..a2f56c428012c615dcf55b6832a54ca473771d08 100644 --- a/python/paddle/fluid/tests/unittests/test_collect_fpn_proposals_op.py +++ b/python/paddle/fluid/tests/unittests/test_collect_fpn_proposals_op.py @@ -33,10 +33,14 @@ class TestCollectFPNProposalstOp(OpTest): for i in range(self.num_level)] self.inputs = { 'MultiLevelRois': inputs_x, - "MultiLevelScores": self.scores_input + "MultiLevelScores": self.scores_input, + 'MultiLevelRoIsNum': [] } self.attrs = {'post_nms_topN': self.post_nms_top_n, } - self.outputs = {'FpnRois': (self.rois, [self.lod])} + self.outputs = { + 'FpnRois': (self.rois, [self.lod]), + 'RoisNum': np.array(self.lod).astype('int32') + } def init_test_case(self): self.post_nms_top_n = 20 @@ -96,5 +100,32 @@ class TestCollectFPNProposalstOp(OpTest): self.check_output(check_dygraph=False) +class 
TestCollectFPNProposalstOpWithRoisNum(TestCollectFPNProposalstOp): + def set_data(self): + self.init_test_case() + self.make_rois() + self.scores_input = [('y%d' % i, + (self.scores[i].reshape(-1, 1), self.rois_lod[i])) + for i in range(self.num_level)] + self.rois, self.lod = self.calc_rois_collect() + inputs_x = [('x%d' % i, (self.roi_inputs[i][:, 1:], self.rois_lod[i])) + for i in range(self.num_level)] + rois_num_per_level = [ + ('rois%d' % i, np.array(self.rois_lod[i][0]).astype('int32')) + for i in range(self.num_level) + ] + + self.inputs = { + 'MultiLevelRois': inputs_x, + "MultiLevelScores": self.scores_input, + 'MultiLevelRoIsNum': rois_num_per_level + } + self.attrs = {'post_nms_topN': self.post_nms_top_n, } + self.outputs = { + 'FpnRois': (self.rois, [self.lod]), + 'RoisNum': np.array(self.lod).astype('int32') + } + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_collective_allgather_api.py b/python/paddle/fluid/tests/unittests/test_collective_allgather_api.py index 71777df4651ea26c7cf5dfc7231018288c2670e2..dbf77fafcc47d0b45b95e02819384c2a1d10f98f 100644 --- a/python/paddle/fluid/tests/unittests/test_collective_allgather_api.py +++ b/python/paddle/fluid/tests/unittests/test_collective_allgather_api.py @@ -15,9 +15,12 @@ from __future__ import print_function import unittest import numpy as np +import paddle from test_collective_api_base import TestDistBase +paddle.enable_static() + class TestCollectiveAllgatherAPI(TestDistBase): def _setup_config(self): diff --git a/python/paddle/fluid/tests/unittests/test_collective_allreduce_api.py b/python/paddle/fluid/tests/unittests/test_collective_allreduce_api.py index 24dd7cacff6adc56eb059a7bec016a1d3e322825..a405da80adaf0f2c3b6698bd175797670a748c62 100644 --- a/python/paddle/fluid/tests/unittests/test_collective_allreduce_api.py +++ b/python/paddle/fluid/tests/unittests/test_collective_allreduce_api.py @@ -15,9 +15,12 @@ from __future__ import print_function 
import unittest import numpy as np +import paddle from test_collective_api_base import TestDistBase +paddle.enable_static() + class TestCollectiveAllreduceAPI(TestDistBase): def _setup_config(self): diff --git a/python/paddle/fluid/tests/unittests/test_collective_barrier_api.py b/python/paddle/fluid/tests/unittests/test_collective_barrier_api.py index ebf86f6ae14f1ecbdb3711378c84a3c1ce4967fb..d0a67baa61e69b09cc6578e2edb9df46df03549f 100644 --- a/python/paddle/fluid/tests/unittests/test_collective_barrier_api.py +++ b/python/paddle/fluid/tests/unittests/test_collective_barrier_api.py @@ -15,9 +15,12 @@ from __future__ import print_function import unittest import numpy as np +import paddle from test_collective_api_base import TestDistBase +paddle.enable_static() + class TestCollectiveBarrierAPI(TestDistBase): def _setup_config(self): diff --git a/python/paddle/fluid/tests/unittests/test_collective_broadcast_api.py b/python/paddle/fluid/tests/unittests/test_collective_broadcast_api.py index b1cf4f1ac4c822ad578f5ee0e0268324de5e5e25..702e04311570ef5cdd450a59f471c7688579a494 100644 --- a/python/paddle/fluid/tests/unittests/test_collective_broadcast_api.py +++ b/python/paddle/fluid/tests/unittests/test_collective_broadcast_api.py @@ -15,9 +15,12 @@ from __future__ import print_function import unittest import numpy as np +import paddle from test_collective_api_base import TestDistBase +paddle.enable_static() + class TestCollectiveBroadcastAPI(TestDistBase): def _setup_config(self): diff --git a/python/paddle/fluid/tests/unittests/test_collective_reduce.py b/python/paddle/fluid/tests/unittests/test_collective_reduce.py index 36837d6a227febd02e6ef1e2aeb905de19ca8acc..c0627467428109891ed71e1bd6f5576694ff59d6 100644 --- a/python/paddle/fluid/tests/unittests/test_collective_reduce.py +++ b/python/paddle/fluid/tests/unittests/test_collective_reduce.py @@ -15,9 +15,12 @@ from __future__ import print_function import unittest import numpy as np +import paddle from 
test_collective_base import TestDistBase +paddle.enable_static() + class TestCReduceOp(TestDistBase): def _setup_config(self): diff --git a/python/paddle/fluid/tests/unittests/test_collective_reduce_api.py b/python/paddle/fluid/tests/unittests/test_collective_reduce_api.py index bf3975f3fc1c6959ffbb28a51543ebfef00c52e5..8d28c794f023a6945893342a53386f6ffb8a6052 100644 --- a/python/paddle/fluid/tests/unittests/test_collective_reduce_api.py +++ b/python/paddle/fluid/tests/unittests/test_collective_reduce_api.py @@ -15,9 +15,12 @@ from __future__ import print_function import unittest import numpy as np +import paddle from test_collective_api_base import TestDistBase +paddle.enable_static() + class TestCollectiveReduceAPI(TestDistBase): def _setup_config(self): diff --git a/python/paddle/fluid/tests/unittests/test_collective_scatter.py b/python/paddle/fluid/tests/unittests/test_collective_scatter.py index 7fe3ce73359559c0f9b4e0e3990032ce693aab8a..ea34d1cab5a5a573c7053b956eb1474e5fb44179 100644 --- a/python/paddle/fluid/tests/unittests/test_collective_scatter.py +++ b/python/paddle/fluid/tests/unittests/test_collective_scatter.py @@ -15,9 +15,12 @@ from __future__ import print_function import unittest import numpy as np +import paddle from test_collective_base import TestDistBase +paddle.enable_static() + class TestCScatterOp(TestDistBase): def _setup_config(self): diff --git a/python/paddle/fluid/tests/unittests/test_collective_scatter_api.py b/python/paddle/fluid/tests/unittests/test_collective_scatter_api.py index cae842b396111f004b7ce52ce3f40c20ebe57263..3a37da52b8e9270c27749eb10252134ea97a6b46 100644 --- a/python/paddle/fluid/tests/unittests/test_collective_scatter_api.py +++ b/python/paddle/fluid/tests/unittests/test_collective_scatter_api.py @@ -15,9 +15,12 @@ from __future__ import print_function import unittest import numpy as np +import paddle from test_collective_api_base import TestDistBase +paddle.enable_static() + class 
TestCollectiveScatterAPI(TestDistBase): def _setup_config(self): diff --git a/python/paddle/fluid/tests/unittests/test_communicator_geo.py b/python/paddle/fluid/tests/unittests/test_communicator_geo.py index 30207340a27db0c1d00ab982cbac716e4b639c7e..d9fc9262b311f949a1a89cd079517c5c93d0d28d 100644 --- a/python/paddle/fluid/tests/unittests/test_communicator_geo.py +++ b/python/paddle/fluid/tests/unittests/test_communicator_geo.py @@ -28,6 +28,8 @@ import paddle.fluid as fluid import paddle.distributed.fleet.base.role_maker as role_maker import paddle.distributed.fleet as fleet +paddle.enable_static() + class TestCommunicatorGeoEnd2End(unittest.TestCase): def net(self): @@ -140,6 +142,7 @@ import paddle.distributed.fleet as fleet from test_communicator_geo import TestCommunicatorGeoEnd2End +paddle.enable_static() class RunServer(TestCommunicatorGeoEnd2End): def runTest(self): diff --git a/python/paddle/fluid/tests/unittests/test_communicator_half_async.py b/python/paddle/fluid/tests/unittests/test_communicator_half_async.py index 542d1874179ec53b0a4701e941f9748d0bc14766..391588780f342dc17ea821334e28f941f9ce359a 100644 --- a/python/paddle/fluid/tests/unittests/test_communicator_half_async.py +++ b/python/paddle/fluid/tests/unittests/test_communicator_half_async.py @@ -29,6 +29,8 @@ import paddle.fluid.incubate.fleet.base.role_maker as role_maker from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory +paddle.enable_static() + class TestCommunicatorHalfAsyncEnd2End(unittest.TestCase): def net(self): @@ -120,6 +122,7 @@ from test_communicator_half_async import TestCommunicatorHalfAsyncEnd2End from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory +paddle.enable_static() class 
RunServer(TestCommunicatorHalfAsyncEnd2End): def runTest(self): diff --git a/python/paddle/fluid/tests/unittests/test_compare_op.py b/python/paddle/fluid/tests/unittests/test_compare_op.py index cfad50409802d4f3d35c9da3b22597c681da91b1..25ae65aa7c968b2e6f1f9429d1a4e4e618fe7033 100644 --- a/python/paddle/fluid/tests/unittests/test_compare_op.py +++ b/python/paddle/fluid/tests/unittests/test_compare_op.py @@ -38,6 +38,7 @@ def create_test_class(op_type, typename, callback): self.check_output() def test_errors(self): + paddle.enable_static() with program_guard(Program(), Program()): x = fluid.layers.data(name='x', shape=[2], dtype='int32') y = fluid.layers.data(name='y', shape=[2], dtype='int32') @@ -80,6 +81,7 @@ def create_paddle_case(op_type, callback): self.place = paddle.CUDAPlace(0) def test_api(self): + paddle.enable_static() with program_guard(Program(), Program()): x = fluid.data(name='x', shape=[4], dtype='int64') y = fluid.data(name='y', shape=[4], dtype='int64') @@ -92,6 +94,7 @@ def create_paddle_case(op_type, callback): self.assertEqual((res == self.real_result).all(), True) def test_broadcast_api_1(self): + paddle.enable_static() with program_guard(Program(), Program()): x = paddle.static.data( name='x', shape=[1, 2, 1, 3], dtype='int32') @@ -108,6 +111,7 @@ def create_paddle_case(op_type, callback): self.assertEqual((res == real_result).all(), True) def test_attr_name(self): + paddle.enable_static() with program_guard(Program(), Program()): x = fluid.layers.data(name='x', shape=[4], dtype='int32') y = fluid.layers.data(name='y', shape=[4], dtype='int32') @@ -130,6 +134,7 @@ create_paddle_case('not_equal', lambda _a, _b: _a != _b) class TestCompareOpError(unittest.TestCase): def test_errors(self): + paddle.enable_static() with program_guard(Program(), Program()): # The input x and y of compare_op must be Variable. 
x = fluid.layers.data(name='x', shape=[1], dtype="float32") @@ -140,6 +145,7 @@ class TestCompareOpError(unittest.TestCase): class API_TestElementwise_Equal(unittest.TestCase): def test_api(self): + paddle.enable_static() with fluid.program_guard(fluid.Program(), fluid.Program()): label = fluid.layers.assign(np.array([3, 3], dtype="int32")) limit = fluid.layers.assign(np.array([3, 2], dtype="int32")) @@ -159,5 +165,31 @@ class API_TestElementwise_Equal(unittest.TestCase): self.assertEqual((res == np.array([True, True])).all(), True) +class TestCompareOpPlace(unittest.TestCase): + def test_place_1(self): + paddle.enable_static() + place = paddle.CPUPlace() + if core.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + label = fluid.layers.assign(np.array([3, 3], dtype="int32")) + limit = fluid.layers.assign(np.array([3, 2], dtype="int32")) + out = fluid.layers.less_than(label, limit, force_cpu=True) + exe = fluid.Executor(place) + res, = exe.run(fetch_list=[out]) + self.assertEqual((res == np.array([False, False])).all(), True) + + def test_place_2(self): + place = paddle.CPUPlace() + data_place = place + if core.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + data_place = paddle.CUDAPinnedPlace() + paddle.disable_static(place) + data = np.array([9], dtype="int64") + data_tensor = paddle.to_tensor(data, place=data_place) + result = data_tensor == 0 + self.assertEqual((result.numpy() == np.array([False])).all(), True) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_conv1d_layer.py b/python/paddle/fluid/tests/unittests/test_conv1d_layer.py index da527b26bf0608da5a648d92b492ff27cf2802f0..35fce9e9d6ba9d7a2f264bdd5c1f3deb7a2a67e9 100644 --- a/python/paddle/fluid/tests/unittests/test_conv1d_layer.py +++ b/python/paddle/fluid/tests/unittests/test_conv1d_layer.py @@ -44,7 +44,7 @@ class Conv1dTestCase(unittest.TestCase): self.spartial_shape = spartial_shape self.filter_size = filter_size self.data_format = 
data_format - self.channel_last = (self.data_format == "NHWC") + self.channel_last = (self.data_format == "NLC") self.padding = padding self.padding_mode = padding_mode @@ -147,6 +147,14 @@ class Conv1dErrorTestCase(Conv1dTestCase): self.paddle_nn_layer() +class Conv1dTypeErrorTestCase(Conv1dTestCase): + def runTest(self): + place = fluid.CPUPlace() + with dg.guard(place): + with self.assertRaises(TypeError): + self.paddle_nn_layer() + + def add_cases(suite): suite.addTest(Conv1dTestCase(methodName='runTest')) suite.addTest(Conv1dTestCase(methodName='runTest', stride=[1], dilation=2)) @@ -161,6 +169,7 @@ def add_cases(suite): Conv1dTestCase( methodName='runTest', padding=2, data_format='NLC')) suite.addTest(Conv1dTestCase(methodName='runTest', padding=[1])) + suite.addTest(Conv1dTestCase(methodName='runTest', padding=[1, 2])) suite.addTest(Conv1dTestCase(methodName='runTest', padding=2)) suite.addTest(Conv1dTestCase(methodName='runTest')) suite.addTest( @@ -178,7 +187,7 @@ def add_cases(suite): def add_error_cases(suite): suite.addTest( - Conv1dErrorTestCase( + Conv1dTypeErrorTestCase( methodName='runTest', padding_mode="reflect", padding="valid")) suite.addTest( Conv1dErrorTestCase( diff --git a/python/paddle/fluid/tests/unittests/test_conv1d_transpose_layer.py b/python/paddle/fluid/tests/unittests/test_conv1d_transpose_layer.py index 73227dd3610376d85fcfc70bb2653dfd927427fd..4c98aacd209dab8e5dc9e7744922a927700c4bb3 100644 --- a/python/paddle/fluid/tests/unittests/test_conv1d_transpose_layer.py +++ b/python/paddle/fluid/tests/unittests/test_conv1d_transpose_layer.py @@ -201,6 +201,7 @@ def add_cases(suite): ConvTranspose1dTestCase( methodName='runTest', data_format="NLC", stride=3, output_padding=2)) + suite.addTest(ConvTranspose1dTestCase(methodName='runTest', padding=[1, 2])) def add_error_cases(suite): diff --git a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py index 
4982cd195820811b9a8ec3fe6d01955234032120..c6190590108876ba97feb4dad0c31884727ec978 100644 --- a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py +++ b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py @@ -26,7 +26,7 @@ def stable_softmax(x): return exps / np.sum(exps) -def log_softmax(x, axis=-1): +def log_softmax(x, axis=1): softmax_out = np.apply_along_axis(stable_softmax, axis, x) return np.log(softmax_out) diff --git a/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py b/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py new file mode 100644 index 0000000000000000000000000000000000000000..0c2520038a82a0b9427b2cbe1d4010a1bc8e040c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py @@ -0,0 +1,163 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Test cloud role maker.""" + +from __future__ import print_function +import os +import unittest +import paddle.fluid.generator as generator + +import time # temp for debug +import paddle.fluid as fluid +import numpy as np +import paddle +import paddle.fluid.core as core + + +class TestGeneratorSeed(unittest.TestCase): + """ + Test cases for cpu generator seed. 
+ """ + + def test_gen_dropout_dygraph(self): + gen = paddle.manual_seed(12343) + + fluid.enable_dygraph() + + gen.manual_seed(111111111) + st = paddle.get_cuda_rng_state() + + x = fluid.layers.uniform_random( + [2, 10], dtype="float32", min=0.0, max=1.0) + x_again = fluid.layers.uniform_random( + [2, 10], dtype="float32", min=0.0, max=1.0) + x_third = fluid.layers.uniform_random( + [2, 10], dtype="float32", min=0.0, max=1.0) + print("x: {}".format(x.numpy())) + print("x_again: {}".format(x_again.numpy())) + x = x + x_again + x_third + y = fluid.layers.dropout(x, 0.5) + + paddle.set_cuda_rng_state(st) + + x1 = fluid.layers.uniform_random( + [2, 10], dtype="float32", min=0.0, max=1.0) + x1_again = fluid.layers.uniform_random( + [2, 10], dtype="float32", min=0.0, max=1.0) + x1_third = fluid.layers.uniform_random( + [2, 10], dtype="float32", min=0.0, max=1.0) + x1 = x1 + x1_again + x1_third + y1 = fluid.layers.dropout(x1, 0.5) + y_np = y.numpy() + y1_np = y1.numpy() + + if core.is_compiled_with_cuda(): + print(">>>>>>> dropout dygraph >>>>>>>") + self.assertTrue(np.allclose(y_np, y1_np)) + + def test_generator_gaussian_random_dygraph(self): + """Test Generator seed.""" + fluid.enable_dygraph() + + paddle.manual_seed(12312321111) + x = fluid.layers.gaussian_random([120], dtype="float32") + st1 = paddle.get_cuda_rng_state() + x1 = fluid.layers.gaussian_random([120], dtype="float32") + paddle.set_cuda_rng_state(st1) + x2 = fluid.layers.gaussian_random([120], dtype="float32") + paddle.manual_seed(12312321111) + x3 = fluid.layers.gaussian_random([120], dtype="float32") + x_np = x.numpy() + x1_np = x1.numpy() + x2_np = x2.numpy() + x3_np = x3.numpy() + + if core.is_compiled_with_cuda(): + print(">>>>>>> gaussian random dygraph >>>>>>>") + self.assertTrue(np.allclose(x1_np, x2_np)) + self.assertTrue(np.allclose(x_np, x3_np)) + + def test_generator_randint_dygraph(self): + """Test Generator seed.""" + + fluid.enable_dygraph() + + gen = paddle.manual_seed(12312321111) + x = 
paddle.randint(low=10, shape=[10], dtype="int32") + st1 = gen.get_state() + x1 = paddle.randint(low=10, shape=[10], dtype="int32") + gen.set_state(st1) + x2 = paddle.randint(low=10, shape=[10], dtype="int32") + paddle.manual_seed(12312321111) + x3 = paddle.randint(low=10, shape=[10], dtype="int32") + x_np = x.numpy() + x1_np = x1.numpy() + x2_np = x2.numpy() + x3_np = x3.numpy() + + if core.is_compiled_with_cuda(): + print(">>>>>>> randint dygraph >>>>>>>") + self.assertTrue(np.allclose(x1_np, x2_np)) + self.assertTrue(np.allclose(x_np, x3_np)) + + def test_gen_TruncatedNormal_initializer(self): + fluid.disable_dygraph() + + gen = paddle.manual_seed(123123143) + cur_state = paddle.get_cuda_rng_state() + + startup_program = fluid.Program() + train_program = fluid.Program() + with fluid.program_guard(train_program, startup_program): + # example 1: + # attr shape is a list which doesn't contain tensor Variable. + x = fluid.layers.uniform_random(shape=[2, 10]) + result_1 = fluid.layers.fc( + input=x, + size=10, + param_attr=fluid.initializer.TruncatedNormal( + loc=0.0, scale=2.0)) + result_2 = fluid.layers.fc( + input=x, + size=10, + param_attr=fluid.initializer.TruncatedNormal( + loc=0.0, scale=2.0)) + + exe = fluid.Executor(fluid.CPUPlace()) + exe.run(startup_program) + out1 = exe.run(train_program, + feed={}, + fetch_list=[result_1, result_2]) + + paddle.manual_seed(123123143) + with fluid.program_guard(train_program, startup_program): + exe.run(startup_program) + out2 = exe.run(train_program, + feed={}, + fetch_list=[result_1, result_2]) + + out1_res1 = np.array(out1[0]) + out1_res2 = np.array(out1[1]) + out2_res1 = np.array(out2[0]) + out2_res2 = np.array(out2[1]) + + if core.is_compiled_with_cuda(): + print(">>>>>>> truncated normal static >>>>>>>") + self.assertTrue(np.allclose(out1_res1, out2_res1)) + self.assertTrue(np.allclose(out1_res2, out2_res2)) + self.assertTrue(not np.allclose(out1_res2, out1_res1)) + + +if __name__ == "__main__": + unittest.main() diff 
--git a/python/paddle/fluid/tests/unittests/test_dataset.py b/python/paddle/fluid/tests/unittests/test_dataset.py index 582bb3dcc681921cdbf2111dcd26b299f06a3058..208956b825ed1d78aeacf85fc052210e42d247ce 100644 --- a/python/paddle/fluid/tests/unittests/test_dataset.py +++ b/python/paddle/fluid/tests/unittests/test_dataset.py @@ -38,26 +38,22 @@ class TestDataset(unittest.TestCase): def test_dataset_create(self): """ Testcase for dataset create. """ try: - dataset = paddle.distributed.fleet.DatasetFactory().create_dataset( - "InMemoryDataset") + dataset = paddle.distributed.InMemoryDataset() except: self.assertTrue(False) try: - dataset = paddle.distributed.fleet.DatasetFactory().create_dataset( - "QueueDataset") + dataset = paddle.distributed.QueueDataset() except: self.assertTrue(False) try: - dataset = paddle.distributed.fleet.DatasetFactory().create_dataset( - "FileInstantDataset") + dataset = paddle.distributed.fleet.dataset.FileInstantDataset() except: self.assertTrue(False) try: - dataset = paddle.distributed.fleet.DatasetFactory().create_dataset( - "MyOwnDataset") + dataset = paddle.distributed.fleet.dataset.MyOwnDataset() self.assertTrue(False) except: self.assertTrue(True) @@ -95,18 +91,18 @@ class TestDataset(unittest.TestCase): name=slot, shape=[1], dtype="int64", lod_level=1) slots_vars.append(var) - dataset = paddle.distributed.fleet.DatasetFactory().create_dataset( - "InMemoryDataset") - dataset.set_batch_size(32) - dataset.set_thread(3) + dataset = paddle.distributed.InMemoryDataset() + dataset.init( + batch_size=32, thread_num=3, pipe_command="cat", use_var=slots_vars) + dataset.update_settings(pipe_command="cat1") + dataset._init_distributed_settings( + parse_ins_id=True, + parse_content=True, + fea_eval=True, + candidate_size=10000) dataset.set_filelist( ["test_run_with_dump_a.txt", "test_run_with_dump_b.txt"]) - dataset.set_parse_ins_id(True) - dataset.set_parse_content(True) - dataset.set_pipe_command("cat") - dataset.set_use_var(slots_vars) 
dataset.load_into_memory() - dataset.set_fea_eval(10000, True) dataset.local_shuffle() exe = fluid.Executor(fluid.CPUPlace()) @@ -176,14 +172,14 @@ class TestDataset(unittest.TestCase): name=slot, shape=[1], dtype="int64", lod_level=1) slots_vars.append(var) - dataset = paddle.distributed.fleet.DatasetFactory().create_dataset( - "InMemoryDataset") - dataset.set_batch_size(32) - dataset.set_thread(3) + dataset = paddle.distributed.InMemoryDataset() + dataset.init( + batch_size=32, + thread_num=3, + pipe_command="cat", + download_cmd="cat", + use_var=slots_vars) dataset.set_filelist([filename1, filename2]) - dataset.set_pipe_command("cat") - dataset.set_download_cmd("cat") - dataset.set_use_var(slots_vars) dataset.load_into_memory() exe = fluid.Executor(fluid.CPUPlace()) exe.run(fluid.default_startup_program()) @@ -228,22 +224,19 @@ class TestDataset(unittest.TestCase): name=slot, shape=[1], dtype="int64", lod_level=1) slots_vars.append(var) - dataset = paddle.distributed.fleet.DatasetFactory().create_dataset( - "InMemoryDataset") - dataset.set_batch_size(32) - dataset.set_thread(3) + dataset = paddle.distributed.InMemoryDataset() + dataset.init( + batch_size=32, thread_num=3, pipe_command="cat", use_var=slots_vars) + dataset._init_distributed_settings(fea_eval=True, candidate_size=1) dataset.set_filelist([ "test_in_memory_dataset_run_a.txt", "test_in_memory_dataset_run_b.txt" ]) - dataset.set_pipe_command("cat") - dataset.set_use_var(slots_vars) dataset.load_into_memory() - dataset.set_fea_eval(1, True) dataset.slots_shuffle(["slot1"]) dataset.local_shuffle() - dataset.set_generate_unique_feasigns(True, 15) - dataset.generate_local_tables_unlock(0, 11, 1, 25, 15) + dataset._set_generate_unique_feasigns(True, 15) + dataset._generate_local_tables_unlock(0, 11, 1, 25, 15) exe = fluid.Executor(fluid.CPUPlace()) exe.run(fluid.default_startup_program()) if self.use_data_loader: @@ -300,17 +293,14 @@ class TestDataset(unittest.TestCase): name=slot, shape=[1], 
dtype="float32", lod_level=1) slots_vars.append(var) - dataset = paddle.distributed.fleet.DatasetFactory().create_dataset( - "InMemoryDataset") - dataset.set_batch_size(32) - dataset.set_thread(1) - dataset.set_parse_ins_id(True) + dataset = paddle.distributed.InMemoryDataset() + dataset.init( + batch_size=32, thread_num=1, pipe_command="cat", use_var=slots_vars) + dataset._init_distributed_settings(parse_ins_id=True) dataset.set_filelist([ "test_in_memory_dataset_masterpatch_a.txt", "test_in_memory_dataset_masterpatch_b.txt" ]) - dataset.set_pipe_command("cat") - dataset.set_use_var(slots_vars) dataset.load_into_memory() dataset.local_shuffle() @@ -325,7 +315,8 @@ class TestDataset(unittest.TestCase): except Exception as e: self.assertTrue(False) - dataset.set_merge_by_lineid(2) + #dataset._set_merge_by_lineid(2) + dataset.update_settings(merge_size=2) dataset.dataset.merge_by_lineid() os.remove("./test_in_memory_dataset_masterpatch_a.txt") @@ -367,17 +358,14 @@ class TestDataset(unittest.TestCase): name="slot4", shape=[1], dtype="float32", lod_level=0) slots_vars = [var1, var2, var3, var4] - dataset = paddle.distributed.fleet.DatasetFactory().create_dataset( - "InMemoryDataset") - dataset.set_batch_size(32) - dataset.set_thread(1) - dataset.set_parse_ins_id(True) + dataset = paddle.distributed.InMemoryDataset() + dataset.init( + batch_size=32, thread_num=1, pipe_command="cat", use_var=slots_vars) + dataset._init_distributed_settings(parse_ins_id=True) dataset.set_filelist([ "test_in_memory_dataset_masterpatch1_a.txt", "test_in_memory_dataset_masterpatch1_b.txt" ]) - dataset.set_pipe_command("cat") - dataset.set_use_var(slots_vars) dataset.load_into_memory() dataset.local_shuffle() @@ -392,7 +380,7 @@ class TestDataset(unittest.TestCase): except Exception as e: self.assertTrue(False) - dataset.set_merge_by_lineid(2) + dataset._set_merge_by_lineid(2) dataset.dataset.merge_by_lineid() os.remove("./test_in_memory_dataset_masterpatch1_a.txt") @@ -423,16 +411,13 @@ 
class TestDataset(unittest.TestCase): name=slot, shape=[1], dtype="float32", lod_level=1) slots_vars.append(var) - dataset = paddle.distributed.fleet.DatasetFactory().create_dataset( - "InMemoryDataset") - dataset.set_batch_size(32) - dataset.set_thread(3) + dataset = paddle.distributed.InMemoryDataset() + dataset.init( + batch_size=32, thread_num=3, pipe_command="cat", use_var=slots_vars) dataset.set_filelist([ "test_in_memory_dataset_run_a.txt", "test_in_memory_dataset_run_b.txt" ]) - dataset.set_pipe_command("cat") - dataset.set_use_var(slots_vars) dataset.load_into_memory() dataset.local_shuffle() @@ -473,9 +458,9 @@ class TestDataset(unittest.TestCase): except Exception as e: self.assertTrue(False) - dataset.set_merge_by_lineid(2) - dataset.set_parse_ins_id(False) - dataset.set_fleet_send_sleep_seconds(2) + dataset._set_merge_by_lineid(2) + dataset._set_parse_ins_id(False) + dataset._set_fleet_send_sleep_seconds(2) dataset.preload_into_memory() dataset.wait_preload_done() dataset.release_memory() @@ -483,10 +468,25 @@ class TestDataset(unittest.TestCase): dataset.wait_preload_done() dataset.dataset.merge_by_lineid() dataset.release_memory() - dataset.set_merge_by_lineid(30) - dataset.set_parse_ins_id(False) + dataset._set_merge_by_lineid(30) + dataset._set_parse_ins_id(False) dataset.load_into_memory() dataset.dataset.merge_by_lineid() + dataset.update_settings( + batch_size=1, + thread_num=2, + input_type=1, + pipe_command="cat", + use_var=[], + fs_name="", + fs_ugi="", + download_cmd="cat", + merge_size=-1, + parse_ins_id=False, + parse_content=False, + fleet_send_batch_size=2, + fleet_send_sleep_seconds=2, + fea_eval=True) fleet_ptr = fluid.core.Fleet() fleet_ptr.set_client2client_config(1, 1, 1) fleet_ptr.get_cache_threshold(0) @@ -517,14 +517,11 @@ class TestDataset(unittest.TestCase): name=slot, shape=[1], dtype="int64", lod_level=1) slots_vars.append(var) - dataset = paddle.distributed.fleet.DatasetFactory().create_dataset( - "QueueDataset") - 
dataset.set_batch_size(32) - dataset.set_thread(3) + dataset = paddle.distributed.QueueDataset() + dataset.init( + batch_size=32, thread_num=3, pipe_command="cat", use_var=slots_vars) dataset.set_filelist( ["test_queue_dataset_run_a.txt", "test_queue_dataset_run_b.txt"]) - dataset.set_pipe_command("cat") - dataset.set_use_var(slots_vars) exe = fluid.Executor(fluid.CPUPlace()) exe.run(fluid.default_startup_program()) @@ -543,12 +540,9 @@ class TestDataset(unittest.TestCase): except Exception as e: self.assertTrue(False) - dataset2 = paddle.distributed.fleet.DatasetFactory().create_dataset( - "QueueDataset") - dataset2.set_use_var(slots_vars) - dataset2.set_batch_size(32) - dataset2.set_thread(3) - dataset2.set_pipe_command("cat") + dataset2 = paddle.distributed.QueueDataset() + dataset2.init( + batch_size=32, thread_num=3, pipe_command="cat", use_var=slots_vars) dataset.set_filelist([]) try: exe.train_from_dataset(fluid.default_main_program(), dataset2) @@ -585,14 +579,11 @@ class TestDataset(unittest.TestCase): name=slot, shape=[1], dtype="float32", lod_level=1) slots_vars.append(var) - dataset = paddle.distributed.fleet.DatasetFactory().create_dataset( - "QueueDataset") - dataset.set_batch_size(32) - dataset.set_thread(3) + dataset = paddle.distributed.QueueDataset() + dataset.init( + batch_size=32, thread_num=3, pipe_command="cat", use_var=slots_vars) dataset.set_filelist( ["test_queue_dataset_run_a.txt", "test_queue_dataset_run_b.txt"]) - dataset.set_pipe_command("cat") - dataset.set_use_var(slots_vars) exe = fluid.Executor(fluid.CPUPlace() if not core.is_compiled_with_cuda( ) else fluid.CUDAPlace(0)) @@ -641,15 +632,15 @@ class TestDataset(unittest.TestCase): name=slot, shape=[None, 1], dtype="int64", lod_level=1) slots_vars.append(var) - dataset = paddle.distributed.fleet.DatasetFactory().create_dataset( - "InMemoryDataset") - dataset.set_input_type(1) - dataset.set_batch_size(1) - dataset.set_thread(2) + dataset = paddle.distributed.InMemoryDataset() + 
dataset.init( + batch_size=1, + thread_num=2, + input_type=1, + pipe_command="cat", + use_var=slots_vars) dataset.set_filelist( ["test_queue_dataset_run_a.txt", "test_queue_dataset_run_b.txt"]) - dataset.set_pipe_command("cat") - dataset.set_use_var(slots_vars) dataset.load_into_memory() exe = fluid.Executor(fluid.CPUPlace() if not core.is_compiled_with_cuda( @@ -721,13 +712,10 @@ class TestDatasetWithFetchHandler(unittest.TestCase): inputs(list): inputs of get_dataset files(list): files of get_dataset """ - dataset = paddle.distributed.fleet.DatasetFactory().create_dataset( - "QueueDataset") - dataset.set_batch_size(32) - dataset.set_thread(3) + dataset = paddle.distributed.QueueDataset() + dataset.init( + batch_size=32, thread_num=3, pipe_command="cat", use_var=inputs) dataset.set_filelist(files) - dataset.set_pipe_command("cat") - dataset.set_use_var(inputs) return dataset def setUp(self): @@ -879,16 +867,17 @@ class TestDataset2(unittest.TestCase): except ImportError as e: print("warning: no mpi4py") exe.run(startup_program) - dataset = paddle.distributed.fleet.DatasetFactory().create_dataset( - "InMemoryDataset") - dataset.set_batch_size(32) - dataset.set_thread(3) + dataset = paddle.distributed.InMemoryDataset() + + dataset.init( + batch_size=32, + thread_num=3, + pipe_command="cat", + use_var=slots_vars) dataset.set_filelist([ "test_in_memory_dataset2_run_a.txt", "test_in_memory_dataset2_run_b.txt" ]) - dataset.set_pipe_command("cat") - dataset.set_use_var(slots_vars) dataset.load_into_memory() fleet._opt_info = None fleet._fleet_ptr = None @@ -949,16 +938,16 @@ class TestDataset2(unittest.TestCase): except ImportError as e: print("warning: no mpi4py") exe.run(startup_program) - dataset = paddle.distributed.fleet.DatasetFactory().create_dataset( - "InMemoryDataset") - dataset.set_batch_size(32) - dataset.set_thread(3) + dataset = paddle.distributed.InMemoryDataset() + dataset.init( + batch_size=32, + thread_num=3, + pipe_command="cat", + use_var=slots_vars) 
dataset.set_filelist([ "test_in_memory_dataset2_run2_a.txt", "test_in_memory_dataset2_run2_b.txt" ]) - dataset.set_pipe_command("cat") - dataset.set_use_var(slots_vars) dataset.load_into_memory() try: dataset.global_shuffle(fleet) @@ -966,14 +955,11 @@ class TestDataset2(unittest.TestCase): print("warning: catch expected error") fleet._opt_info = None fleet._fleet_ptr = None - dataset = paddle.distributed.fleet.DatasetFactory().create_dataset( - "InMemoryDataset") - dataset.set_rank_offset("") - dataset.set_pv_batch_size(1) - dataset.set_hdfs_config("", "") + dataset = paddle.distributed.InMemoryDataset() + dataset.init(fs_name="", fs_ugi="") d = paddle.distributed.fleet.DatasetBase() try: - dataset.set_feed_type("MultiSlotInMemoryDataFeed") + dataset._set_feed_type("MultiSlotInMemoryDataFeed") except: print("warning: catch expected error") dataset.thread_num = 0 @@ -981,9 +967,6 @@ class TestDataset2(unittest.TestCase): dataset._prepare_to_run() except: print("warning: catch expected error") - dataset.set_parse_logkey(True) - dataset.set_merge_by_sid(True) - dataset.set_enable_pv_merge(True) try: dataset.preprocess_instance() except: @@ -996,16 +979,15 @@ class TestDataset2(unittest.TestCase): dataset.postprocess_instance() except: print("warning: catch expected error") - dataset.set_fleet_send_batch_size(1024) + dataset._set_fleet_send_batch_size(1024) try: dataset.global_shuffle() except: print("warning: catch expected error") - dataset.get_pv_data_size() + #dataset.get_pv_data_size() dataset.get_memory_data_size() dataset.get_shuffle_data_size() - dataset = paddle.distributed.fleet.DatasetFactory().create_dataset( - "QueueDataset") + dataset = paddle.distributed.QueueDataset() try: dataset.local_shuffle() except: @@ -1027,6 +1009,120 @@ class TestDataset2(unittest.TestCase): os.remove("./test_in_memory_dataset2_run2_a.txt") os.remove("./test_in_memory_dataset2_run2_b.txt") + def test_bosps_dataset_fleet2(self): + """ + Testcase for InMemoryDataset from create 
to run. + """ + with open("test_in_memory_dataset2_run2_a.txt", "w") as f: + data = "1 1 2 3 3 4 5 5 5 5 1 1\n" + data += "1 2 2 3 4 4 6 6 6 6 1 2\n" + data += "1 3 2 3 5 4 7 7 7 7 1 3\n" + f.write(data) + with open("test_in_memory_dataset2_run2_b.txt", "w") as f: + data = "1 4 2 3 3 4 5 5 5 5 1 4\n" + data += "1 5 2 3 4 4 6 6 6 6 1 5\n" + data += "1 6 2 3 5 4 7 7 7 7 1 6\n" + data += "1 7 2 3 6 4 8 8 8 8 1 7\n" + f.write(data) + + train_program = fluid.Program() + startup_program = fluid.Program() + scope = fluid.Scope() + from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet + with fluid.program_guard(train_program, startup_program): + slots = ["slot1_ff", "slot2_ff", "slot3_ff", "slot4_ff"] + slots_vars = [] + for slot in slots: + var = fluid.layers.data(\ + name=slot, shape=[1], dtype="float32", lod_level=1) + slots_vars.append(var) + fake_cost = \ + fluid.layers.elementwise_sub(slots_vars[0], slots_vars[-1]) + fake_cost = fluid.layers.mean(fake_cost) + with fluid.scope_guard(scope): + place = fluid.CPUPlace() + exe = fluid.Executor(place) + try: + fleet.init() + except ImportError as e: + print("warning: no mpi4py") + adam = fluid.optimizer.Adam(learning_rate=0.000005) + try: + adam = fleet.distributed_optimizer( + adam, + strategy={ + "fs_uri": "fs_uri_xxx", + "fs_user": "fs_user_xxx", + "fs_passwd": "fs_passwd_xxx", + "fs_hadoop_bin": "fs_hadoop_bin_xxx" + }) + adam.minimize([fake_cost], [scope]) + except AttributeError as e: + print("warning: no mpi") + except ImportError as e: + print("warning: no mpi4py") + exe.run(startup_program) + dataset = paddle.distributed.fleet.BoxPSDataset() + dataset.init( + batch_size=32, + thread_num=3, + pipe_command="cat", + use_var=slots_vars) + dataset.set_filelist([ + "test_in_memory_dataset2_run2_a.txt", + "test_in_memory_dataset2_run2_b.txt" + ]) + dataset.load_into_memory() + try: + dataset.global_shuffle(fleet) + except: + print("warning: catch expected error") + fleet._opt_info = None + fleet._fleet_ptr 
= None + dataset = paddle.distributed.fleet.BoxPSDataset() + dataset.init( + rank_offset="", + pv_batch_size=1, + fs_name="", + fs_ugi="", + data_feed_type="MultiSlotInMemoryDataFeed", + parse_logkey=True, + merge_by_sid=True, + enable_pv_merge=True) + d = paddle.distributed.fleet.DatasetBase() + try: + dataset._set_feed_type("MultiSlotInMemoryDataFeed") + except: + print("warning: catch expected error") + dataset.thread_num = 0 + try: + dataset._prepare_to_run() + except: + print("warning: catch expected error") + dataset._set_parse_logkey(True) + dataset._set_merge_by_sid(True) + dataset._set_enable_pv_merge(True) + try: + dataset.preprocess_instance() + except: + print("warning: catch expected error") + try: + dataset.set_current_phase(1) + except: + print("warning: catch expected error") + try: + dataset.postprocess_instance() + except: + print("warning: catch expected error") + dataset._set_fleet_send_batch_size(1024) + try: + dataset.global_shuffle() + except: + print("warning: catch expected error") + #dataset.get_pv_data_size() + dataset.get_memory_data_size() + dataset.get_shuffle_data_size() + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dataset_dataloader.py b/python/paddle/fluid/tests/unittests/test_dataset_dataloader.py index c13c33f209f0f7d0fff95bdfb5b4e551a145b87e..9195ac277b93ade31b50682a4c3553c3664093f3 100644 --- a/python/paddle/fluid/tests/unittests/test_dataset_dataloader.py +++ b/python/paddle/fluid/tests/unittests/test_dataset_dataloader.py @@ -97,9 +97,11 @@ class DatasetLoaderTestBase(unittest.TestCase): def check_batch_number(self, place, randomize_batch_num=False): main_prog, startup_prog, feeds = self.build_network() - dataset = paddle.distributed.fleet.DatasetFactory().create_dataset( - self.dataset_name) - dataset.set_batch_size(BATCH_SIZE) + if self.dataset_name == "QueueDataset": + dataset = paddle.distributed.QueueDataset() + else: + dataset = paddle.distributed.InMemoryDataset() + 
dataset._set_batch_size(BATCH_SIZE) if isinstance(place, fluid.CPUPlace): file_num = 10 @@ -128,8 +130,8 @@ class DatasetLoaderTestBase(unittest.TestCase): fake_reader(batch_num=BATCH_NUM + random_delta_batch_size[i])) dataset.set_filelist(filelist) - dataset.set_use_var(feeds) - dataset.set_pipe_command("cat") + dataset._set_use_var(feeds) + dataset._set_pipe_command("cat") if self.dataset_name == 'InMemoryDataset': dataset.load_into_memory() diff --git a/python/paddle/fluid/tests/unittests/test_decoupled_py_reader_data_check.py b/python/paddle/fluid/tests/unittests/test_decoupled_py_reader_data_check.py index 4d767709ef56f11d6790c85206b544d63883841e..b2cb3141aad48ddb59887b99a7d02ce56ca74493 100644 --- a/python/paddle/fluid/tests/unittests/test_decoupled_py_reader_data_check.py +++ b/python/paddle/fluid/tests/unittests/test_decoupled_py_reader_data_check.py @@ -37,7 +37,7 @@ class TestClass(unittest.TestCase): low=0, high=9, size=label_shape).astype('int64') yield img, label - reader = fluid.io.cache(fake_reader) + reader = paddle.reader.cache(fake_reader) batch_reader = fluid.io.batch(reader, batch_size=batch_size) places = [fluid.CPUPlace()] diff --git a/python/paddle/fluid/tests/unittests/test_deprecated_decorator.py b/python/paddle/fluid/tests/unittests/test_deprecated_decorator.py new file mode 100755 index 0000000000000000000000000000000000000000..2a80e20d692c88497e7edccd6eca5509e3522871 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_deprecated_decorator.py @@ -0,0 +1,152 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import paddle +import numpy as np +import paddle +import paddle.fluid as fluid +from paddle.static import Program, program_guard +import unittest +import paddle.fluid.core as core +import sys + +LOWEST_WARNING_POSTION = 3 +ERROR_WARNING_POSTION = sys.maxsize + +# custom paddle version +paddle.version.major = '1' +paddle.version.minor = '8' +paddle.version.patch = '0' +paddle.version.rc = '0' +paddle.__version__ = '1.8.0' +paddle.version.full_version = '1.8.0' +print("current paddle version: ", paddle.__version__) + +paddle.disable_static() + + +def get_warning_index(api): + """ + Given an paddle API, return the index of the Warinng information in its doc string if exists; + If Warinng information doesn't exist, return the default ERROR_WARNING_POSTION, sys.maxsize. + + Args: + API (python object) + + Returns: + index (int): the index of the Warinng information in its doc string if exists. + """ + + doc_lst = api.__doc__.splitlines() + for idx, val in enumerate(doc_lst): + if val.startswith("Warning: ") and val.endswith( + " instead." + ) and "and will be removed in future versions." in val: + return idx + return ERROR_WARNING_POSTION + + +class TestDeprecatedDocorator(unittest.TestCase): + """ + tests for paddle's Deprecated Docorator. + test_fluid_data: test for old fluid.data API. + test_fluid_elementwise_mul: test for old fluid.layers.elementwise_xxx APIs. + test_new_multiply: test for new api, which should not insert warning information. 
+ test_ops_elementwise_mul: test for C++ elementwise_mul op, which should not insert warning information. + """ + + def test_fluid_data(self): + """ + test old fluid elementwise_mul api, it should fire Warinng function, + which insert the Warinng info on top of API's doc string. + """ + # Initialization + x = fluid.data(name='x', shape=[3, 2, 1], dtype='float32') + + # expected + expected = LOWEST_WARNING_POSTION + + # captured + captured = get_warning_index(fluid.data) + + # testting + self.assertGreater(expected, captured) + + def test_fluid_elementwise_mul(self): + """ + test old fluid elementwise_mul api, it should trigger Warinng function, + which insert the Warinng info on top of API's doc string. + """ + + # Initialization + a = np.random.uniform(0.1, 1, [51, 76]).astype(np.float32) + b = np.random.uniform(0.1, 1, [51, 76]).astype(np.float32) + x = paddle.to_tensor(a) + y = paddle.to_tensor(b) + res = fluid.layers.elementwise_mul(x, y) + + # expected + expected = LOWEST_WARNING_POSTION + + # captured + captured = get_warning_index(fluid.layers.elementwise_mul) + + # testting + self.assertGreater(expected, captured) + + def test_new_multiply(self): + """ + Test for new multiply api, expected result should be False. + """ + + a = np.random.uniform(0.1, 1, [51, 76]).astype(np.float32) + b = np.random.uniform(0.1, 1, [51, 76]).astype(np.float32) + x = paddle.to_tensor(a) + y = paddle.to_tensor(b) + res = paddle.multiply(x, y) + + # expected + expected = LOWEST_WARNING_POSTION + + # captured + captured = get_warning_index(paddle.multiply) + + # testting + self.assertLess(expected, captured) + + def test_ops_elementwise_mul(self): + """ + Test for new C++ elementwise_op, expected result should be True, + because not matter what fluid.layers.elementwise_mul is deprecated. 
+ """ + + a = np.random.uniform(0.1, 1, [51, 76]).astype(np.float32) + b = np.random.uniform(0.1, 1, [51, 76]).astype(np.float32) + x = paddle.to_tensor(a) + y = paddle.to_tensor(b) + res = core.ops.elementwise_mul(x, y) + + # expected + expected = LOWEST_WARNING_POSTION + + # captured + captured = get_warning_index(fluid.layers.elementwise_mul) + + # testting + self.assertGreater(expected, captured) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_device_guard.py b/python/paddle/fluid/tests/unittests/test_device_guard.py index eb8861f1bc462e61d4bd685fca39452bfd8a8c4e..330065ecd92f1537e10df7b993682da821a46548 100644 --- a/python/paddle/fluid/tests/unittests/test_device_guard.py +++ b/python/paddle/fluid/tests/unittests/test_device_guard.py @@ -33,6 +33,14 @@ def execute(main_program, startup_program): exe.run(main_program) +def get_vaild_warning_num(warning, w): + num = 0 + for i in range(len(w)): + if warning in str(w[i].message): + num += 1 + return num + + class TestDeviceGuard(unittest.TestCase): def test_device_guard(self): main_program = fluid.Program() @@ -133,7 +141,10 @@ class TestDeviceGuard(unittest.TestCase): i = fluid.layers.increment(x=i, value=1, in_place=True) fluid.layers.less_than(x=i, y=loop_len, cond=cond) - assert len(w) == 1 + warning = "The Op(while) is not support to set device." + warning_num = get_vaild_warning_num(warning, w) + assert warning_num == 1 + all_ops = main_program.global_block().ops device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName() for op in all_ops: @@ -169,7 +180,10 @@ class TestDeviceGuard(unittest.TestCase): shape=[1], value=4.0, dtype='float32') result = fluid.layers.less_than(x=x, y=y, force_cpu=False) - assert len(w) == 2 + warning = "\'device_guard\' has higher priority when they are used at the same time." 
+ warning_num = get_vaild_warning_num(warning, w) + assert warning_num == 2 + all_ops = main_program.global_block().ops device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName() for op in all_ops: diff --git a/python/paddle/fluid/tests/unittests/test_directory_migration.py b/python/paddle/fluid/tests/unittests/test_directory_migration.py index 2919ec5e9ca97b1d59af46a54b2d702cb6de4a14..529fff158c55fc30248b9f5a88c8c615a8b55c79 100644 --- a/python/paddle/fluid/tests/unittests/test_directory_migration.py +++ b/python/paddle/fluid/tests/unittests/test_directory_migration.py @@ -43,7 +43,7 @@ class TestDirectory(unittest.TestCase): 'paddle.distributed.prepare_context', 'paddle.DataParallel', 'paddle.jit', 'paddle.jit.TracedLayer', 'paddle.jit.to_static', 'paddle.jit.ProgramTranslator', 'paddle.jit.TranslatedLayer', - 'paddle.jit.save', 'paddle.jit.load', 'paddle.jit.SaveLoadConfig', + 'paddle.jit.save', 'paddle.jit.load', 'paddle.SaveLoadConfig', 'paddle.NoamDecay', 'paddle.PiecewiseDecay', 'paddle.NaturalExpDecay', 'paddle.ExponentialDecay', 'paddle.InverseTimeDecay', 'paddle.PolynomialDecay', diff --git a/python/paddle/fluid/tests/unittests/test_dist_allreduce_op.py b/python/paddle/fluid/tests/unittests/test_dist_allreduce_op.py index fbeff20c63b2f4a3f01ac4131ac7063aff0204cf..2adf6e41931816688051132ee38215814a427378 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_allreduce_op.py +++ b/python/paddle/fluid/tests/unittests/test_dist_allreduce_op.py @@ -15,6 +15,9 @@ from __future__ import print_function import unittest from test_dist_base import TestDistBase +import paddle + +paddle.enable_static() class TestDistMnistNCCL2(TestDistBase): diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index faff81fa84fb5fa66c9ff14f782d2301e3964672..f4d368b6b6f52f3071320eaffbeedc8d14d63d2e 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ 
b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -488,6 +488,50 @@ class TestParallelDyGraphRunnerBase(object): model.clear_gradients() return out_losses + def run_gpu_fleet_api_trainer(self, args): + import paddle.distributed.fleet as fleet + import paddle.distributed.fleet.base.role_maker as role_maker + # 1. enable dygraph + paddle.disable_static() + + # 2. init seed + seed = 90 + paddle.static.default_startup_program().random_seed = seed + paddle.static.default_main_program().random_seed = seed + np.random.seed(seed) + random.seed = seed + # get trainer id + args.trainer_id = paddle.distributed.get_rank() + + # 3. init parallel env + if args.update_method == "nccl2": + fleet.init(is_collective=True) + + # 4. train model + model, train_reader, opt = self.get_model() + if args.update_method == "nccl2": + opt = fleet.distributed_optimizer(opt) + model = fleet.distributed_model(model) + + out_losses = [] + for step_id, data in enumerate(train_reader()): + data = self._get_data(data, args) + if step_id == RUN_STEP: + break + loss = self.run_one_loop(model, opt, data) + out_losses.append(loss.numpy()) + + if args.update_method == "nccl2": + loss = model.scale_loss(loss) + + loss.backward() + if args.update_method == "nccl2": + model.apply_collective_grads() + + opt.step() + opt.clear_grad() + print_to_out(out_losses) + def runtime_main(test_class): parser = argparse.ArgumentParser(description='Run dist test.') @@ -687,7 +731,8 @@ class TestDistBase(unittest.TestCase): envs['COVERAGE_FILE'] = os.getenv('COVERAGE_FILE', '') cmd += " -m coverage run --branch -p" - cmd += " %s --role trainer --lr %f" % (model, self._lr) + cmd += " %s --role trainer --update_method local --lr %f" % (model, + self._lr) if batch_size != DEFAULT_BATCH_SIZE: cmd += " --batch_size %d" % batch_size @@ -850,6 +895,7 @@ class TestDistBase(unittest.TestCase): if self.__use_cuda: tr_cmd += " --use_cuda" env.update({ + "FLAGS_selected_gpus": "{}".format(0), "CUDA_VISIBLE_DEVICES": 
"{}".format(trainer_id % 2), "PADDLE_TRAINERS_NUM": "{}".format(trainer_num), "PADDLE_TRAINER_ID": "{}".format(trainer_id), diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py index 9df55a6b873e28a6e479fd05b31074802eb19bb7..a82612b0ed2a6700dd157ddd6263cae2a879c274 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py @@ -19,6 +19,8 @@ import unittest import paddle import paddle.distributed.fleet.base.role_maker as role_maker +paddle.enable_static() + class TestFleetGradientMergeMetaOptimizer(unittest.TestCase): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto.py new file mode 100644 index 0000000000000000000000000000000000000000..5b7e0fb94c662f4aa47fbaad964e03c576c97807 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto.py @@ -0,0 +1,69 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import paddle +import os +import paddle.distributed.fleet.base.role_maker as role_maker +import time + +paddle.enable_static() + + +class TestFleetGradientMergeMetaOptimizer(unittest.TestCase): + def setUp(self): + os.environ["PADDLE_PSERVER_NUMS"] = "2" + os.environ["PADDLE_TRAINERS_NUM"] = "2" + os.environ["POD_IP"] = "127.0.0.1" + os.environ["PADDLE_PORT"] = "36001" + os.environ["PADDLE_TRAINER_ID"] = "0" + os.environ["PADDLE_TRAINERS_NUM"] = "2" + os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \ + "127.0.0.1:36001,127.0.0.2:36001" + + def test_a_sync_optimizer1(self): + os.environ["TRAINING_ROLE"] = "TRAINER" + import paddle.distributed.fleet as fleet + + main_program = paddle.fluid.Program() + startup_program = paddle.fluid.Program() + + paddle.fluid.framework.switch_main_program(main_program) + paddle.fluid.framework.switch_startup_program(startup_program) + + fleet.init(role_maker.PaddleCloudRoleMaker()) + input_x = paddle.fluid.layers.data( + name="x", shape=[32], dtype='float32') + input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64') + + fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') + fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh') + prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') + cost = paddle.fluid.layers.cross_entropy( + input=prediction, label=input_y) + avg_cost = paddle.fluid.layers.mean(x=cost) + + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.auto = True + optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + + self.assertTrue(optimizer.user_defined_strategy.a_sync) + a_sync_configs = optimizer.user_defined_strategy.a_sync_configs + self.assertTrue(a_sync_configs['k_steps'] == 0) + + +if __name__ == "__main__": + unittest.main() diff --git 
a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_async.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_async.py new file mode 100644 index 0000000000000000000000000000000000000000..3dff9d0f9d82530cade09a737d448fca4bf4f960 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_async.py @@ -0,0 +1,81 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import paddle +import os +import paddle.distributed.fleet.base.role_maker as role_maker +import time + +paddle.enable_static() + + +class TestFleetGradientMergeMetaOptimizer(unittest.TestCase): + def setUp(self): + os.environ["PADDLE_PSERVER_NUMS"] = "2" + os.environ["PADDLE_TRAINERS_NUM"] = "2" + os.environ["POD_IP"] = "127.0.0.1" + os.environ["PADDLE_PORT"] = "36001" + os.environ["PADDLE_TRAINER_ID"] = "0" + os.environ["PADDLE_TRAINERS_NUM"] = "2" + os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \ + "127.0.0.1:36001,127.0.0.2:36001" + + def test_a_sync_optimizer3(self): + os.environ["TRAINING_ROLE"] = "TRAINER" + import paddle.distributed.fleet as fleet + + main_program = paddle.fluid.Program() + startup_program = paddle.fluid.Program() + + paddle.fluid.framework.switch_main_program(main_program) + paddle.fluid.framework.switch_startup_program(startup_program) + + fleet.init(role_maker.PaddleCloudRoleMaker()) + input_x = paddle.fluid.layers.data( + name="x", + shape=[-1, 1], + dtype="int64", + lod_level=1, + append_batch_size=False) + x_embedding = paddle.fluid.layers.embedding( + is_distributed=False, + input=input_x, + size=[1000000000, 100000], + param_attr=paddle.fluid.ParamAttr( + name="embedding", + initializer=paddle.fluid.initializer.Constant(value=0.01)), + is_sparse=True) + input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64') + + fc_1 = paddle.fluid.layers.fc(input=x_embedding, size=64, act='tanh') + fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh') + prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') + cost = paddle.fluid.layers.cross_entropy( + input=prediction, label=input_y) + avg_cost = paddle.fluid.layers.mean(x=cost) + + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.auto = True + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + + 
self.assertTrue(optimizer.user_defined_strategy.a_sync) + a_sync_configs = optimizer.user_defined_strategy.a_sync_configs + self.assertTrue(a_sync_configs['k_steps'] == 0) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_geo.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_geo.py new file mode 100644 index 0000000000000000000000000000000000000000..bdfa3a9a7d57869466b895f23674b6e8ef83310f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_geo.py @@ -0,0 +1,69 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import paddle +import os +import paddle.distributed.fleet.base.role_maker as role_maker +import time + +paddle.enable_static() + + +class TestFleetGradientMergeMetaOptimizer(unittest.TestCase): + def setUp(self): + os.environ["PADDLE_PSERVER_NUMS"] = "2" + os.environ["PADDLE_TRAINERS_NUM"] = "2" + os.environ["POD_IP"] = "127.0.0.1" + os.environ["PADDLE_PORT"] = "36001" + os.environ["PADDLE_TRAINER_ID"] = "0" + os.environ["PADDLE_TRAINERS_NUM"] = "2" + os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \ + "127.0.0.1:36001,127.0.0.2:36001" + + def test_a_sync_optimizer2(self): + os.environ["TRAINING_ROLE"] = "TRAINER" + import paddle.distributed.fleet as fleet + + main_program = paddle.fluid.Program() + startup_program = paddle.fluid.Program() + + paddle.fluid.framework.switch_main_program(main_program) + paddle.fluid.framework.switch_startup_program(startup_program) + + fleet.init(role_maker.PaddleCloudRoleMaker()) + input_x = paddle.fluid.layers.data( + name="x", shape=[32], dtype='float32') + input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64') + + fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') + fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh') + prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') + cost = paddle.fluid.layers.cross_entropy( + input=prediction, label=input_y) + avg_cost = paddle.fluid.layers.mean(x=cost) + + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.auto = True + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + + self.assertTrue(optimizer.user_defined_strategy.a_sync) + a_sync_configs = optimizer.user_defined_strategy.a_sync_configs + self.assertTrue(a_sync_configs['k_steps'] == 800) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_geo.py 
b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_geo.py index 59ca41a11e325cfb66a3a3eaadb4eca6f9764212..db73069bf7d42ac008f14b804bd7d31b808d92b9 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_geo.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_geo.py @@ -18,6 +18,8 @@ import os import paddle.distributed.fleet.base.role_maker as role_maker import time +paddle.enable_static() + class TestFleetGradientMergeMetaOptimizer(unittest.TestCase): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py index e0993e022e1b9570773634ec829b088c5ff145ea..b05a53c88bb9154b69640df6c39305a00e3c447b 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py @@ -19,6 +19,8 @@ import paddle.distributed.fleet as fleet import paddle.distributed.fleet.base.role_maker as role_maker import time +paddle.enable_static() + class TestFleetGradientMergeMetaOptimizer(unittest.TestCase): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py index beb0069eb770f25d7834749ff9c188e5252e13c0..3a923dbf3f72e28c64c3f01d22d4d6f2d897f79b 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py @@ -76,9 +76,10 @@ class FleetDistRunnerBase(object): return role def build_strategy(self, args): - self.strategy = paddle.distributed.fleet.DistributedStrategy() - self.strategy.a_sync = False - if args.mode == "async": + if args.mode == "sync": + self.strategy = paddle.distributed.fleet.DistributedStrategy() + self.strategy.a_sync = False + elif args.mode == "async": self.strategy = 
paddle.distributed.fleet.DistributedStrategy() self.strategy.a_sync = True elif args.mode == "geo": @@ -87,6 +88,10 @@ class FleetDistRunnerBase(object): self.strategy.a_sync_configs = { "k_steps": args.geo_sgd_need_push_nums } + elif args.mode == "auto": + self.strategy = paddle.distributed.fleet.DistributedStrategy() + self.strategy.auto = True + self.dump_param = os.getenv("dump_param", "").split(",") self.dump_fields = os.getenv("dump_fields", "").split(",") self.dump_fields_path = os.getenv("dump_fields_path", "") @@ -232,14 +237,17 @@ class TestFleetBase(unittest.TestCase): tr0_pipe = open(tempfile.gettempdir() + "/tr0_err.log", "wb+") tr1_pipe = open(tempfile.gettempdir() + "/tr1_err.log", "wb+") + tr0_out = open(tempfile.gettempdir() + "/tr0_stdout.log", "wb+") + tr1_out = open(tempfile.gettempdir() + "/tr1_stdout.log", "wb+") + tr0_proc = subprocess.Popen( tr0_cmd.strip().split(" "), - stdout=subprocess.PIPE, + stdout=tr0_out, stderr=tr0_pipe, env=required_envs) tr1_proc = subprocess.Popen( tr1_cmd.strip().split(" "), - stdout=subprocess.PIPE, + stdout=tr1_out, stderr=tr1_pipe, env=required_envs) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py index e2336caac1c07f555280b82ba8fcfa7e5ec7f5b8..02ca0588e7452d44817f6c288ea9cf77b80dbfe8 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py @@ -52,6 +52,38 @@ class TestDistMnistSync2x2(TestFleetBase): "dist_fleet_ctr.py", delta=1e-5, check_error_log=True) +class TestDistMnistAuto2x2(TestFleetBase): + def _setup_config(self): + self._mode = "auto" + self._reader = "pyreader" + + def check_with_place(self, + model_file, + delta=1e-3, + check_error_log=False, + need_envs={}): + required_envs = { + "PATH": os.getenv("PATH", ""), + "PYTHONPATH": os.getenv("PYTHONPATH", ""), + "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""), + "FLAGS_rpc_deadline": 
"5000", # 5sec to fail fast + "http_proxy": "", + "CPU_NUM": "2" + } + + required_envs.update(need_envs) + + if check_error_log: + required_envs["GLOG_v"] = "3" + required_envs["GLOG_logtostderr"] = "1" + + tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs) + + def test_dist_train(self): + self.check_with_place( + "dist_fleet_ctr.py", delta=1e-5, check_error_log=True) + + class TestDistMnistAsync2x2(TestFleetBase): def _setup_config(self): self._mode = "async" diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_geo.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_geo.py index 7d18e935f58b6588adbef913c10d3ad497f07b53..82a8f46a945b9d97a7c6c662f11edf82fbc68111 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_geo.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_geo.py @@ -22,6 +22,9 @@ from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import f from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory from test_dist_fleet_base import TestFleetBase from dist_fleet_simnet_bow import train_network +import paddle + +paddle.enable_static() class TestDistGeoCtr_2x2(TestFleetBase): diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_ctr.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_ctr.py index c3ffd50dc8da16f4a19c8da5383fe7f763aa7a72..b3e38a421287611c43bb82d93b4df166e23f6484 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_ctr.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_ctr.py @@ -18,6 +18,9 @@ import os import unittest import tempfile from test_dist_fleet_heter_base import TestFleetHeterBase +import paddle + +paddle.enable_static() class TestDistHeterDatasetAsync2x2(TestFleetHeterBase): @@ -36,13 +39,45 @@ class TestDistHeterDatasetAsync2x2(TestFleetHeterBase): "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""), "FLAGS_rpc_deadline": "5000", # 5sec 
to fail fast "http_proxy": "", - "CPU_NUM": "1" + "CPU_NUM": "3" + } + + required_envs.update(need_envs) + + if check_error_log: + required_envs["GLOG_v"] = "3" + required_envs["GLOG_logtostderr"] = "1" + + tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs) + + def test_dist_train(self): + self.check_with_place( + "dist_fleet_heter_ctr.py", delta=1e-5, check_error_log=True) + + +class TestDistHeterPyreaderAsync2x2(TestFleetHeterBase): + def _setup_config(self): + self._mode = "async" + self._reader = "pyreader" + + def check_with_place(self, + model_file, + delta=1e-3, + check_error_log=False, + need_envs={}): + required_envs = { + "PATH": os.getenv("PATH", ""), + "PYTHONPATH": os.getenv("PYTHONPATH", ""), + "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""), + "FLAGS_rpc_deadline": "5000", # 5sec to fail fast + "http_proxy": "", + "CPU_NUM": "3" } required_envs.update(need_envs) if check_error_log: - required_envs["GLOG_v"] = "4" + required_envs["GLOG_v"] = "3" required_envs["GLOG_logtostderr"] = "1" tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_program.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_program.py index 3369039661205ef78a3ec0254241c3ed80b771a9..00301f9b1c61dd12dc993e0b4c735479fe16daed 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_program.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_program.py @@ -21,6 +21,9 @@ import paddle.fluid as fluid import paddle.distributed.fleet.base.role_maker as role_maker from paddle.distributed.fleet.base.util_factory import fleet_util from paddle.distributed.fleet import fleet +import paddle + +paddle.enable_static() class TestDistFleetHeterProgram(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps.py index 
8132add37a673d9035ca108cc124f075b53226f1..d766e6bf2af714e04c6a04d8a8e627bcc631cee9 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps.py @@ -19,6 +19,9 @@ import paddle.fluid as fluid import paddle.fluid.incubate.fleet.base.role_maker as role_maker from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory +import paddle + +paddle.enable_static() # For Net base_lr = 0.2 diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py index e7b10be2349cce755267297025ca8520b6d494ee..218eb77d0b5653fb80bceae6714f85f2674df6cb 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py @@ -24,6 +24,8 @@ import paddle.fluid as fluid import paddle.distributed.fleet.base.role_maker as role_maker import paddle.distributed.fleet as fleet +paddle.enable_static() + # For Net base_lr = 0.2 emb_lr = base_lr * 3 diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps3.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps3.py index de4363f255ba8fd80b7caea11a03a28899c1c9e7..8d101a34b68e4b9b84caa7de8921bd1096e71944 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps3.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps3.py @@ -19,6 +19,9 @@ import paddle.fluid as fluid import paddle.fluid.incubate.fleet.base.role_maker as role_maker from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory +import paddle + +paddle.enable_static() # For Net base_lr = 0.2 diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py 
b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py index dc40b2eb5c6480fb22f28c66e2b8205575269b66..379bcaf684d53c2c72f6369e72418cdaaaf3ac84 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps4.py @@ -19,6 +19,9 @@ import paddle.fluid as fluid import paddle.fluid.incubate.fleet.base.role_maker as role_maker from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory +import paddle + +paddle.enable_static() # For Net base_lr = 0.2 diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py index 5e525bdb54d4c2c00a96075533d0c0cd6074fe1b..fd069793473648a0dff731d66c85bd3fe61997c7 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps5.py @@ -19,6 +19,9 @@ import paddle.fluid as fluid import paddle.fluid.incubate.fleet.base.role_maker as role_maker from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory +import paddle + +paddle.enable_static() # For Net base_lr = 0.2 diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_simnet.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_simnet.py index ec34993905e3cfc4603ac48987a690b7fa8a5439..e0fa590db2abdd3d3c0ccaca2d599e66c75102ba 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_simnet.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_simnet.py @@ -18,6 +18,9 @@ import os import unittest import tempfile from test_dist_fleet_base import TestFleetBase +import paddle + +paddle.enable_static() class TestDistSimnetASync2x2(TestFleetBase): diff --git 
a/python/paddle/fluid/tests/unittests/test_dist_mnist_backward_deps.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_backward_deps.py index 1f6274ec16488323c9f7e6b14a94e0d9182d7aca..23a2b8fd306070083a0fbec11c0709748b6ed6ac 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_mnist_backward_deps.py +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_backward_deps.py @@ -15,6 +15,9 @@ from __future__ import print_function import unittest from test_dist_base import TestDistBase +import paddle + +paddle.enable_static() class TestDistMnistNCCL2BackWardDeps(TestDistBase): diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_batch_merge.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_batch_merge.py index 24c9b9a139733c0428e99fb8dfdc02c9cb38393e..4cf2cf5f3675480b6ef6f8e04561102fbfd1dccf 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_mnist_batch_merge.py +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_batch_merge.py @@ -16,6 +16,9 @@ from __future__ import print_function import unittest from test_dist_base import TestDistBase import os +import paddle + +paddle.enable_static() flag_name = os.path.splitext(__file__)[0] diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_dgc_nccl.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_dgc_nccl.py index 0b9b85d5d52c38f748679a92a99ec61c3dec7903..9bc48ac0a1b2d4eca90acc1cd9792696bfcb7a2e 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_mnist_dgc_nccl.py +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_dgc_nccl.py @@ -18,6 +18,9 @@ from test_dist_base import TestDistBase import os import subprocess +import paddle + +paddle.enable_static() flag_name = os.path.splitext(__file__)[0] diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_fleet_save.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_fleet_save.py index 7dac11535629379639e86f2a4d2583fb703d5bfb..7336794578ed7b80a182b6175ebb0eda4252041d 100644 --- 
a/python/paddle/fluid/tests/unittests/test_dist_mnist_fleet_save.py +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_fleet_save.py @@ -17,6 +17,9 @@ import shutil import os import unittest from test_dist_base import TestDistBase +import paddle + +paddle.enable_static() class TestDistMnistFleetSave(TestDistBase): diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_fleetapi.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_fleetapi.py index d5ebe09adca01a339dd5da1c6e73c621a4a21a2d..255fd9b2855af579f419d1ada9044a445258746e 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_mnist_fleetapi.py +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_fleetapi.py @@ -15,6 +15,9 @@ from __future__ import print_function import unittest from test_dist_base import TestDistBase +import paddle + +paddle.enable_static() class TestDistMnistNCCL2FleetApi(TestDistBase): diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_hallreduce.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_hallreduce.py index cc002582371d33fa29c0d738568212855e025023..356c5573f95308d9d2cbf93b4232b199f5ee2a5e 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_mnist_hallreduce.py +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_hallreduce.py @@ -17,6 +17,9 @@ import unittest from test_dist_base import TestDistBase import os +import paddle + +paddle.enable_static() flag_name = os.path.splitext(__file__)[0] diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_multi_comm.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_multi_comm.py index f43ccc8becb8fd76735618e75c80a27f1f54c8c3..d9e6be8609d273dd7a149ff59a350da4c9dede20 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_mnist_multi_comm.py +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_multi_comm.py @@ -17,6 +17,9 @@ import unittest from test_dist_base import TestDistBase import os +import paddle + +paddle.enable_static() flag_name = 
os.path.splitext(__file__)[0] diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_pg.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_pg.py index d063f8473e0f50256dc424429ce1244a4b893ccf..28ef31875dbdeda83ab1d8de272e0b515c3cda83 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_mnist_pg.py +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_pg.py @@ -15,6 +15,9 @@ from __future__ import print_function import unittest from test_dist_base import TestDistBase +import paddle + +paddle.enable_static() class TestDistMnistNCCL2(TestDistBase): diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_ring_allreduce.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_ring_allreduce.py index fd15020275bdce1a6424f3134ff089bd761ee1b1..4436064dc28ed1276481378c70aa3b306486e0c8 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_mnist_ring_allreduce.py +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_ring_allreduce.py @@ -15,6 +15,9 @@ from __future__ import print_function import unittest from test_dist_base import TestDistBase +import paddle + +paddle.enable_static() class TestDistMnistNCCL2(TestDistBase): diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_with_program.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_with_program.py index 4f4941aa217b985c829391e9e8652d91f72b0c98..d55582fbb4dbb51b8b541579543015909e85aad8 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_mnist_with_program.py +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_with_program.py @@ -15,6 +15,9 @@ from __future__ import print_function import unittest from test_dist_base import TestDistBase +import paddle + +paddle.enable_static() class TestDistMnistLocalSGDFleetApi(TestDistBase): diff --git a/python/paddle/fluid/tests/unittests/test_dist_op.py b/python/paddle/fluid/tests/unittests/test_dist_op.py index 1f46e0e7f9ca97409a7c6ea634ed96421e593f5f..0f71027d274018a48e769a28ff9679204251c1d3 100644 --- 
a/python/paddle/fluid/tests/unittests/test_dist_op.py +++ b/python/paddle/fluid/tests/unittests/test_dist_op.py @@ -19,6 +19,8 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core +paddle.enable_static() + def dist(x, y, p): if p == 0.: diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext_nccl.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext_nccl.py index dbf0319d3054f097f9e3b0e85a81a47581fddbbc..64217135be735cb0bd752e240a787c42c2bb4944 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext_nccl.py +++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext_nccl.py @@ -18,6 +18,9 @@ from test_dist_base import TestDistBase import os import os +import paddle + +paddle.enable_static() flag_name = os.path.splitext(__file__)[0] diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler_async_decay.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler_async_decay.py index 1062123948481a4164a12a4bed818b964923006f..dd5c393f49c3f2a52414091fa3d3349e25362ae8 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler_async_decay.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler_async_decay.py @@ -17,6 +17,9 @@ from __future__ import print_function import unittest import gc import paddle.fluid as fluid +import paddle + +paddle.enable_static() class TranspilerAsyncLRDecayTest(unittest.TestCase): @@ -113,8 +116,8 @@ class TranspilerAsyncLRDecayTest(unittest.TestCase): ["listen_and_serv"]) # block1: sum,cast,scale,floor,fill_constant,elementwise_pow,scale self.assertEqual([op.type for op in pserver.blocks[1].ops], [ - "sum", "cast", "fill_constant", "elementwise_div", "floor", - "fill_constant", "elementwise_pow", "scale" + "sum", "cast", "scale", "floor", "fill_constant", "elementwise_pow", + "scale" ]) # block1~2: optimize pass diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler_config.py 
b/python/paddle/fluid/tests/unittests/test_dist_transpiler_config.py index c8d0d840872a8af4dd5230fd3a33961490ebdb0a..e6bc99fc2257c6561d24cac71a37fa840ff966ab 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler_config.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler_config.py @@ -15,6 +15,9 @@ import unittest import paddle.fluid as fluid import gc +import paddle + +paddle.enable_static() gc.set_debug(gc.DEBUG_COLLECTABLE) diff --git a/python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py b/python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py index 55b21f1a722f822f1bfcb7bbbda645109092b8a3..ec0125b28ed1b870025adbfd2bd4ba78244bcc11 100644 --- a/python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py +++ b/python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py @@ -35,9 +35,10 @@ class TestDistributeFPNProposalsOp(OpTest): } output = [('out%d' % i, self.rois_fpn[i]) for i in range(len(self.rois_fpn))] + self.outputs = { 'MultiFpnRois': output, - 'RestoreIndex': self.rois_idx_restore.reshape(-1, 1) + 'RestoreIndex': self.rois_idx_restore.reshape(-1, 1), } def init_test_case(self): @@ -117,5 +118,34 @@ class TestDistributeFPNProposalsOp(OpTest): self.check_output() +class TestDistributeFPNProposalsOpWithRoisNum(TestDistributeFPNProposalsOp): + def set_data(self): + self.init_test_case() + self.make_rois() + self.rois_fpn, self.rois_idx_restore = self.calc_rois_distribute() + self.inputs = { + 'FpnRois': (self.rois[:, 1:5], self.rois_lod), + 'RoisNum': np.array(self.rois_lod[0]).astype('int32') + } + self.attrs = { + 'max_level': self.roi_max_level, + 'min_level': self.roi_min_level, + 'refer_scale': self.canonical_scale, + 'refer_level': self.canonical_level + } + output = [('out%d' % i, self.rois_fpn[i]) + for i in range(len(self.rois_fpn))] + rois_num_per_level = [ + ('rois_num%d' % i, np.array(self.rois_fpn[i][1][0]).astype('int32')) + for i in 
range(len(self.rois_fpn)) + ] + + self.outputs = { + 'MultiFpnRois': output, + 'RestoreIndex': self.rois_idx_restore.reshape(-1, 1), + 'MultiLevelRoIsNum': rois_num_per_level + } + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_distribution.py b/python/paddle/fluid/tests/unittests/test_distribution.py index 533ad9604cf0d879371796fb197e61e931fb479f..47a1c407230527d53327ba57d7b5d7a979bd7d49 100644 --- a/python/paddle/fluid/tests/unittests/test_distribution.py +++ b/python/paddle/fluid/tests/unittests/test_distribution.py @@ -40,8 +40,11 @@ class DistributionNumpy(): class UniformNumpy(DistributionNumpy): def __init__(self, low, high): - self.low = np.array(low).astype('float32') - self.high = np.array(high).astype('float32') + self.low = np.array(low) + self.high = np.array(high) + if str(self.low.dtype) not in ['float32', 'float64']: + self.low = self.low.astype('float32') + self.high = self.high.astype('float32') def sample(self, shape): shape = tuple(shape) + (self.low + self.high).shape @@ -49,13 +52,13 @@ class UniformNumpy(DistributionNumpy): (self.high - self.low)) def log_prob(self, value): - lb = np.less(self.low, value).astype('float32') - ub = np.less(value, self.high).astype('float32') + lb = np.less(self.low, value).astype(self.low.dtype) + ub = np.less(value, self.high).astype(self.low.dtype) return np.log(lb * ub) - np.log(self.high - self.low) def probs(self, value): - lb = np.less(self.low, value).astype('float32') - ub = np.less(value, self.high).astype('float32') + lb = np.less(self.low, value).astype(self.low.dtype) + ub = np.less(value, self.high).astype(self.low.dtype) return (lb * ub) / (self.high - self.low) def entropy(self): @@ -64,8 +67,11 @@ class UniformNumpy(DistributionNumpy): class NormalNumpy(DistributionNumpy): def __init__(self, loc, scale): - self.loc = np.array(loc).astype('float32') - self.scale = np.array(scale).astype('float32') + self.loc = np.array(loc) + self.scale = 
np.array(scale) + if str(self.loc.dtype) not in ['float32', 'float64']: + self.loc = self.loc.astype('float32') + self.scale = self.scale.astype('float32') def sample(self, shape): shape = tuple(shape) + (self.loc + self.scale).shape @@ -83,8 +89,8 @@ class NormalNumpy(DistributionNumpy): (2. * var)) / (math.sqrt(2 * math.pi) * self.scale) def entropy(self): - return 0.5 + 0.5 * np.log(np.array(2. * math.pi).astype( - 'float32')) + np.log(self.scale) + return 0.5 + 0.5 * np.log( + np.array(2. * math.pi).astype(self.loc.dtype)) + np.log(self.scale) def kl_divergence(self, other): var_ratio = (self.scale / other.scale) @@ -94,724 +100,571 @@ class NormalNumpy(DistributionNumpy): return 0.5 * (var_ratio + t1 - 1 - np.log(var_ratio)) -class DistributionTest(unittest.TestCase): - def setUp(self, use_gpu=False): +class UniformTest(unittest.TestCase): + def setUp(self, use_gpu=False, batch_size=5, dims=6): self.use_gpu = use_gpu if not use_gpu: - place = fluid.CPUPlace() + self.place = fluid.CPUPlace() self.gpu_id = -1 else: - place = fluid.CUDAPlace(0) + self.place = fluid.CUDAPlace(0) self.gpu_id = 0 - self.executor = fluid.Executor(place) - - def build_normal_common_net(self, batch_size, dims, sample_shape, loc_float, - scale_float, other_loc_float, other_scale_float, - scale_np, other_scale_np, loc_np, other_loc_np, - loc, scale, other_loc, other_scale, values): - """Generate Normal object and get the output of its methods including - ``sample``, ``entropy``, ``log_prob``, ``probs`` and ``kl_divergence``. - Parameters ``loc`` and ``scale`` have different data types to test different situations. - - Args: - batch_size(int): The first dimension of the shape of parameters(loc and scale). - dims(int): The second dimension of the shape of parameters. - sample_shape(int): The sample value used in ``sample`` method. - loc_float(float): Generated in function ``get_normal_random_input``, loc is a float number. 
- scale_float(float): Generated in function ``get_normal_random_input``, scale is a float number. - other_loc_float(float): Generated in function ``get_normal_random_input``, other_loc is a - float number. It is the first parameter in another Normal object used in ``kl_divergence`` - method. - other_scale_float(float): Generated in function ``get_normal_random_input``, other_scale is a - float number. It is the second parameter in another Normal object used in ``kl_divergence`` - method. - scale_np(numpy.ndarray): Generated in function ``get_normal_random_input``, An numpy array - whose shape is [batch_size, dims]. - other_scale_np(numpy.ndarray): Generated in function ``get_normal_random_input``, other_scale_np - is an numpy array. It is the second parameter in another Normal object used in ``kl_divergence`` - method. - loc_np(numpy.ndarray): Generated in function ``get_normal_random_input``, An numpy array - whose shape is [batch_size, dims]. - other_loc_np(numpy.ndarray): Generated in function ``get_normal_random_input``, other_loc_np - is an numpy array. It is the first parameter in another Normal object used in ``kl_divergence`` - method. - loc(Tensor): In dynamic mode, loc is generated in ``build_normal_dygraph``, it's a Tensor filled - with ``loc_np`` data. In static mode, loc is generated in ``build_normal_static``, ``layers.data`` - method is used to get a Placeholder whose shape is [dims]. - scale(Tensor): In dynamic mode, scale is generated in ``build_normal_dygraph``, it's a Tensor filled - with ``scale_np`` data. In static mode, scale is generated in ``build_normal_static``, ``layers.data`` - method is used to get a Placeholder whose shape is [dims]. - other_loc(Tensor): In dynamic mode, other_loc is generated in ``build_normal_dygraph``, it's a Tensor - filled with ``other_loc_np`` data. In static mode, other_loc is generated in ``build_normal_static``, - ``layers.data`` method is used to get a Placeholder whose shape is [dims]. 
It is the first parameter - in another Normal object used in ``kl_divergence`` method. - other_scale(Tensor): In dynamic mode, other_scale is generated in ``build_normal_dygraph``, it's a Tensor - filled with ``other_scale_np`` data. In static mode, other_scale is generated in ``build_normal_static``, - ``layers.data`` method is used to get a Placeholder whose shape is [dims]. It is the second parameter - in another Normal object used in ``kl_divergence`` method. - values(Tensor): In dynamic mode, values is generated in ``build_normal_dygraph``, it's a Tensor filled with - ``values_np`` data. In static mode, values is generated in ``build_normal_static``, ``layers.data`` - method is used to get a Placeholder whose shape is [dims]. - - Returns: - List: The elements of the list are the output of sample, entropy, log_prob, probs, kl_divergence methods. - The inputs' type of these methods can be float, np.ndarray and Tensor. And broadcast will be considered. - - """ - normal_int = Normal(int(loc_float), int(scale_float)) - normal_float = Normal(loc_float, scale_float) - other_normal_float = Normal(other_loc_float, other_scale_float) - - normal_float_np_broadcast = Normal(loc_float, scale_np) - other_normal_float_np_broadcast = Normal(other_loc_float, - other_scale_np) - - normal_np = Normal(loc_np, scale_np) - other_normal_np = Normal(other_loc_np, other_scale_np) - - normal_variable = Normal(loc, scale) - other_normal_variable = Normal(other_loc, other_scale) - - sample_int = normal_int.sample([batch_size, dims]) - sample_float = normal_float.sample([batch_size, dims]) - sample_float_np_broadcast = normal_float_np_broadcast.sample( - [batch_size, dims]) - sample_np = normal_np.sample([batch_size, dims]) - sample_variable = normal_variable.sample([batch_size, dims]) - - sample_int_diff = normal_int.sample([sample_shape]) - sample_float_diff = normal_float.sample([sample_shape]) - sample_float_np_broadcast_diff = normal_float_np_broadcast.sample( - [sample_shape]) - 
sample_np_diff = normal_np.sample([sample_shape]) - sample_variable_diff = normal_variable.sample([sample_shape]) - - entropy_int = normal_int.entropy() - entropy_float = normal_float.entropy() - entropy_float_np_broadcast = normal_float_np_broadcast.entropy() - entropy_np = normal_np.entropy() - entropy_variable = normal_variable.entropy() - - lp_float_np_broadcast = normal_float_np_broadcast.log_prob(values) - lp_np = normal_np.log_prob(values) - lp_variable = normal_variable.log_prob(values) - - p_float_np_broadcast = normal_float_np_broadcast.probs(values) - p_np = normal_np.probs(values) - p_variable = normal_variable.probs(values) - - kl_float = normal_float.kl_divergence(other_normal_float) - kl_float_np_broadcast = normal_float_np_broadcast.kl_divergence( - other_normal_float_np_broadcast) - kl_np = normal_np.kl_divergence(other_normal_np) - kl_variable = normal_variable.kl_divergence(other_normal_variable) - - fetch_list = [ - sample_int, sample_float, sample_float_np_broadcast, sample_np, - sample_variable, sample_int_diff, sample_float_diff, - sample_float_np_broadcast_diff, sample_np_diff, - sample_variable_diff, entropy_int, entropy_float, - entropy_float_np_broadcast, entropy_np, entropy_variable, - lp_float_np_broadcast, lp_np, lp_variable, p_float_np_broadcast, - p_np, p_variable, kl_float, kl_float_np_broadcast, kl_np, - kl_variable - ] - return fetch_list - - def build_normal_static(self, test_program, batch_size, dims, sample_shape, - loc_float, scale_float, other_loc_float, - other_scale_float, scale_np, other_scale_np, loc_np, - other_loc_np, values_np): - """ - In static mode, generate feed data of Normal network, and get output fetch_list using - ``build_normal_common_net``. - - Args: - test_program: In static mode, the Program object. - other args can refer to function ``build_normal_common_net``. - - Returns: - feed_vars: The feed data of Normal network in static mode. 
- fetch_list: The output is generated by function ``build_normal_common_net``. - """ - with fluid.program_guard(test_program): - loc = layers.data(name='loc', shape=[dims], dtype='float32') - scale = layers.data(name='scale', shape=[dims], dtype='float32') - - other_loc = layers.data( - name='other_loc', shape=[dims], dtype='float32') - other_scale = layers.data( - name='other_scale', shape=[dims], dtype='float32') - values = layers.data(name='values', shape=[dims], dtype='float32') + self.init_numpy_data(batch_size, dims) - fetch_list = self.build_normal_common_net( - batch_size, dims, sample_shape, loc_float, scale_float, - other_loc_float, other_scale_float, scale_np, other_scale_np, - loc_np, other_loc_np, loc, scale, other_loc, other_scale, - values) + paddle.disable_static(self.place) + self.init_dynamic_data(batch_size, dims) - feed_vars = { - 'loc': loc_np, - 'scale': scale_np, - 'other_loc': other_loc_np, - 'other_scale': other_scale_np, - 'values': values_np - } - return feed_vars, fetch_list - - def build_normal_dygraph(self, batch_size, dims, sample_shape, loc_float, - scale_float, other_loc_float, other_scale_float, - scale_np, other_scale_np, loc_np, other_loc_np, - values_np): - """ - In dynamic mode, generate input data of Normal network, and get output fetch_list using - ``build_normal_common_net``. - - Args: - refer to function ``build_normal_common_net``. - - Returns: - fetch_list_numpy: The output is generated by function ``build_normal_common_net``. Transform - these tensor to numpy.ndarray. 
- """ - loc = paddle.to_tensor(loc_np) - scale = paddle.to_tensor(scale_np) - other_loc = paddle.to_tensor(other_loc_np) - other_scale = paddle.to_tensor(other_scale_np) - values = paddle.to_tensor(values_np) - - fetch_list = self.build_normal_common_net( - batch_size, dims, sample_shape, loc_float, scale_float, - other_loc_float, other_scale_float, scale_np, other_scale_np, - loc_np, other_loc_np, loc, scale, other_loc, other_scale, values) - fetch_list_numpy = [t.numpy() for t in fetch_list] - return fetch_list_numpy - - def get_normal_random_input(self, batch_size, dims): - """ - Generate input data ``loc`` and ``scale`` used in Normal network. - - Args: - refer to function ``build_normal_common_net``. - - Returns: - List: Different data type of ``loc`` and ``scale``, including float, numpy.ndarray. - By the way, ``other_loc`` and ``other_scale`` are used in ``kl_divergence`` method. - refer to ``args`` in function ``build_normal_common_net``. - """ - loc_np = np.random.randn(batch_size, dims).astype('float32') - other_loc_np = np.random.randn(batch_size, dims).astype('float32') - - loc_float = (np.random.ranf() - 0.5) * 4 - scale_float = (np.random.ranf() - 0.5) * 4 - while scale_float < 0: - scale_float = (np.random.ranf() - 0.5) * 4 - - other_loc_float = (np.random.ranf() - 0.5) * 4 - other_scale_float = (np.random.ranf() - 0.5) * 4 - while other_scale_float < 0: - other_scale_float = (np.random.ranf() - 0.5) * 4 - - scale_np = np.random.randn(batch_size, dims).astype('float32') - other_scale_np = np.random.randn(batch_size, dims).astype('float32') - values_np = np.random.randn(batch_size, dims).astype('float32') - - while not np.all(scale_np > 0): - scale_np = np.random.randn(batch_size, dims).astype('float32') - while not np.all(other_scale_np > 0): - other_scale_np = np.random.randn(batch_size, dims).astype('float32') - return [ - loc_np, other_loc_np, loc_float, scale_float, other_loc_float, - other_scale_float, scale_np, other_scale_np, values_np - ] - - 
def compare_normal_with_numpy(self, - data_list, - output_list, - batch_size=2, - dims=3, - sample_shape=7, - tolerance=1e-6): - """ - Compare the outputs of Normal's methods in paddle and numpy. If the outputs are not consistent, - raise errors. - - Args: - data_list: Input data generated by function ``get_normal_random_input``. - output_list: The outputs of Normal's methods in static or dynamic mode. - batch_size(int): The first dimension of the shape of parameters(loc and scale). - dims(int): The second dimension of the shape of parameters. - sample_shape(int): The sample value used in ``sample`` method. - tolerance(float): The tolerance of the error. - """ - loc_np, other_loc_np, loc_float, scale_float, other_loc_float, other_scale_float, scale_np, other_scale_np, values_np = data_list - - np_normal_int = NormalNumpy(int(loc_float), int(scale_float)) - np_normal_float = NormalNumpy(loc_float, scale_float) - np_other_normal_float = NormalNumpy(other_loc_float, other_scale_float) - np_normal_float_np_broadcast = NormalNumpy(loc_float, scale_np) - np_other_normal_float_np_broadcast = NormalNumpy(other_loc_float, - other_scale_np) - np_normal = NormalNumpy(loc_np, scale_np) - np_other_normal = NormalNumpy(other_loc_np, other_scale_np) - - gt_sample_int = np_normal_int.sample([batch_size, dims]) - gt_sample_float = np_normal_float.sample([batch_size, dims]) - gt_sample_float_np_broadcast = np_normal_float_np_broadcast.sample( - [batch_size, dims]) - gt_sample_np = np_normal.sample([batch_size, dims]) - - gt_sample_int_diff = np_normal_int.sample([sample_shape]) - gt_sample_float_diff = np_normal_float.sample([sample_shape]) - gt_sample_float_np_broadcast_diff = np_normal_float_np_broadcast.sample( - [sample_shape]) - gt_sample_np_diff = np_normal.sample([sample_shape]) - - gt_entropy_int = np_normal_int.entropy() - gt_entropy_float = np_normal_float.entropy() - gt_entropy_float_np_broadcast = np_normal_float_np_broadcast.entropy() - gt_entropy = np_normal.entropy() 
- gt_lp_float_np_broadcast = np_normal_float_np_broadcast.log_prob( - values_np) - gt_lp = np_normal.log_prob(values_np) - gt_p_float_np_broadcast = np_normal_float_np_broadcast.probs(values_np) - gt_p = np_normal.probs(values_np) - gt_kl_float = np_normal_float.kl_divergence(np_other_normal_float) - gt_kl_float_np_broadcast = np_normal_float_np_broadcast.kl_divergence( - np_other_normal_float_np_broadcast) - gt_kl = np_normal.kl_divergence(np_other_normal) - - [ - output_sample_int, output_sample_float, - output_sample_float_np_broadcast, output_sample_np, - output_sample_variable, output_sample_int_diff, - output_sample_float_diff, output_sample_float_np_broadcast_diff, - output_sample_np_diff, output_sample_variable_diff, - output_entropy_int, output_entropy_float, - output_entropy_float_np_broadcast, output_entropy_np, - output_entropy_variable, output_lp_float_np_broadcast, output_lp_np, - output_lp_variable, output_p_float_np_broadcast, output_p_np, - output_p_variable, output_kl_float, output_kl_float_np_broadcast, - output_kl_np, output_kl_variable - ] = output_list - - np.testing.assert_equal(output_sample_int.shape, gt_sample_int.shape) - np.testing.assert_equal(output_sample_float.shape, - gt_sample_float.shape) - np.testing.assert_equal(output_sample_float_np_broadcast.shape, - gt_sample_float_np_broadcast.shape) - np.testing.assert_equal(output_sample_np.shape, gt_sample_np.shape) - np.testing.assert_equal(output_sample_variable.shape, - gt_sample_np.shape) - np.testing.assert_equal(output_sample_int_diff.shape, - gt_sample_int_diff.shape) - np.testing.assert_equal(output_sample_float_diff.shape, - gt_sample_float_diff.shape) - np.testing.assert_equal(output_sample_float_np_broadcast_diff.shape, - gt_sample_float_np_broadcast_diff.shape) - np.testing.assert_equal(output_sample_np_diff.shape, - gt_sample_np_diff.shape) - np.testing.assert_equal(output_sample_variable_diff.shape, - gt_sample_np_diff.shape) - np.testing.assert_allclose( - 
output_entropy_int, gt_entropy_int, rtol=tolerance, atol=tolerance) - np.testing.assert_allclose( - output_entropy_float, - gt_entropy_float, - rtol=tolerance, - atol=tolerance) - np.testing.assert_allclose( - output_entropy_float_np_broadcast, - gt_entropy_float_np_broadcast, - rtol=tolerance, - atol=tolerance) - np.testing.assert_allclose( - output_entropy_np, gt_entropy, rtol=tolerance, atol=tolerance) - np.testing.assert_allclose( - output_entropy_variable, gt_entropy, rtol=tolerance, atol=tolerance) - np.testing.assert_allclose( - output_lp_float_np_broadcast, - gt_lp_float_np_broadcast, - rtol=tolerance, - atol=tolerance) - np.testing.assert_allclose( - output_lp_np, gt_lp, rtol=tolerance, atol=tolerance) - np.testing.assert_allclose( - output_lp_variable, gt_lp, rtol=tolerance, atol=tolerance) - np.testing.assert_allclose( - output_p_float_np_broadcast, - gt_p_float_np_broadcast, - rtol=tolerance, - atol=tolerance) - np.testing.assert_allclose( - output_p_np, gt_p, rtol=tolerance, atol=tolerance) - np.testing.assert_allclose( - output_p_variable, gt_p, rtol=tolerance, atol=tolerance) - np.testing.assert_allclose( - output_kl_float, gt_kl_float, rtol=tolerance, atol=tolerance) - np.testing.assert_allclose( - output_kl_float_np_broadcast, - gt_kl_float_np_broadcast, - rtol=tolerance, - atol=tolerance) + paddle.enable_static() + self.test_program = fluid.Program() + self.executor = fluid.Executor(self.place) + self.init_static_data(batch_size, dims) + + def init_numpy_data(self, batch_size, dims): + # low ans high are 'float' + self.low_np = np.random.uniform(-2, 1) + self.high_np = np.random.uniform(1, 3) + self.values_np = np.array([1.0]).astype('float32') + + def init_dynamic_data(self, batch_size, dims): + self.dynamic_low = self.low_np + self.dynamic_high = self.high_np + self.dynamic_values = paddle.to_tensor(self.values_np) + + def init_static_data(self, batch_size, dims): + self.static_low = self.low_np + self.static_high = self.high_np + with 
fluid.program_guard(self.test_program): + self.static_values = layers.data( + name='values', shape=[], dtype='float32') + + def compare_with_numpy(self, fetch_list, sample_shape=7, tolerance=1e-6): + sample, entropy, log_prob, probs = fetch_list + + np_uniform = UniformNumpy(self.low_np, self.high_np) + np_sample = np_uniform.sample([sample_shape]) + np_entropy = np_uniform.entropy() + np_lp = np_uniform.log_prob(self.values_np) + np_p = np_uniform.probs(self.values_np) + + np.testing.assert_equal(sample.shape, np_sample.shape) np.testing.assert_allclose( - output_kl_np, gt_kl, rtol=tolerance, atol=tolerance) + entropy, np_entropy, rtol=tolerance, atol=tolerance) np.testing.assert_allclose( - output_kl_variable, gt_kl, rtol=tolerance, atol=tolerance) - - def test_normal_distribution_static(self, - batch_size=2, - dims=3, - sample_shape=7, - tolerance=1e-6): - """ - Test Normal's methods in static mode. - - Args: - refer to ``compare_normal_with_numpy`` function. - """ - test_program = fluid.Program() - data_list = self.get_normal_random_input(batch_size, dims) - loc_np, other_loc_np, loc_float, scale_float, other_loc_float, other_scale_float, scale_np, other_scale_np, values_np = data_list - - feed_vars, fetch_list = self.build_normal_static( - test_program, batch_size, dims, sample_shape, loc_float, - scale_float, other_loc_float, other_scale_float, scale_np, - other_scale_np, loc_np, other_loc_np, values_np) - self.executor.run(fluid.default_startup_program()) + log_prob, np_lp, rtol=tolerance, atol=tolerance) + np.testing.assert_allclose(probs, np_p, rtol=tolerance, atol=tolerance) - output_list = self.executor.run(program=test_program, - feed=feed_vars, - fetch_list=fetch_list) - - self.compare_normal_with_numpy(data_list, output_list, batch_size, dims, - sample_shape, tolerance) - - def test_normal_distribution_dygraph(self, - batch_size=2, - dims=3, - sample_shape=7, - tolerance=1e-6): - """ - Test Normal's methods in dynamic mode. 
- - Args: - refer to ``compare_normal_with_numpy`` function. - """ - paddle.disable_static() - data_list = self.get_normal_random_input(batch_size, dims) - loc_np, other_loc_np, loc_float, scale_float, other_loc_float, other_scale_float, scale_np, other_scale_np, values_np = data_list - - output_list = self.build_normal_dygraph( - batch_size, dims, sample_shape, loc_float, scale_float, - other_loc_float, other_scale_float, scale_np, other_scale_np, - loc_np, other_loc_np, values_np) - - self.compare_normal_with_numpy(data_list, output_list, batch_size, dims, - sample_shape, tolerance) + def test_uniform_distribution_dygraph(self, sample_shape=7, tolerance=1e-6): + paddle.disable_static(self.place) + uniform = Uniform(self.dynamic_low, self.dynamic_high) + sample = uniform.sample([sample_shape]).numpy() + entropy = uniform.entropy().numpy() + log_prob = uniform.log_prob(self.dynamic_values).numpy() + probs = uniform.probs(self.dynamic_values).numpy() + fetch_list = [sample, entropy, log_prob, probs] + + self.compare_with_numpy(fetch_list) + + def test_uniform_distribution_static(self, sample_shape=7, tolerance=1e-6): paddle.enable_static() + with fluid.program_guard(self.test_program): + uniform = Uniform(self.static_low, self.static_high) + sample = uniform.sample([sample_shape]) + entropy = uniform.entropy() + log_prob = uniform.log_prob(self.static_values) + probs = uniform.probs(self.static_values) + fetch_list = [sample, entropy, log_prob, probs] - def build_uniform_common_net(self, batch_size, dims, sample_shape, - low_float, high_float, high_np, low_np, - values_np, low, high, values): - """Generate Uniform object and get the output of its methods including ``sample``, ``entropy``, - ``log_prob`` and ``probs``. - Parameters ``low`` and ``high`` have different data types to test different situations. - - Args: - batch_size(int): The first dimension of the shape of parameters(low and high). - dims(int): The second dimension of the shape of parameters. 
- sample_shape(int): The sample value used in ``sample`` method. - low_float(float): Parameter ``low`` is a float number. - high_float(float): Parameter ``high`` is a float number. - high_np(numpy.ndarray): An numpy array whose shape is [batch_size, dims]. - low_np(numpy.ndarray): An numpy array whose shape is [batch_size, dims]. - values_np(numpy.ndarray): The input of ``log_prob`` and ``probs`` methods. An numpy array whose - shape is [batch_size, dims]. - low(Tensor): In dynamic mode, low is generated in ``build_uniform_dygraph``, it's a Tensor filled - with ``low_np`` data. In static mode, low is generated in ``build_uniform_static``. - high(Tensor): In dynamic mode, high is generated in ``build_uniform_dygraph``, it's a Tensor filled - with ``high_np`` data. In static mode, high is generated in ``build_uniform_static``. - values(Tensor): In dynamic mode, values is generated in ``build_uniform_dygraph``, it's a Tensor - filled with ``values_np`` data. In static mode, values is generated in ``build_uniform_static``. - - Returns: - List: The elements of the list are the output of sample, entropy, log_prob, probs methods. - The inputs' type of these methods can be float, np.ndarray and Tensor. And broadcast will be - considered. 
- - """ - uniform_int = Uniform(int(low_float), int(high_float)) - uniform_float = Uniform(low_float, high_float) - uniform_float_np_broadcast = Uniform(low_float, high_np) - uniform_np = Uniform(low_np, high_np) - uniform_variable = Uniform(low, high) - - sample_int = uniform_int.sample([batch_size, dims]) - sample_float = uniform_float.sample([batch_size, dims]) - sample_float_np_broadcast = uniform_float_np_broadcast.sample( - [batch_size, dims]) - sample_np = uniform_np.sample([batch_size, dims]) - sample_variable = uniform_variable.sample([batch_size, dims]) - - sample_int_diff = uniform_int.sample([sample_shape]) - sample_float_diff = uniform_float.sample([sample_shape]) - sample_float_np_broadcast_diff = uniform_float_np_broadcast.sample( - [sample_shape]) - sample_np_diff = uniform_np.sample([sample_shape]) - sample_variable_diff = uniform_variable.sample([sample_shape]) - - entropy_int = uniform_int.entropy() - entropy_float = uniform_float.entropy() - entropy_float_np_broadcast = uniform_float_np_broadcast.entropy() - entropy_np = uniform_np.entropy() - entropy_variable = uniform_variable.entropy() - - lp_float_np_broadcast = uniform_float_np_broadcast.log_prob(values) - lp_np = uniform_np.log_prob(values) - lp_variable = uniform_variable.log_prob(values) - - p_float_np_broadcast = uniform_float_np_broadcast.probs(values) - p_np = uniform_np.probs(values) - p_variable = uniform_variable.probs(values) - - fetch_list = [ - sample_int, sample_float, sample_float_np_broadcast, sample_np, - sample_variable, sample_int_diff, sample_float_diff, - sample_float_np_broadcast_diff, sample_np_diff, - sample_variable_diff, entropy_int, entropy_float, - entropy_float_np_broadcast, entropy_np, entropy_variable, - lp_float_np_broadcast, lp_np, lp_variable, p_float_np_broadcast, - p_np, p_variable - ] - return fetch_list - - def build_uniform_static(self, test_program, batch_size, dims, sample_shape, - low_float, high_float, high_np, low_np, values_np): - """ - In static 
mode, generate feed data of Uniform network, and get output fetch_list using - ``build_uniform_common_net``. - - Args: - test_program: In static mode, the Program object. - other args can refer to function ``build_uniform_common_net``. - - Returns: - feed_vars: The feed data of Uniform network in static mode. - fetch_list: The output is generated by function ``build_uniform_common_net``. - """ - with fluid.program_guard(test_program): - low = layers.data(name='low', shape=[dims], dtype='float32') - high = layers.data(name='high', shape=[dims], dtype='float32') - - values = layers.data(name='values', shape=[dims], dtype='float32') - - fetch_list = self.build_uniform_common_net( - batch_size, dims, sample_shape, low_float, high_float, high_np, - low_np, values_np, low, high, values) - - feed_vars = {'low': low_np, 'high': high_np, 'values': values_np} - return feed_vars, fetch_list - - def build_uniform_dygraph(self, batch_size, dims, sample_shape, low_float, - high_float, high_np, low_np, values_np): - """ - In dynamic mode, generate input data of Uniform network, and get output fetch_list using - ``build_uniform_common_net``. - - Args: - refer to function ``build_uniform_common_net``. - - Returns: - fetch_list_numpy: The output is generated by function ``build_uniform_common_net``. Transform - these tensor to numpy.ndarray. - """ - low = paddle.to_tensor(low_np) - high = paddle.to_tensor(high_np) - values = paddle.to_tensor(values_np) - - fetch_list = self.build_uniform_common_net( - batch_size, dims, sample_shape, low_float, high_float, high_np, - low_np, values_np, low, high, values) - fetch_list_numpy = [t.numpy() for t in fetch_list] - return fetch_list_numpy - - def compare_uniform_with_numpy(self, - data_list, - output_list, - batch_size=2, - dims=3, - sample_shape=7, - tolerance=1e-6): - """ - Compare the outputs of Uniform's methods in paddle and numpy. If the outputs are not consistent, - raise errors. 
- - Args: - data_list: Input data including float and numpy.ndarray type of ``low`` and ``high`` parameters. - output_list: The outputs of Uniform's methods in static or dynamic mode. - batch_size(int): The first dimension of the shape of parameters(low and high). - dims(int): The second dimension of the shape of parameters. - sample_shape(int): The sample value used in ``sample`` method. - tolerance(float): The tolerance of the error. - """ - [low_np, low_float, high_float, high_np, values_np] = data_list - - np_uniform_int = UniformNumpy(int(low_float), int(high_float)) - np_uniform_float = UniformNumpy(low_float, high_float) - np_uniform_float_np_broadcast = UniformNumpy(low_float, high_np) - np_uniform = UniformNumpy(low_np, high_np) - - gt_sample_int = np_uniform_int.sample([batch_size, dims]) - gt_sample_float = np_uniform_float.sample([batch_size, dims]) - gt_sample_float_np_broadcast = np_uniform_float_np_broadcast.sample( - [batch_size, dims]) - gt_sample_np = np_uniform.sample([batch_size, dims]) - gt_sample_int_diff = np_uniform_int.sample([sample_shape]) - gt_sample_float_diff = np_uniform_float.sample([sample_shape]) - gt_sample_float_np_broadcast_diff = np_uniform_float_np_broadcast.sample( - [sample_shape]) - gt_sample_np_diff = np_uniform.sample([sample_shape]) - gt_entropy_int = np_uniform_int.entropy() - gt_entropy_float = np_uniform_float.entropy() - gt_entropy_float_np_broadcast = np_uniform_float_np_broadcast.entropy() - gt_entropy = np_uniform.entropy() - gt_lp_float_np_broadcast = np_uniform_float_np_broadcast.log_prob( - values_np) - gt_lp = np_uniform.log_prob(values_np) - gt_p_float_np_broadcast = np_uniform_float_np_broadcast.probs(values_np) - gt_p = np_uniform.probs(values_np) - - [ - output_sample_int, output_sample_float, - output_sample_float_np_broadcast, output_sample_np, - output_sample_variable, output_sample_int_diff, - output_sample_float_diff, output_sample_float_np_broadcast_diff, - output_sample_np_diff, 
output_sample_variable_diff, - output_entropy_int, output_entropy_float, - output_entropy_float_np_broadcast, output_entropy_np, - output_entropy_variable, output_lp_float_np_broadcast, output_lp_np, - output_lp_variable, output_p_float_np_broadcast, output_p_np, - output_p_variable - ] = output_list - - np.testing.assert_equal(output_sample_int.shape, gt_sample_int.shape) - np.testing.assert_equal(output_sample_float.shape, - gt_sample_float.shape) - np.testing.assert_equal(output_sample_float_np_broadcast.shape, - gt_sample_float_np_broadcast.shape) - np.testing.assert_equal(output_sample_np.shape, gt_sample_np.shape) - np.testing.assert_equal(output_sample_variable.shape, - gt_sample_np.shape) - np.testing.assert_equal(output_sample_int_diff.shape, - gt_sample_int_diff.shape) - np.testing.assert_equal(output_sample_float_diff.shape, - gt_sample_float_diff.shape) - np.testing.assert_equal(output_sample_float_np_broadcast_diff.shape, - gt_sample_float_np_broadcast_diff.shape) - np.testing.assert_equal(output_sample_np_diff.shape, - gt_sample_np_diff.shape) - np.testing.assert_equal(output_sample_variable_diff.shape, - gt_sample_np_diff.shape) - np.testing.assert_allclose( - output_entropy_int, gt_entropy_int, rtol=tolerance, atol=tolerance) - np.testing.assert_allclose( - output_entropy_float, - gt_entropy_float, - rtol=tolerance, - atol=tolerance) - np.testing.assert_allclose( - output_entropy_float_np_broadcast, - gt_entropy_float_np_broadcast, - rtol=tolerance, - atol=tolerance) - np.testing.assert_allclose( - output_entropy_np, gt_entropy, rtol=tolerance, atol=tolerance) - np.testing.assert_allclose( - output_entropy_variable, gt_entropy, rtol=tolerance, atol=tolerance) - np.testing.assert_allclose( - output_lp_float_np_broadcast, - gt_lp_float_np_broadcast, - rtol=tolerance, - atol=tolerance) - np.testing.assert_allclose( - output_lp_np, gt_lp, rtol=tolerance, atol=tolerance) - np.testing.assert_allclose( - output_lp_variable, gt_lp, rtol=tolerance, 
atol=tolerance) - np.testing.assert_allclose( - output_p_float_np_broadcast, - gt_p_float_np_broadcast, - rtol=tolerance, - atol=tolerance) + feed_vars = { + 'low': self.low_np, + 'high': self.high_np, + 'values': self.values_np + } + + self.executor.run(fluid.default_startup_program()) + fetch_list = self.executor.run(program=self.test_program, + feed=feed_vars, + fetch_list=fetch_list) + + self.compare_with_numpy(fetch_list) + + +class UniformTest2(UniformTest): + def init_numpy_data(self, batch_size, dims): + # low ans high are 'int' + self.low_np = int(np.random.uniform(-2, 1)) + self.high_np = int(np.random.uniform(1, 3)) + self.values_np = np.array([1.0]).astype('float32') + + +class UniformTest3(UniformTest): + def init_numpy_data(self, batch_size, dims): + # test broadcast: low is float, high is numpy.ndarray with dtype 'float32'. + self.low_np = np.random.uniform(-2, 1) + self.high_np = np.random.uniform(-5.0, 5.0, + (batch_size, dims)).astype('float32') + self.values_np = np.random.randn(batch_size, dims).astype('float32') + + def init_static_data(self, batch_size, dims): + self.static_low = self.low_np + self.static_high = self.high_np + with fluid.program_guard(self.test_program): + self.static_values = layers.data( + name='values', shape=[dims], dtype='float32') + + +class UniformTest4(UniformTest): + def init_numpy_data(self, batch_size, dims): + # low and high are numpy.ndarray with dtype 'float32'. 
+ self.low_np = np.random.randn(batch_size, dims).astype('float32') + self.high_np = np.random.uniform(-5.0, 5.0, + (batch_size, dims)).astype('float32') + self.values_np = np.random.randn(batch_size, dims).astype('float32') + + def init_static_data(self, batch_size, dims): + self.static_low = self.low_np + self.static_high = self.high_np + with fluid.program_guard(self.test_program): + self.static_values = layers.data( + name='values', shape=[dims], dtype='float32') + + +class UniformTest5(UniformTest): + def init_numpy_data(self, batch_size, dims): + # low and high are numpy.ndarray with dtype 'float64'. + self.low_np = np.random.randn(batch_size, dims).astype('float64') + self.high_np = np.random.uniform(-5.0, 5.0, + (batch_size, dims)).astype('float64') + self.values_np = np.random.randn(batch_size, dims).astype('float64') + + def init_dynamic_data(self, batch_size, dims): + self.dynamic_low = self.low_np + self.dynamic_high = self.high_np + self.dynamic_values = paddle.to_tensor(self.values_np, dtype='float64') + + def init_static_data(self, batch_size, dims): + self.static_low = self.low_np + self.static_high = self.high_np + with fluid.program_guard(self.test_program): + self.static_values = layers.data( + name='values', shape=[dims], dtype='float64') + + +class UniformTest6(UniformTest): + def init_numpy_data(self, batch_size, dims): + # low and high are Tensor with dtype 'VarType.FP32'. 
+ self.low_np = np.random.randn(batch_size, dims).astype('float32') + self.high_np = np.random.uniform(-5.0, 5.0, + (batch_size, dims)).astype('float32') + self.values_np = np.random.randn(batch_size, dims).astype('float32') + + def init_dynamic_data(self, batch_size, dims): + self.dynamic_low = paddle.to_tensor(self.low_np) + self.dynamic_high = paddle.to_tensor(self.high_np) + self.dynamic_values = paddle.to_tensor(self.values_np) + + def init_static_data(self, batch_size, dims): + with fluid.program_guard(self.test_program): + self.static_low = layers.data( + name='low', shape=[dims], dtype='float32') + self.static_high = layers.data( + name='high', shape=[dims], dtype='float32') + self.static_values = layers.data( + name='values', shape=[dims], dtype='float32') + + +class UniformTest7(UniformTest): + def init_numpy_data(self, batch_size, dims): + # low and high are Tensor with dtype 'VarType.FP64'. + self.low_np = np.random.randn(batch_size, dims).astype('float64') + self.high_np = np.random.uniform(-5.0, 5.0, + (batch_size, dims)).astype('float64') + self.values_np = np.random.randn(batch_size, dims).astype('float64') + + def init_dynamic_data(self, batch_size, dims): + self.dynamic_low = paddle.to_tensor(self.low_np, dtype='float64') + self.dynamic_high = paddle.to_tensor(self.high_np, dtype='float64') + self.dynamic_values = paddle.to_tensor(self.values_np, dtype='float64') + + def init_static_data(self, batch_size, dims): + with fluid.program_guard(self.test_program): + self.static_low = layers.data( + name='low', shape=[dims], dtype='float64') + self.static_high = layers.data( + name='high', shape=[dims], dtype='float64') + self.static_values = layers.data( + name='values', shape=[dims], dtype='float64') + + +class UniformTest8(UniformTest): + def init_numpy_data(self, batch_size, dims): + # low and high are Tensor with dtype 'VarType.FP64'. value's dtype is 'VarType.FP32'. 
+ self.low_np = np.random.randn(batch_size, dims).astype('float64') + self.high_np = np.random.uniform(-5.0, 5.0, + (batch_size, dims)).astype('float64') + self.values_np = np.random.randn(batch_size, dims).astype('float32') + + def init_dynamic_data(self, batch_size, dims): + self.dynamic_low = paddle.to_tensor(self.low_np, dtype='float64') + self.dynamic_high = paddle.to_tensor(self.high_np, dtype='float64') + self.dynamic_values = paddle.to_tensor(self.values_np, dtype='float32') + + def init_static_data(self, batch_size, dims): + with fluid.program_guard(self.test_program): + self.static_low = layers.data( + name='low', shape=[dims], dtype='float64') + self.static_high = layers.data( + name='high', shape=[dims], dtype='float64') + self.static_values = layers.data( + name='values', shape=[dims], dtype='float32') + + +class NormalTest(unittest.TestCase): + def setUp(self, use_gpu=False, batch_size=2, dims=3): + self.use_gpu = use_gpu + if not use_gpu: + self.place = fluid.CPUPlace() + self.gpu_id = -1 + else: + self.place = fluid.CUDAPlace(0) + self.gpu_id = 0 + + self.init_numpy_data(batch_size, dims) + + paddle.disable_static(self.place) + self.init_dynamic_data(batch_size, dims) + + paddle.enable_static() + self.test_program = fluid.Program() + self.executor = fluid.Executor(self.place) + self.init_static_data(batch_size, dims) + + def init_numpy_data(self, batch_size, dims): + # loc ans scale are 'float' + self.loc_np = (np.random.ranf() - 0.5) * 4 + self.scale_np = (np.random.ranf() - 0.5) * 4 + while self.scale_np < 0: + self.scale_np = (np.random.ranf() - 0.5) * 4 + # used to construct another Normal object to calculate kl_divergence + self.other_loc_np = (np.random.ranf() - 0.5) * 4 + self.other_scale_np = (np.random.ranf() - 0.5) * 4 + while self.other_scale_np < 0: + self.other_scale_np = (np.random.ranf() - 0.5) * 4 + self.values_np = np.random.ranf(1).astype('float32') + + def init_dynamic_data(self, batch_size, dims): + self.dynamic_loc = self.loc_np 
+ self.dynamic_scale = self.scale_np + self.dynamic_other_loc = self.other_loc_np + self.dynamic_other_scale = self.other_scale_np + self.dynamic_values = paddle.to_tensor(self.values_np) + + def init_static_data(self, batch_size, dims): + self.static_loc = self.loc_np + self.static_scale = self.scale_np + self.static_other_loc = self.other_loc_np + self.static_other_scale = self.other_scale_np + with fluid.program_guard(self.test_program): + self.static_values = layers.data( + name='values', shape=[], dtype='float32') + + def compare_with_numpy(self, fetch_list, sample_shape=7, tolerance=1e-6): + sample, entropy, log_prob, probs, kl = fetch_list + + np_normal = NormalNumpy(self.loc_np, self.scale_np) + np_sample = np_normal.sample([sample_shape]) + np_entropy = np_normal.entropy() + np_lp = np_normal.log_prob(self.values_np) + np_p = np_normal.probs(self.values_np) + np_other_normal = NormalNumpy(self.other_loc_np, self.other_scale_np) + np_kl = np_normal.kl_divergence(np_other_normal) + + np.testing.assert_equal(sample.shape, np_sample.shape) np.testing.assert_allclose( - output_p_np, gt_p, rtol=tolerance, atol=tolerance) + entropy, np_entropy, rtol=tolerance, atol=tolerance) np.testing.assert_allclose( - output_p_variable, gt_p, rtol=tolerance, atol=tolerance) - - def test_uniform_distribution_static(self, - batch_size=2, - dims=3, - sample_shape=7, - tolerance=1e-6): - """ - Test Uniform's methods in static mode. - - Args: - refer to ``compare_uniform_with_numpy`` function. 
- """ - test_program = fluid.Program() - - low_np = np.random.randn(batch_size, dims).astype('float32') - low_float = np.random.uniform(-2, 1) - high_float = np.random.uniform(1, 3) - high_np = np.random.uniform(-5.0, 5.0, - (batch_size, dims)).astype('float32') - values_np = np.random.randn(batch_size, dims).astype('float32') - - data_list = [low_np, low_float, high_float, high_np, values_np] - - feed_vars, fetch_list = self.build_uniform_static( - test_program, batch_size, dims, sample_shape, low_float, high_float, - high_np, low_np, values_np) + log_prob, np_lp, rtol=tolerance, atol=tolerance) + np.testing.assert_allclose(probs, np_p, rtol=tolerance, atol=tolerance) + np.testing.assert_allclose(kl, np_kl, rtol=tolerance, atol=tolerance) - self.executor.run(fluid.default_startup_program()) + def test_normal_distribution_dygraph(self, sample_shape=7, tolerance=1e-6): + paddle.disable_static(self.place) + normal = Normal(self.dynamic_loc, self.dynamic_scale) + + sample = normal.sample([sample_shape]).numpy() + entropy = normal.entropy().numpy() + log_prob = normal.log_prob(self.dynamic_values).numpy() + probs = normal.probs(self.dynamic_values).numpy() + other_normal = Normal(self.dynamic_other_loc, self.dynamic_other_scale) + kl = normal.kl_divergence(other_normal).numpy() - # result calculated by paddle - output_list = self.executor.run(program=test_program, - feed=feed_vars, - fetch_list=fetch_list) - self.compare_uniform_with_numpy(data_list, output_list, batch_size, - dims, sample_shape, tolerance) - - def test_uniform_distribution_dygraph(self, - batch_size=2, - dims=3, - sample_shape=7, - tolerance=1e-6): - """ - Test Uniform's methods in dynamic mode. - - Args: - refer to ``compare_uniform_with_numpy`` function. 
- """ - paddle.disable_static() - - low_np = np.random.randn(batch_size, dims).astype('float32') - low_float = np.random.uniform(-2, 1) - high_float = np.random.uniform(1, 3) - high_np = np.random.uniform(-5.0, 5.0, - (batch_size, dims)).astype('float32') - values_np = np.random.randn(batch_size, dims).astype('float32') - - data_list = [low_np, low_float, high_float, high_np, values_np] - output_list = self.build_uniform_dygraph(batch_size, dims, sample_shape, - low_float, high_float, high_np, - low_np, values_np) - - self.compare_uniform_with_numpy(data_list, output_list, batch_size, - dims, sample_shape, tolerance) + fetch_list = [sample, entropy, log_prob, probs, kl] + self.compare_with_numpy(fetch_list) + + def test_normal_distribution_static(self, sample_shape=7, tolerance=1e-6): paddle.enable_static() + with fluid.program_guard(self.test_program): + normal = Normal(self.static_loc, self.static_scale) + + sample = normal.sample([sample_shape]) + entropy = normal.entropy() + log_prob = normal.log_prob(self.static_values) + probs = normal.probs(self.static_values) + other_normal = Normal(self.static_other_loc, + self.static_other_scale) + kl = normal.kl_divergence(other_normal) + + fetch_list = [sample, entropy, log_prob, probs, kl] + + feed_vars = { + 'loc': self.loc_np, + 'scale': self.scale_np, + 'values': self.values_np, + 'other_loc': self.other_loc_np, + 'other_scale': self.other_scale_np + } + + self.executor.run(fluid.default_startup_program()) + fetch_list = self.executor.run(program=self.test_program, + feed=feed_vars, + fetch_list=fetch_list) + + self.compare_with_numpy(fetch_list) + + +class NormalTest2(NormalTest): + def init_numpy_data(self, batch_size, dims): + # loc ans scale are 'int' + self.loc_np = int((np.random.ranf() - 0.5) * 8) + self.scale_np = int((np.random.ranf() - 0.5) * 8) + while self.scale_np < 0: + self.scale_np = int((np.random.ranf() - 0.5) * 8) + # used to construct another Normal object to calculate kl_divergence + 
self.other_loc_np = int((np.random.ranf() - 0.5) * 8) + self.other_scale_np = int((np.random.ranf() - 0.5) * 8) + while self.other_scale_np < 0: + self.other_scale_np = int((np.random.ranf() - 0.5) * 8) + self.values_np = np.random.ranf(1).astype('float32') + + +class NormalTest3(NormalTest): + def init_numpy_data(self, batch_size, dims): + # test broadcast: loc is float, scale is numpy.ndarray with dtype 'float32'. + self.loc_np = (np.random.ranf() - 0.5) * 4 + self.scale_np = np.random.randn(batch_size, dims).astype('float32') + while not np.all(self.scale_np > 0): + self.scale_np = np.random.randn(batch_size, dims).astype('float32') + self.values_np = np.random.randn(batch_size, dims).astype('float32') + # used to construct another Normal object to calculate kl_divergence + self.other_loc_np = (np.random.ranf() - 0.5) * 4 + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float32') + while not np.all(self.scale_np > 0): + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float32') + + def init_static_data(self, batch_size, dims): + self.static_loc = self.loc_np + self.static_scale = self.scale_np + self.static_other_loc = self.other_loc_np + self.static_other_scale = self.other_scale_np + with fluid.program_guard(self.test_program): + self.static_values = layers.data( + name='values', shape=[dims], dtype='float32') + + +class NormalTest4(NormalTest): + def init_numpy_data(self, batch_size, dims): + # loc and scale are numpy.ndarray with dtype 'float32'. 
+ self.loc_np = np.random.randn(batch_size, dims).astype('float32') + self.scale_np = np.random.randn(batch_size, dims).astype('float32') + while not np.all(self.scale_np > 0): + self.scale_np = np.random.randn(batch_size, dims).astype('float32') + self.values_np = np.random.randn(batch_size, dims).astype('float32') + # used to construct another Normal object to calculate kl_divergence + self.other_loc_np = np.random.randn(batch_size, dims).astype('float32') + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float32') + while not np.all(self.scale_np > 0): + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float32') + + def init_static_data(self, batch_size, dims): + self.static_loc = self.loc_np + self.static_scale = self.scale_np + self.static_other_loc = self.other_loc_np + self.static_other_scale = self.other_scale_np + with fluid.program_guard(self.test_program): + self.static_values = layers.data( + name='values', shape=[dims], dtype='float32') + + +class NormalTest5(NormalTest): + def init_numpy_data(self, batch_size, dims): + # loc and scale are numpy.ndarray with dtype 'float64'. 
+ self.loc_np = np.random.randn(batch_size, dims).astype('float64') + self.scale_np = np.random.randn(batch_size, dims).astype('float64') + while not np.all(self.scale_np > 0): + self.scale_np = np.random.randn(batch_size, dims).astype('float64') + self.values_np = np.random.randn(batch_size, dims).astype('float64') + # used to construct another Normal object to calculate kl_divergence + self.other_loc_np = np.random.randn(batch_size, dims).astype('float64') + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float64') + while not np.all(self.scale_np > 0): + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float64') + + def init_dynamic_data(self, batch_size, dims): + self.dynamic_loc = self.loc_np + self.dynamic_scale = self.scale_np + self.dynamic_other_loc = self.other_loc_np + self.dynamic_other_scale = self.other_scale_np + self.dynamic_values = paddle.to_tensor(self.values_np, dtype='float64') + + def init_static_data(self, batch_size, dims): + self.static_loc = self.loc_np + self.static_scale = self.scale_np + self.static_other_loc = self.other_loc_np + self.static_other_scale = self.other_scale_np + with fluid.program_guard(self.test_program): + self.static_values = layers.data( + name='values', shape=[dims], dtype='float64') + + +class NormalTest6(NormalTest): + def init_data(self, batch_size=2, dims=3): + # loc and scale are Tensor with dtype 'VarType.FP32'. 
+ self.loc_np = np.random.randn(batch_size, dims).astype('float32') + self.scale_np = np.random.randn(batch_size, dims).astype('float32') + while not np.all(self.scale_np > 0): + self.scale_np = np.random.randn(batch_size, dims).astype('float32') + self.values_np = np.random.randn(batch_size, dims).astype('float32') + self.loc = paddle.to_tensor(self.loc_np) + self.scale = paddle.to_tensor(self.scale_np) + self.values = paddle.to_tensor(self.values_np) + # used to construct another Normal object to calculate kl_divergence + self.other_loc_np = np.random.randn(batch_size, dims).astype('float32') + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float32') + while not np.all(self.scale_np > 0): + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float32') + self.other_loc = paddle.to_tensor(self.other_loc_np) + self.other_scale = paddle.to_tensor(self.other_scale_np) + + def init_numpy_data(self, batch_size, dims): + # loc and scale are Tensor with dtype 'VarType.FP32'. 
+ self.loc_np = np.random.randn(batch_size, dims).astype('float32') + self.scale_np = np.random.randn(batch_size, dims).astype('float32') + while not np.all(self.scale_np > 0): + self.scale_np = np.random.randn(batch_size, dims).astype('float32') + self.values_np = np.random.randn(batch_size, dims).astype('float32') + # used to construct another Normal object to calculate kl_divergence + self.other_loc_np = np.random.randn(batch_size, dims).astype('float32') + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float32') + while not np.all(self.scale_np > 0): + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float32') + + def init_dynamic_data(self, batch_size, dims): + self.dynamic_loc = paddle.to_tensor(self.loc_np) + self.dynamic_scale = paddle.to_tensor(self.scale_np) + self.dynamic_values = paddle.to_tensor(self.values_np) + self.dynamic_other_loc = paddle.to_tensor(self.other_loc_np) + self.dynamic_other_scale = paddle.to_tensor(self.other_scale_np) + + def init_static_data(self, batch_size, dims): + with fluid.program_guard(self.test_program): + self.static_loc = layers.data( + name='loc', shape=[dims], dtype='float32') + self.static_scale = layers.data( + name='scale', shape=[dims], dtype='float32') + self.static_values = layers.data( + name='values', shape=[dims], dtype='float32') + self.static_other_loc = layers.data( + name='other_loc', shape=[dims], dtype='float32') + self.static_other_scale = layers.data( + name='other_scale', shape=[dims], dtype='float32') + + +class NormalTest7(NormalTest): + def init_numpy_data(self, batch_size, dims): + # loc and scale are Tensor with dtype 'VarType.FP64'. 
+ self.loc_np = np.random.randn(batch_size, dims).astype('float64') + self.scale_np = np.random.randn(batch_size, dims).astype('float64') + while not np.all(self.scale_np > 0): + self.scale_np = np.random.randn(batch_size, dims).astype('float64') + self.values_np = np.random.randn(batch_size, dims).astype('float64') + # used to construct another Normal object to calculate kl_divergence + self.other_loc_np = np.random.randn(batch_size, dims).astype('float64') + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float64') + while not np.all(self.scale_np > 0): + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float64') + + def init_dynamic_data(self, batch_size, dims): + self.dynamic_loc = paddle.to_tensor(self.loc_np, dtype='float64') + self.dynamic_scale = paddle.to_tensor(self.scale_np, dtype='float64') + self.dynamic_values = paddle.to_tensor(self.values_np, dtype='float64') + self.dynamic_other_loc = paddle.to_tensor( + self.other_loc_np, dtype='float64') + self.dynamic_other_scale = paddle.to_tensor( + self.other_scale_np, dtype='float64') + + def init_static_data(self, batch_size, dims): + with fluid.program_guard(self.test_program): + self.static_loc = layers.data( + name='loc', shape=[dims], dtype='float64') + self.static_scale = layers.data( + name='scale', shape=[dims], dtype='float64') + self.static_values = layers.data( + name='values', shape=[dims], dtype='float64') + self.static_other_loc = layers.data( + name='other_loc', shape=[dims], dtype='float64') + self.static_other_scale = layers.data( + name='other_scale', shape=[dims], dtype='float64') + + +class NormalTest8(NormalTest): + def init_numpy_data(self, batch_size, dims): + # loc and scale are Tensor with dtype 'VarType.FP64'. value's dtype is 'VarType.FP32'. 
+ self.loc_np = np.random.randn(batch_size, dims).astype('float64') + self.scale_np = np.random.randn(batch_size, dims).astype('float64') + while not np.all(self.scale_np > 0): + self.scale_np = np.random.randn(batch_size, dims).astype('float64') + self.values_np = np.random.randn(batch_size, dims).astype('float32') + # used to construct another Normal object to calculate kl_divergence + self.other_loc_np = np.random.randn(batch_size, dims).astype('float64') + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float64') + while not np.all(self.scale_np > 0): + self.other_scale_np = np.random.randn(batch_size, + dims).astype('float64') + + def init_dynamic_data(self, batch_size, dims): + self.dynamic_loc = paddle.to_tensor(self.loc_np, dtype='float64') + self.dynamic_scale = paddle.to_tensor(self.scale_np, dtype='float64') + self.dynamic_values = paddle.to_tensor(self.values_np) + self.dynamic_other_loc = paddle.to_tensor( + self.other_loc_np, dtype='float64') + self.dynamic_other_scale = paddle.to_tensor( + self.other_scale_np, dtype='float64') + + def init_static_data(self, batch_size, dims): + with fluid.program_guard(self.test_program): + self.static_loc = layers.data( + name='loc', shape=[dims], dtype='float64') + self.static_scale = layers.data( + name='scale', shape=[dims], dtype='float64') + self.static_values = layers.data( + name='values', shape=[dims], dtype='float32') + self.static_other_loc = layers.data( + name='other_loc', shape=[dims], dtype='float64') + self.static_other_scale = layers.data( + name='other_scale', shape=[dims], dtype='float64') class DistributionTestError(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_dropout_op.py b/python/paddle/fluid/tests/unittests/test_dropout_op.py index ceec1190279212fbe6f3f128bdd1397cdb9ea1a2..7b9e25e1d4ae8dbb8e4a03d93a7d9c0f9dd18ea6 100644 --- a/python/paddle/fluid/tests/unittests/test_dropout_op.py +++ b/python/paddle/fluid/tests/unittests/test_dropout_op.py @@ 
-40,6 +40,23 @@ class TestDropoutOp(OpTest): self.check_grad(['X'], 'Out') +class TestDropoutOpInput1d(OpTest): + def setUp(self): + self.op_type = "dropout" + self.inputs = {'X': np.random.random((2000, )).astype("float32")} + self.attrs = {'dropout_prob': 0.0, 'fix_seed': True, 'is_test': False} + self.outputs = { + 'Out': self.inputs['X'], + 'Mask': np.ones((2000)).astype('uint8') + } + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['X'], 'Out') + + class TestDropoutOp2(TestDropoutOp): def setUp(self): self.op_type = "dropout" @@ -436,6 +453,13 @@ class TestDropoutFAPIError(unittest.TestCase): self.assertRaises(ValueError, test_axis_max) + def test_axis_min(): + # minimum of axis should greater equal than 0 + x2 = fluid.data(name='x2', shape=[3, 4, 5, 6], dtype="float32") + paddle.nn.functional.dropout(x2, axis=[0, -1]) + + self.assertRaises(ValueError, test_axis_min) + def test_axis_len(): # length of axis should not greater than dimensions of x x2 = fluid.data(name='x2', shape=[3, 4, 5, 6], dtype="float32") @@ -648,9 +672,11 @@ class TestAlphaDropoutFAPI(unittest.TestCase): res1 = paddle.nn.functional.alpha_dropout(x=input, p=0.) res2 = paddle.nn.functional.alpha_dropout( x=input, p=0., training=False) + res3 = paddle.nn.functional.alpha_dropout(x=input, p=1.) 
in_np = np.random.random([40, 40]).astype("float32") res_np = in_np + res_np3 = np.zeros_like(in_np) exe = fluid.Executor(place) res_list = [res1, res2] @@ -659,6 +685,10 @@ class TestAlphaDropoutFAPI(unittest.TestCase): feed={"input": in_np}, fetch_list=[res]) self.assertTrue(np.allclose(fetches[0], res_np)) + fetches = exe.run(fluid.default_main_program(), + feed={"input": in_np}, + fetch_list=[res3]) + self.assertTrue(np.allclose(fetches[0], res_np3)) def test_static(self): for place in self.places: @@ -669,15 +699,18 @@ class TestAlphaDropoutFAPI(unittest.TestCase): with fluid.dygraph.guard(place): in_np = np.random.random([40, 40]).astype("float32") res_np = in_np + res_np3 = np.zeros_like(in_np) input = fluid.dygraph.to_variable(in_np) res1 = paddle.nn.functional.alpha_dropout(x=input, p=0.) res2 = paddle.nn.functional.alpha_dropout( x=input, p=0., training=False) + res3 = paddle.nn.functional.alpha_dropout(x=input, p=1.) res_list = [res1, res2] for res in res_list: self.assertTrue(np.allclose(res.numpy(), res_np)) + self.assertTrue(np.allclose(res3.numpy(), res_np3)) class TestAlphaDropoutFAPIError(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_weight_norm.py b/python/paddle/fluid/tests/unittests/test_dygraph_weight_norm.py index 466226c53fabbd315acd19c6421f210d0ca225c1..a963c2ece0958048b5f0c850184a0930022e6671 100644 --- a/python/paddle/fluid/tests/unittests/test_dygraph_weight_norm.py +++ b/python/paddle/fluid/tests/unittests/test_dygraph_weight_norm.py @@ -121,6 +121,9 @@ class TestDygraphWeightNorm(unittest.TestCase): before_weight = linear.weight.numpy() if self.dim == None: self.dim = -1 + + if self.dim != -1: + self.dim = (self.dim + len(before_weight)) % len(before_weight) wn = weight_norm(linear, dim=self.dim) outputs = [] for name, data in self.data.items(): @@ -158,6 +161,13 @@ class TestDygraphWeightNormCase3(TestDygraphWeightNorm): self.dim = 3 +class TestDygraphWeightNormCase4(TestDygraphWeightNorm): + def 
init_test_case(self): + self.batch_size = 3 + self.data_desc = (['x', [2, 3, 3]], ) + self.dim = -3 + + class TestDygraphRemoveWeightNorm(unittest.TestCase): def setUp(self): self.init_test_case() diff --git a/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py b/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py index b4359fc69ae18b45774af0d2b20c1540bd99da5c..698f914f89984d8c09619a46c6a6b292b00aac9a 100644 --- a/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py +++ b/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py @@ -25,6 +25,7 @@ import bisect import numpy as np fluid.default_startup_program().random_seed = 1 +np.random.seed(1) class TestDyRnnStaticInput(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py index 9ebaf8ff9438be8c8a57815be0798b861d05caaf..3cfbac8b613c125956861f73b1bab24c34e05572 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py @@ -240,124 +240,25 @@ class TestElementwiseDivBroadcast(unittest.TestCase): self.assertEqual((out_result == (2 / x)).all(), True) -class TestDivideAPI(unittest.TestCase): - def setUp(self): - paddle.set_default_dtype("float64") - self.places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda(): - self.places.append(fluid.CUDAPlace(0)) - - def check_static_result(self, place): - # rule 1 - with fluid.program_guard(fluid.Program(), fluid.Program()): - x = fluid.data(name="x", shape=[3], dtype="float64") - y = np.array([1, 2, 3]) - self.assertRaises(TypeError, paddle.divide, x=x, y=y) - - # rule 2: both the inputs are not Tensor - with fluid.program_guard(fluid.Program(), fluid.Program()): - x = 2 - y = 4 - res = paddle.divide(x, y) - exe = fluid.Executor(place) - np_z = exe.run(fluid.default_main_program(), - feed={}, - fetch_list=[res]) - self.assertEqual(np_z[0] == 
0.5, True) - - # rule 3: - with fluid.program_guard(fluid.Program(), fluid.Program()): - x = fluid.data(name="x", shape=[3], dtype="float64") - y = fluid.data(name="y", shape=[3], dtype="float32") - self.assertRaises(TypeError, paddle.divide, x=x, y=y) - - # rule 4: x is Tensor, y is scalar - with fluid.program_guard(fluid.Program(), fluid.Program()): - x = fluid.data(name="x", shape=[3], dtype="float64") - y = 2 - exe = fluid.Executor(place) - res = x / y - np_z = exe.run(fluid.default_main_program(), - feed={"x": np.array([2, 3, 4]).astype('float64')}, - fetch_list=[res]) - z_expected = np.array([1., 1.5, 2.]) - self.assertEqual((np_z[0] == z_expected).all(), True) - - # rule 5: y is Tensor, x is scalar - with fluid.program_guard(fluid.Program(), fluid.Program()): - x = fluid.data(name="x", shape=[3], dtype="float64") - y = 2 - exe = fluid.Executor(place) - res = y / x - np_z = exe.run(fluid.default_main_program(), - feed={"x": np.array([2, 8, 4]).astype('float64')}, - fetch_list=[res]) - z_expected = np.array([1., 0.25, 0.5]) - self.assertEqual((np_z[0] == z_expected).all(), True) - - # rule 6: y is Tensor, x is Tensor - with fluid.program_guard(fluid.Program(), fluid.Program()): - x = fluid.data(name="x", shape=[3], dtype="float64") - y = fluid.data(name="y", shape=[3], dtype="float64") - exe = fluid.Executor(place) - res = x / y - np_z = exe.run(fluid.default_main_program(), - feed={ - "x": np.array([2, 3, 4]).astype('float64'), - "y": np.array([1, 5, 2]).astype('float64') - }, - fetch_list=[res]) - z_expected = np.array([2., 0.6, 2.]) - self.assertEqual((np_z[0] == z_expected).all(), True) +class TestDivideOp(unittest.TestCase): + def test_name(self): + with fluid.program_guard(fluid.Program()): + x = fluid.data(name="x", shape=[2, 3], dtype="float32") + y = fluid.data(name='y', shape=[2, 3], dtype='float32') - def test_static(self): - for place in self.places: - self.check_static_result(place=place) + y_1 = paddle.divide(x, y, name='div_res') + 
self.assertEqual(('div_res' in y_1.name), True) def test_dygraph(self): - for place in self.places: - with fluid.dygraph.guard(place): - # rule 1 : avoid numpy.ndarray - np_x = np.array([2, 3, 4]) - np_y = np.array([1, 5, 2]) - x = paddle.to_tensor(np_x) - self.assertRaises(TypeError, paddle.divide, x=x, y=np_y) - - # rule 2: both the inputs are not Tensor - z = paddle.divide(3, 2) - self.assertEqual(z.numpy()[0] == 1.5, True) - - # rule 3: both the inputs are Tensor - np_x = np.array([2, 3, 4]) - np_y = np.array([1, 5, 2]) - x = paddle.to_tensor(np_x, dtype="float32") - y = paddle.to_tensor(np_y, dtype="float64") - self.assertRaises(TypeError, paddle.divide, x=x, y=y) - - # rule 4: x is Tensor, y is scalar - np_x = np.array([2, 3, 4]) - x = paddle.to_tensor(np_x, dtype="int32") - y = 2 - z = x / y - z_expected = np.array([1., 1.5, 2.]) - self.assertEqual((z_expected == z.numpy()).all(), True) - - # rule 5: y is Tensor, x is scalar - np_x = np.array([2, 1, 4]) - x = paddle.to_tensor(np_x, dtype="int32") - y = 2 - z = y / x - z_expected = np.array([1., 2., 0.5]) - self.assertEqual((z_expected == z.numpy()).all(), True) - - # rule 6: y is Tensor, x is Tensor - np_x = np.array([2, 3, 4]) - np_y = np.array([1, 5, 2]) - x = paddle.to_tensor(np_x) - y = paddle.to_tensor(np_y) - z = x / y - z_expected = np.array([2., 0.6, 2.]) - self.assertEqual((z_expected == z.numpy()).all(), True) + with fluid.dygraph.guard(): + np_x = np.array([2, 3, 4]).astype('float64') + np_y = np.array([1, 5, 2]).astype('float64') + x = paddle.to_tensor(np_x) + y = paddle.to_tensor(np_y) + z = paddle.divide(x, y) + np_z = z.numpy() + z_expected = np.array([2., 0.6, 2.]) + self.assertEqual((np_z == z_expected).all(), True) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py index 0b6acc7615395ed99a484e0e56f9c62447a1f345..007affc14084956870e50e0e1b6b650d13cc3926 100644 --- 
a/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py @@ -58,13 +58,6 @@ class TestElementwiseModOp(OpTest): pass -class TestElementwiseModOpInverse(TestElementwiseModOp): - def init_input_output(self): - self.x = np.random.uniform(0, 10000, [10]).astype(self.dtype) - self.y = np.random.uniform(0, 1000, [10, 10]).astype(self.dtype) - self.out = np.floor_divide(self.x, self.y) - - class TestElementwiseModOp_scalar(TestElementwiseModOp): def init_input_output(self): scale_x = random.randint(0, 100000000) @@ -74,124 +67,32 @@ class TestElementwiseModOp_scalar(TestElementwiseModOp): self.out = np.floor_divide(self.x, self.y) -class TestFloorDivideAPI(unittest.TestCase): - def setUp(self): - paddle.set_default_dtype("float64") - self.places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda(): - self.places.append(fluid.CUDAPlace(0)) - - def check_static_result(self, place): - # rule 1 - with fluid.program_guard(fluid.Program(), fluid.Program()): - x = fluid.data(name="x", shape=[3], dtype="float64") - y = np.array([1, 2, 3]) - self.assertRaises(TypeError, paddle.floor_divide, x=x, y=y) - - # rule 2: both the inputs are not Tensor - with fluid.program_guard(fluid.Program(), fluid.Program()): - x = 2 - y = 4 - res = paddle.floor_divide(x, y) - exe = fluid.Executor(place) - np_z = exe.run(fluid.default_main_program(), - feed={}, - fetch_list=[res]) - self.assertEqual(np_z[0] == 0., True) - - # rule 3: - with fluid.program_guard(fluid.Program(), fluid.Program()): - x = fluid.data(name="x", shape=[3], dtype="float64") - y = fluid.data(name="y", shape=[3], dtype="float32") - self.assertRaises(TypeError, paddle.floor_divide, x=x, y=y) - - # rule 4: x is Tensor, y is scalar - with fluid.program_guard(fluid.Program(), fluid.Program()): - x = fluid.data(name="x", shape=[3], dtype="float64") - y = 2 - exe = fluid.Executor(place) - res = x // y - np_z = exe.run(fluid.default_main_program(), 
- feed={"x": np.array([2, 3, 4]).astype('float64')}, - fetch_list=[res]) - z_expected = np.array([1., 1., 2.]) - self.assertEqual((np_z[0] == z_expected).all(), True) - - # rule 5: y is Tensor, x is scalar - with fluid.program_guard(fluid.Program(), fluid.Program()): - x = fluid.data(name="x", shape=[3], dtype="float64") - y = 2 - exe = fluid.Executor(place) - res = y // x - np_z = exe.run(fluid.default_main_program(), - feed={"x": np.array([2, 8, 4]).astype('float64')}, - fetch_list=[res]) - z_expected = np.array([1., 0., 0.]) - self.assertEqual((np_z[0] == z_expected).all(), True) - - # rule 6: y is Tensor, x is Tensor - with fluid.program_guard(fluid.Program(), fluid.Program()): - x = fluid.data(name="x", shape=[3], dtype="float64") - y = fluid.data(name="y", shape=[3], dtype="float64") - exe = fluid.Executor(place) - res = x // y - np_z = exe.run(fluid.default_main_program(), - feed={ - "x": np.array([2, 3, 4]).astype('float64'), - "y": np.array([1, 5, 2]).astype('float64') - }, - fetch_list=[res]) - z_expected = np.array([2., 0., 2.]) - self.assertEqual((np_z[0] == z_expected).all(), True) - - def test_static(self): - for place in self.places: - self.check_static_result(place=place) +class TestElementwiseModOpInverse(TestElementwiseModOp): + def init_input_output(self): + self.x = np.random.uniform(0, 10000, [10]).astype(self.dtype) + self.y = np.random.uniform(0, 1000, [10, 10]).astype(self.dtype) + self.out = np.floor_divide(self.x, self.y) + + +class TestFloorDivideOp(unittest.TestCase): + def test_name(self): + with fluid.program_guard(fluid.Program()): + x = fluid.data(name="x", shape=[2, 3], dtype="int64") + y = fluid.data(name='y', shape=[2, 3], dtype='int64') + + y_1 = paddle.floor_divide(x, y, name='div_res') + self.assertEqual(('div_res' in y_1.name), True) def test_dygraph(self): - for place in self.places: - with fluid.dygraph.guard(place): - # rule 1 : avoid numpy.ndarray - np_x = np.array([2, 3, 4]) - np_y = np.array([1, 5, 2]) - x = 
paddle.to_tensor(np_x) - self.assertRaises(TypeError, paddle.floor_divide, x=x, y=np_y) - - # rule 2: both the inputs are not Tensor - z = paddle.floor_divide(3, 2) - self.assertEqual(z.numpy()[0] == 1., True) - - # rule 3: both the inputs are Tensor - np_x = np.array([2, 3, 4]) - np_y = np.array([1, 5, 2]) - x = paddle.to_tensor(np_x, dtype="float32") - y = paddle.to_tensor(np_y, dtype="float64") - self.assertRaises(TypeError, paddle.floor_divide, x=x, y=y) - - # rule 4: x is Tensor, y is scalar - np_x = np.array([2, 3, 4]) - x = paddle.to_tensor(np_x, dtype="int32") - y = 2 - z = x // y - z_expected = np.array([1, 1, 2]) - self.assertEqual((z_expected == z.numpy()).all(), True) - - # rule 5: y is Tensor, x is scalar - np_x = np.array([2, 1, 4]) - x = paddle.to_tensor(np_x, dtype="int32") - y = 2 - z = y // x - z_expected = np.array([1, 2, 0]) - self.assertEqual((z_expected == z.numpy()).all(), True) - - # rule 6: y is Tensor, x is Tensor - np_x = np.array([2, 3, 4]) - np_y = np.array([1, 5, 2]) - x = paddle.to_tensor(np_x) - y = paddle.to_tensor(np_y) - z = x // y - z_expected = np.array([2., 0., 2.]) - self.assertEqual((z_expected == z.numpy()).all(), True) + with fluid.dygraph.guard(): + np_x = np.array([2, 3, 8, 7]).astype('int64') + np_y = np.array([1, 5, 3, 3]).astype('int64') + x = paddle.to_tensor(np_x) + y = paddle.to_tensor(np_y) + z = paddle.floor_divide(x, y) + np_z = z.numpy() + z_expected = np.array([2, 0, 2, 2]) + self.assertEqual((np_z == z_expected).all(), True) with fluid.dygraph.guard(fluid.CPUPlace()): # divide by zero diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py index f5d8b4f704da8acd97475444346522f63d3724fd..2a8ca51693ecfad55f2239d7619e355c6dd7f3f8 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py @@ -84,141 +84,41 @@ class 
TestElementwiseModOpDouble(TestElementwiseModOpFloat): self.dtype = np.float64 -class TestRemainderAPI(unittest.TestCase): - def setUp(self): - paddle.set_default_dtype("float64") - self.places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda(): - self.places.append(fluid.CUDAPlace(0)) - - def check_static_result(self, place): - # rule 1 - with fluid.program_guard(fluid.Program(), fluid.Program()): - x = fluid.data(name="x", shape=[3], dtype="float64") - y = np.array([1, 2, 3]) - self.assertRaises(TypeError, paddle.remainder, x=x, y=y) - - # rule 3: - with fluid.program_guard(fluid.Program(), fluid.Program()): - x = fluid.data(name="x", shape=[3], dtype="float64") - y = fluid.data(name="y", shape=[3], dtype="float32") - self.assertRaises(TypeError, paddle.remainder, x=x, y=y) - - # rule 4: x is Tensor, y is scalar - with fluid.program_guard(fluid.Program(), fluid.Program()): - x = fluid.data(name="x", shape=[3], dtype="float64") - y = 2 - exe = fluid.Executor(place) - res = x % y - np_z = exe.run(fluid.default_main_program(), - feed={"x": np.array([2, 3, 4]).astype('float64')}, - fetch_list=[res]) - z_expected = np.array([0., 1., 0.]) - self.assertEqual((np_z[0] == z_expected).all(), True) - - # rule 5: y is Tensor, x is scalar - with fluid.program_guard(fluid.Program(), fluid.Program()): - x = 3 - y = fluid.data(name="y", shape=[3], dtype="float32") - self.assertRaises(TypeError, paddle.remainder, x=x, y=y) - - # rule 6: y is Tensor, x is Tensor - with fluid.program_guard(fluid.Program(), fluid.Program()): - x = fluid.data(name="x", shape=[3], dtype="float64") - y = fluid.data(name="y", shape=[1], dtype="float64") - exe = fluid.Executor(place) - res = x % y - np_z = exe.run(fluid.default_main_program(), - feed={ - "x": np.array([1., 2., 4]).astype('float64'), - "y": np.array([1.5]).astype('float64') - }, - fetch_list=[res]) - z_expected = np.array([1., 0.5, 1.0]) - self.assertEqual((np_z[0] == z_expected).all(), True) - - # rule 6: y is Tensor, x is Tensor - 
with fluid.program_guard(fluid.Program(), fluid.Program()): - x = fluid.data(name="x", shape=[6], dtype="float64") - y = fluid.data(name="y", shape=[1], dtype="float64") - exe = fluid.Executor(place) - res = x % y - np_z = exe.run( - fluid.default_main_program(), - feed={ - "x": np.array([-3., -2, -1, 1, 2, 3]).astype('float64'), - "y": np.array([2]).astype('float64') - }, - fetch_list=[res]) - z_expected = np.array([1., 0., 1., 1., 0., 1.]) - self.assertEqual((np_z[0] == z_expected).all(), True) - - def test_static(self): - for place in self.places: - self.check_static_result(place=place) +class TestRemainderOp(unittest.TestCase): + def test_name(self): + with fluid.program_guard(fluid.Program()): + x = fluid.data(name="x", shape=[2, 3], dtype="int64") + y = fluid.data(name='y', shape=[2, 3], dtype='int64') + + y_1 = paddle.remainder(x, y, name='div_res') + self.assertEqual(('div_res' in y_1.name), True) def test_dygraph(self): - for place in self.places: - with fluid.dygraph.guard(place): - # rule 1 : avoid numpy.ndarray - np_x = np.array([2, 3, 4]) - np_y = np.array([1, 5, 2]) - x = paddle.to_tensor(np_x) - self.assertRaises(TypeError, paddle.remainder, x=x, y=np_y) - - # rule 3: both the inputs are Tensor - np_x = np.array([2, 3, 4]) - np_y = np.array([1, 5, 2]) - x = paddle.to_tensor(np_x, dtype="float32") - y = paddle.to_tensor(np_y, dtype="float64") - self.assertRaises(TypeError, paddle.remainder, x=x, y=y) - - # rule 4: x is Tensor, y is scalar - np_x = np.array([2, 3, 4]) - x = paddle.to_tensor(np_x, dtype="int32") - y = 2 - z = x % y - z_expected = np.array([0, 1, 0]) - self.assertEqual((z_expected == z.numpy()).all(), True) - - # rule 5: y is Tensor, x is scalar - np_x = np.array([2, 3, 4]) - x = paddle.to_tensor(np_x) - self.assertRaises(TypeError, paddle.remainder, x=3, y=x) - - # rule 6: y is Tensor, x is Tensor - np_x = np.array([1., 2., 4]) - np_y = np.array([1.5]) - x = paddle.to_tensor(np_x) - y = paddle.to_tensor(np_y) - z = x % y - z_expected = 
np.array([1., 0.5, 1.0]) - self.assertEqual((z_expected == z.numpy()).all(), True) - - # rule 6: y is Tensor, x is Tensor - np_x = np.array([-3., -2, -1, 1, 2, 3]) - np_y = np.array([2.]) - x = paddle.to_tensor(np_x) - y = paddle.to_tensor(np_y) - z = x % y - z_expected = np.array([1., 0., 1., 1., 0., 1.]) - self.assertEqual((z_expected == z.numpy()).all(), True) - - np_x = np.array([-3.3, 11.5, -2, 3.5]) - np_y = np.array([-1.2, 2., 3.3, -2.3]) - x = paddle.to_tensor(np_x) - y = paddle.to_tensor(np_y) - z = x % y - z_expected = np.array([-0.9, 1.5, 1.3, -1.1]) - self.assertEqual(np.allclose(z_expected, z.numpy()), True) - - np_x = np.array([-3, 11, -2, 3]) - np_y = np.array([-1, 2, 3, -2]) - x = paddle.to_tensor(np_x, dtype="int64") - y = paddle.to_tensor(np_y, dtype="int64") - z = x % y - z_expected = np.array([0, 1, 1, -1]) - self.assertEqual(np.allclose(z_expected, z.numpy()), True) + with fluid.dygraph.guard(): + np_x = np.array([2, 3, 8, 7]).astype('int64') + np_y = np.array([1, 5, 3, 3]).astype('int64') + x = paddle.to_tensor(np_x) + y = paddle.to_tensor(np_y) + z = paddle.remainder(x, y) + np_z = z.numpy() + z_expected = np.array([0, 3, 2, 1]) + self.assertEqual((np_z == z_expected).all(), True) + + np_x = np.array([-3.3, 11.5, -2, 3.5]) + np_y = np.array([-1.2, 2., 3.3, -2.3]) + x = paddle.to_tensor(np_x) + y = paddle.to_tensor(np_y) + z = x % y + z_expected = np.array([-0.9, 1.5, 1.3, -1.1]) + self.assertEqual(np.allclose(z_expected, z.numpy()), True) + + np_x = np.array([-3, 11, -2, 3]) + np_y = np.array([-1, 2, 3, -2]) + x = paddle.to_tensor(np_x, dtype="int64") + y = paddle.to_tensor(np_y, dtype="int64") + z = x % y + z_expected = np.array([0, 1, 1, -1]) + self.assertEqual(np.allclose(z_expected, z.numpy()), True) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_empty_like_op.py b/python/paddle/fluid/tests/unittests/test_empty_like_op.py new file mode 100644 index 
0000000000000000000000000000000000000000..32d732d9a809950ade5484431b833056336acd54 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_empty_like_op.py @@ -0,0 +1,192 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.fluid as fluid +from paddle.fluid.data_feeder import convert_dtype +import paddle.fluid.core as core +from paddle.static import program_guard, Program + + +class TestEmptyLikeAPICommon(unittest.TestCase): + def __check_out__(self, out): + data_type = convert_dtype(out.dtype) + self.assertEqual(data_type, self.dst_dtype, + 'dtype should be %s, but get %s' % + (self.dst_dtype, data_type)) + + shape = out.shape + self.assertTupleEqual(shape, self.dst_shape, + 'shape should be %s, but get %s' % + (self.dst_shape, shape)) + + if data_type in ['float32', 'float64', 'int32', 'int64']: + max_value = np.nanmax(out) + min_value = np.nanmin(out) + always_non_full_zero = max_value > min_value + always_full_zero = max_value == 0.0 and min_value == 0.0 + self.assertTrue(always_full_zero or always_non_full_zero, + 'always_full_zero or always_non_full_zero.') + elif data_type in ['bool']: + total_num = out.size + true_num = np.sum(out == True) + false_num = np.sum(out == False) + self.assertTrue(total_num == true_num + false_num, + 'The value should always be True or False.') + else: + 
self.assertTrue(False, 'invalid data type') + + +class TestEmptyLikeAPI(TestEmptyLikeAPICommon): + def setUp(self): + self.init_config() + + def test_dygraph_api_out(self): + paddle.disable_static() + out = paddle.empty_like(self.x, self.dtype) + self.__check_out__(out.numpy()) + paddle.enable_static() + + def init_config(self): + self.x = np.random.random((200, 3)).astype("float32") + self.dtype = self.x.dtype + self.dst_shape = self.x.shape + self.dst_dtype = self.dtype + + +class TestEmptyLikeAPI2(TestEmptyLikeAPI): + def init_config(self): + self.x = np.random.random((200, 3)).astype("float64") + self.dtype = self.x.dtype + self.dst_shape = self.x.shape + self.dst_dtype = self.dtype + + +class TestEmptyLikeAPI3(TestEmptyLikeAPI): + def init_config(self): + self.x = np.random.random((200, 3)).astype("int") + self.dtype = self.x.dtype + self.dst_shape = self.x.shape + self.dst_dtype = self.dtype + + +class TestEmptyLikeAPI4(TestEmptyLikeAPI): + def init_config(self): + self.x = np.random.random((200, 3)).astype("int64") + self.dtype = self.x.dtype + self.dst_shape = self.x.shape + self.dst_dtype = self.dtype + + +class TestEmptyLikeAPI5(TestEmptyLikeAPI): + def init_config(self): + self.x = np.random.random((200, 3)).astype("bool") + self.dtype = self.x.dtype + self.dst_shape = self.x.shape + self.dst_dtype = self.dtype + + +class TestEmptyLikeAPI6(TestEmptyLikeAPI): + def init_config(self): + self.x = np.random.random((200, 3)).astype("float64") + self.dtype = "float32" + self.dst_shape = self.x.shape + self.dst_dtype = self.dtype + + +class TestEmptyLikeAPI7(TestEmptyLikeAPI): + def init_config(self): + self.x = np.random.random((200, 3)).astype("int") + self.dtype = "float32" + self.dst_shape = self.x.shape + self.dst_dtype = self.dtype + + +class TestEmptyLikeAPI8(TestEmptyLikeAPI): + def init_config(self): + self.x = np.random.random((200, 3)).astype("int64") + self.dtype = "float32" + self.dst_shape = self.x.shape + self.dst_dtype = self.dtype + + +class 
TestEmptyLikeAPI9(TestEmptyLikeAPI): + def init_config(self): + self.x = np.random.random((200, 3)).astype("bool") + self.dtype = "float32" + self.dst_shape = self.x.shape + self.dst_dtype = self.dtype + + +class TestEmptyLikeAPI10(TestEmptyLikeAPI): + def init_config(self): + self.x = np.random.random((200, 3)).astype("float32") + self.dtype = "bool" + self.dst_shape = self.x.shape + self.dst_dtype = self.dtype + + +class TestEmptyLikeAPI_Static(TestEmptyLikeAPICommon): + def setUp(self): + self.init_config() + + def test_static_graph(self): + dtype = 'float32' + + train_program = Program() + startup_program = Program() + + with program_guard(train_program, startup_program): + x = np.random.random(self.x_shape).astype(dtype) + data_x = paddle.static.data( + 'x', shape=self.data_x_shape, dtype=dtype) + + out = paddle.empty_like(data_x) + + place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda( + ) else paddle.CPUPlace() + exe = paddle.static.Executor(place) + res = exe.run(train_program, feed={'x': x}, fetch_list=[out]) + + self.dst_dtype = dtype + self.dst_shape = x.shape + self.__check_out__(res[0]) + + def init_config(self): + self.x_shape = (200, 3) + self.data_x_shape = [200, 3] + + +class TestEmptyLikeAPI_Static2(TestEmptyLikeAPI_Static): + def init_config(self): + self.x_shape = (3, 200, 3) + self.data_x_shape = [-1, 200, 3] + + +class TestEmptyError(unittest.TestCase): + def test_attr(self): + def test_dtype(): + x = np.random.random((200, 3)).astype("float64") + dtype = 'uint8' + result = paddle.empty_like(x, dtype=dtype) + + self.assertRaises(TypeError, test_dtype) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_empty_op.py b/python/paddle/fluid/tests/unittests/test_empty_op.py new file mode 100644 index 0000000000000000000000000000000000000000..e8b1f836fcaa8d53671307d9075efd45fc88ce7b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_empty_op.py @@ -0,0 +1,270 @@ +#Copyright (c) 2020 
PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.fluid as fluid +from op_test import OpTest +from paddle.fluid import Program, program_guard +from paddle.fluid.framework import convert_np_dtype_to_dtype_ + + +# Situation 1: Attr(shape) is a list(without tensor) +class TestEmptyOp(OpTest): + def setUp(self): + self.op_type = "empty" + self.init_config() + + def test_check_output(self): + self.check_output_customized(self.verify_output) + + def verify_output(self, outs): + data_type = outs[0].dtype + if data_type in ['float32', 'float64', 'int32', 'int64']: + max_value = np.nanmax(outs[0]) + min_value = np.nanmin(outs[0]) + + always_full_zero = max_value == 0.0 and min_value == 0.0 + always_non_full_zero = max_value > min_value + self.assertTrue(always_full_zero or always_non_full_zero, + 'always_full_zero or always_non_full_zero.') + elif data_type in ['bool']: + total_num = outs[0].size + true_num = np.sum(outs[0] == True) + false_num = np.sum(outs[0] == False) + self.assertTrue(total_num == true_num + false_num, + 'The value should always be True or False.') + else: + self.assertTrue(False, 'invalid data type') + + def init_config(self): + shape = [500, 3] + dtype = 'float32' + dtype_inner = convert_np_dtype_to_dtype_(dtype) + self.attrs = {'shape': shape, 'dtype': dtype_inner} + self.inputs = {} + self.outputs = 
{'Out': np.zeros(shape).astype(dtype)} + + +class TestEmptyOp2(TestEmptyOp): + def init_config(self): + shape = [500, 3] + dtype = 'float64' + dtype_inner = convert_np_dtype_to_dtype_(dtype) + self.attrs = {'shape': shape, 'dtype': dtype_inner} + self.inputs = {} + self.outputs = {'Out': np.zeros(shape).astype(dtype)} + + +class TestEmptyOp3(TestEmptyOp): + def init_config(self): + shape = [500, 3] + dtype = 'int32' + dtype_inner = convert_np_dtype_to_dtype_(dtype) + self.attrs = {'shape': shape, 'dtype': dtype_inner} + self.inputs = {} + self.outputs = {'Out': np.zeros(shape).astype(dtype)} + + +class TestEmptyOp4(TestEmptyOp): + def init_config(self): + shape = [500, 3] + dtype = 'int64' + dtype_inner = convert_np_dtype_to_dtype_(dtype) + self.attrs = {'shape': shape, 'dtype': dtype_inner} + self.inputs = {} + self.outputs = {'Out': np.zeros(shape).astype(dtype)} + + +class TestEmptyOp5(TestEmptyOp): + def init_config(self): + shape = [500, 3] + dtype = 'bool' + dtype_inner = convert_np_dtype_to_dtype_(dtype) + self.attrs = {'shape': shape, 'dtype': dtype_inner} + self.inputs = {} + self.outputs = {'Out': np.zeros(shape).astype(dtype)} + + +# Situation 2: shape is a tensor +class TestEmptyOp_ShapeTensor(OpTest): + def setUp(self): + self.op_type = "empty" + self.init_config() + + def init_config(self): + self.shape = [500, 3] + dtype = 'float32' + dtype_inner = convert_np_dtype_to_dtype_(dtype) + self.attrs = {'dtype': dtype_inner} + self.inputs = {"ShapeTensor": np.array(self.shape).astype("int32")} + self.outputs = {'Out': np.zeros(self.shape).astype(dtype)} + + def test_check_output(self): + self.check_output_customized(self.verify_output) + + def verify_output(self, outs): + data_type = outs[0].dtype + if data_type in ['float32', 'float64', 'int32', 'int64']: + max_value = np.nanmax(outs[0]) + min_value = np.nanmin(outs[0]) + + always_full_zero = max_value == 0.0 and min_value == 0.0 + always_non_full_zero = max_value > min_value + 
self.assertTrue(always_full_zero or always_non_full_zero, + 'always_full_zero or always_non_full_zero.') + elif data_type in ['bool']: + total_num = outs[0].size + true_num = np.sum(outs[0] == True) + false_num = np.sum(outs[0] == False) + self.assertTrue(total_num == true_num + false_num, + 'The value should always be True or False.') + else: + self.assertTrue(False, 'invalid data type') + + +# Situation 3: Attr(shape) is a list(with tensor) +class TestEmptyOp_ShapeTensorList(OpTest): + def setUp(self): + self.op_type = "empty" + self.init_config() + + def init_config(self): + self.shape = [123, 92] + self.infer_shape = [-1, 92] + + dtype = 'float32' + dtype_inner = convert_np_dtype_to_dtype_(dtype) + + shape_tensor_list = [] + for index, ele in enumerate(self.shape): + shape_tensor_list.append(("x" + str(index), np.ones( + (1)).astype('int32') * ele)) + + self.inputs = {"ShapeTensorList": shape_tensor_list} + self.attrs = {'shape': self.infer_shape, 'dtype': dtype_inner} + self.outputs = {'Out': np.zeros(self.shape).astype(dtype)} + + def test_check_output(self): + self.check_output_customized(self.verify_output) + + def verify_output(self, outs): + data_type = outs[0].dtype + if data_type in ['float32', 'float64', 'int32', 'int64']: + max_value = np.nanmax(outs[0]) + min_value = np.nanmin(outs[0]) + + always_full_zero = max_value == 0.0 and min_value == 0.0 + always_non_full_zero = max_value > min_value + self.assertTrue(always_full_zero or always_non_full_zero, + 'always_full_zero or always_non_full_zero.') + elif data_type in ['bool']: + total_num = outs[0].size + true_num = np.sum(outs[0] == True) + false_num = np.sum(outs[0] == False) + self.assertTrue(total_num == true_num + false_num, + 'The value should always be True or False.') + else: + self.assertTrue(False, 'invalid data type') + + +class TestEmptyAPI(unittest.TestCase): + def __check_out__(self, out, dtype='float32'): + max_value = np.nanmax(np.array(out)) + min_value = np.nanmin(np.array(out)) + 
always_non_full_zero = max_value > min_value + always_full_zero = max_value == 0.0 and min_value == 0.0 + self.assertTrue(always_full_zero or always_non_full_zero, + 'always_full_zero or always_non_full_zero.') + + def test_dygraph_api_out(self): + paddle.disable_static() + shape = [200, 3] + out = paddle.empty(shape=shape) + self.__check_out__(out) + paddle.enable_static() + + def test_dygraph_api_out_2(self): + paddle.disable_static() + shape_data = np.array([200, 3]).astype('int32') + shape = paddle.to_tensor(shape_data) + out = paddle.empty(shape=shape) + self.__check_out__(out) + paddle.enable_static() + + def test_dygraph_api_out_3(self): + paddle.disable_static() + shape_data = np.array([200, 3]).astype('int64') + shape = paddle.to_tensor(shape_data) + out = paddle.empty(shape=shape) + self.__check_out__(out) + paddle.enable_static() + + def test_dygraph_api_attr(self): + paddle.disable_static() + shape = [200, 3] + dtype = 'float64' + out = paddle.empty(shape=shape, dtype=dtype) + self.__check_out__(out, dtype) + paddle.enable_static() + + def test_static_graph(self): + dtype = 'float64' + + positive_2_int32 = fluid.layers.fill_constant([1], "int32", 3) + positive_2_int64 = fluid.layers.fill_constant([1], "int64", 3) + + shape_tensor_int32 = fluid.data( + name="shape_tensor_int32", shape=[2], dtype="int32") + shape_tensor_int64 = fluid.data( + name="shape_tensor_int64", shape=[2], dtype="int64") + + out_1 = paddle.empty(shape=[200, 3], dtype=dtype) + out_2 = paddle.empty(shape=shape_tensor_int32, dtype=dtype) + out_3 = paddle.empty(shape=shape_tensor_int64, dtype=dtype) + out_4 = paddle.empty(shape=[200, positive_2_int32], dtype=dtype) + out_5 = paddle.empty(shape=[200, positive_2_int64], dtype=dtype) + + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + res_1, res_2, res_3, res_4, res_5 = exe.run( + fluid.default_main_program(), + feed={ + "shape_tensor_int32": np.array([200, 3]).astype("int32"), + "shape_tensor_int64": np.array([200, 
3]).astype("int64"), + }, + fetch_list=[out_1, out_2, out_3, out_4, out_5]) + + self.__check_out__(res_1, dtype) + self.__check_out__(res_2, dtype) + self.__check_out__(res_3, dtype) + self.__check_out__(res_4, dtype) + self.__check_out__(res_5, dtype) + + +class TestEmptyError(unittest.TestCase): + def test_attr(self): + def test_dtype(): + shape = [200, 3] + dtype = 'uint8' + result = paddle.empty(shape=shape, dtype=dtype) + + self.assertRaises(TypeError, test_dtype) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py index 7835fd3f53ddb7f9a95313c6cc5fc7b72ae6d664..01f0abe0f217c342c4ea14cb55b4c40b5d273284 100644 --- a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py +++ b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py @@ -306,5 +306,70 @@ class TestFakeQuantDequantAbsOp(OpTest): self.check_grad(["X"], "Out", user_defined_grads=gradient) +class TestChannelWiseFakeQuantDequantOp(OpTest): + def setUp(self): + self.set_arg() + assert self.quant_axis in [0, 1], "quant_axis should be 0 or 1." 
+ + self.op_type = "fake_channel_wise_quantize_dequantize_abs_max" + self.attrs = {'bit_length': 8, 'quant_axis': self.quant_axis} + + scales = [] + outputs = self.inputs['X'].copy() + range_v = (1 << (self.attrs['bit_length'] - 1)) - 1 + if self.quant_axis == 0: + for i in range(self.inputs['X'].shape[0]): + scale_v = np.max(np.abs(self.inputs['X'][i])).astype("float32") + scales.append(scale_v) + outputs[i] = np.round(outputs[i] * range_v / + scale_v) * scale_v / range_v + elif self.quant_axis == 1: + for i in range(self.inputs['X'].shape[1]): + scale_v = np.max(np.abs(self.inputs['X'][:, i])).astype( + "float32") + scales.append(scale_v) + outputs[:, i] = np.round(outputs[:, i] * range_v / + scale_v) * scale_v / range_v + + self.outputs = { + 'Out': outputs, + 'OutScale': np.array(scales).astype("float32"), + } + + def set_arg(self): + self.quant_axis = 0 + self.inputs = { + 'X': np.random.random((3, 4, 64, 64)).astype("float32"), + } + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + x = self.inputs["X"] + gradient = [np.ones(x.shape) / np.product(x.shape)] + self.check_grad(["X"], "Out", user_defined_grads=gradient) + + +class TestChannelWiseFakeQuantDequantOp1(TestChannelWiseFakeQuantDequantOp): + def set_arg(self): + self.quant_axis = 1 + self.inputs = { + 'X': np.random.random((15, 20, 5, 5)).astype("float32"), + } + + +class TestChannelWiseFakeQuantDequantOp2(TestChannelWiseFakeQuantDequantOp): + def set_arg(self): + self.quant_axis = 0 + self.inputs = {'X': np.random.random((30, 15)).astype("float32"), } + + +class TestChannelWiseFakeQuantDequantOp3(TestChannelWiseFakeQuantDequantOp): + def set_arg(self): + self.quant_axis = 1 + self.inputs = {'X': np.random.random((30, 15)).astype("float32"), } + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fetch_unmerged.py b/python/paddle/fluid/tests/unittests/test_fetch_unmerged.py index 
1181272bd98b00f65e6925b44da814662f96045f..37d269e3369bfe7db00529dea5e08b287151691a 100644 --- a/python/paddle/fluid/tests/unittests/test_fetch_unmerged.py +++ b/python/paddle/fluid/tests/unittests/test_fetch_unmerged.py @@ -28,7 +28,7 @@ class TestFetchUnmerged(unittest.TestCase): conv_pool_1 = fluid.nets.simple_img_conv_pool( input=img, filter_size=5, - num_filters=20, + num_filters=8, pool_size=2, pool_stride=2, pool_type='max', @@ -37,12 +37,12 @@ class TestFetchUnmerged(unittest.TestCase): conv_pool_2 = fluid.nets.simple_img_conv_pool( input=conv_pool_1, filter_size=5, - num_filters=50, + num_filters=16, pool_size=2, pool_stride=2, pool_type='avg', act="relu") - hidden = fluid.layers.fc(input=conv_pool_2, size=100, act='relu') + hidden = fluid.layers.fc(input=conv_pool_2, size=32, act='relu') prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=prediction, label=label) avg_loss = fluid.layers.mean(loss) @@ -75,8 +75,8 @@ class TestFetchUnmerged(unittest.TestCase): binary = fluid.CompiledProgram(main_program).with_data_parallel( loss_name=loss.name, build_strategy=build_strategy) - iters = 3 - batch_size = 64 + iters = 2 + batch_size = 16 train_reader = paddle.batch( paddle.reader.shuffle( paddle.dataset.mnist.train(), buf_size=500), diff --git a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py index 3475320eeebc55a14dd569410610b70ae35e65a3..43069470680c7d49071ce54bf3649962c56f06ea 100644 --- a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py +++ b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py @@ -350,6 +350,14 @@ class TestFillConstantOpError(unittest.TestCase): dtype='int16', out=x1) + self.assertRaises( + TypeError, + fluid.layers.fill_constant, + shape=[1.1], + value=5, + dtype='float32', + out=x1) + # The argument dtype of fill_constant_op must be one of bool, float16, #float32, float64, int32 or int64 
x2 = fluid.layers.data(name='x2', shape=[1], dtype="int32") diff --git a/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py index 38c3903306e6e76188cdb50476d6797814c434e9..73e014b35008ff5a0539c6a338755b9dc2cf68d4 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_amp_meta_optimizer.py @@ -57,7 +57,7 @@ class TestFleetAMPOptimizer(unittest.TestCase): ops = [op.type for op in avg_cost.block.ops] self.assertIn('cast', ops) - self.assertIn('isfinite', ops) + self.assertIn('check_finite_and_unscale', ops) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_fleet_auto.py b/python/paddle/fluid/tests/unittests/test_fleet_auto.py new file mode 100644 index 0000000000000000000000000000000000000000..020f2f4db382ef1277167d85917e8fdba9c83893 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fleet_auto.py @@ -0,0 +1,51 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import paddle +import os +import paddle.distributed.fleet as fleet +import paddle.distributed.fleet.base.role_maker as role_maker + + +class TestDistributedStrategyAuto(unittest.TestCase): + def setUp(self): + os.environ["POD_IP"] = "127.0.0.1" + os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001" + os.environ["PADDLE_TRAINERS_NUM"] = "2" + os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \ + "127.0.0.1:36001,127.0.0.2:36001" + + def test_distributed_strategy_auto(self): + fleet.init(is_collective=True) + input_x = paddle.fluid.layers.data( + name="x", shape=[32], dtype='float32') + input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64') + + fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') + fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh') + prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') + cost = paddle.fluid.layers.cross_entropy( + input=prediction, label=input_y) + avg_cost = paddle.fluid.layers.mean(x=cost) + + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.auto = True + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_base.py b/python/paddle/fluid/tests/unittests/test_fleet_base.py index 9e651dea24ba7f35f3785093da8ac73dde07be5a..45597e7253c4d5bab50aa58f5f58e13e89ce1c1e 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_base.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_base.py @@ -18,15 +18,16 @@ import paddle.distributed.fleet as fleet import paddle.distributed.fleet.base.role_maker as role_maker import os import paddle.fluid as fluid +import numpy as np class TestFleetBase(unittest.TestCase): def setUp(self): os.environ["POD_IP"] = "127.0.0.1" - os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001" + 
os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36000" os.environ["PADDLE_TRAINERS_NUM"] = "2" os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \ - "127.0.0.1:36001,127.0.0.2:36001" + "127.0.0.1:36001,127.0.0.2:36002" def test_init(self): role = role_maker.PaddleCloudRoleMaker(is_collective=True) @@ -57,37 +58,56 @@ class TestFleetBase(unittest.TestCase): def test_worker_endpoints(self): role = role_maker.PaddleCloudRoleMaker(is_collective=True) fleet.init(role) - print(fleet.worker_endpoints(to_string=True)) + self.assertEqual( + "127.0.0.1:36000", fleet.worker_endpoints(to_string=True)) + self.assertEqual(["127.0.0.1:36000"], fleet.worker_endpoints()) def test_server_num(self): - role = role_maker.PaddleCloudRoleMaker(is_collective=True) + os.environ["TRAINING_ROLE"] = "PSERVER" + os.environ["PADDLE_PORT"] = "36001" + os.environ["POD_IP"] = "127.0.0.1" + + role = role_maker.PaddleCloudRoleMaker() fleet.init(role) - if fleet.is_server(): - print("fleet server num: {}".format(fleet.server_num())) + os.environ["PADDLE_TRAINERS_NUM"] = "2" + self.assertEqual(2, fleet.server_num()) def test_server_index(self): - role = role_maker.PaddleCloudRoleMaker(is_collective=True) + os.environ["TRAINING_ROLE"] = "PSERVER" + os.environ["PADDLE_PORT"] = "36001" + os.environ["POD_IP"] = "127.0.0.1" + + role = role_maker.PaddleCloudRoleMaker() fleet.init(role) - if fleet.is_server(): - print("fleet server index: {}".format(fleet.server_index())) + self.assertEqual(0, fleet.server_index()) def test_server_endpoints(self): - role = role_maker.PaddleCloudRoleMaker(is_collective=True) + os.environ["TRAINING_ROLE"] = "PSERVER" + os.environ["PADDLE_PORT"] = "36001" + os.environ["POD_IP"] = "127.0.0.1" + + role = role_maker.PaddleCloudRoleMaker() fleet.init(role) if fleet.is_server(): - print("fleet server index: {}".format( - fleet.server_endpoints(to_string=True))) + self.assertEqual( + "127.0.0.1:36001,127.0.0.2:36002", + fleet.server_endpoints(to_string=True)) + 
self.assertEqual(["127.0.0.1:36001", "127.0.0.2:36002"], + fleet.server_endpoints()) def test_is_server(self): - role = role_maker.PaddleCloudRoleMaker(is_collective=True) + os.environ["TRAINING_ROLE"] = "PSERVER" + os.environ["PADDLE_PORT"] = "36001" + os.environ["POD_IP"] = "127.0.0.1" + + role = role_maker.PaddleCloudRoleMaker() fleet.init(role) - if fleet.is_server(): - print("test fleet is server") + self.assertTrue(fleet.is_server()) def test_util(self): role = role_maker.PaddleCloudRoleMaker(is_collective=True) fleet.init(role) - self.assertEqual(fleet.util, None) + self.assertEqual(fleet.util(), None) def test_barrier_worker(self): role = role_maker.PaddleCloudRoleMaker(is_collective=True) @@ -98,20 +118,17 @@ class TestFleetBase(unittest.TestCase): def test_init_worker(self): role = role_maker.PaddleCloudRoleMaker(is_collective=True) fleet.init(role) - if fleet.is_worker(): - fleet.init_worker() - def test_run_server(self): - role = role_maker.PaddleCloudRoleMaker(is_collective=True) - fleet.init(role) - if fleet.is_worker(): - fleet.run_worker() + with self.assertRaises(ValueError): + if fleet.is_worker(): + fleet.init_worker() def test_stop_worker(self): role = role_maker.PaddleCloudRoleMaker(is_collective=True) fleet.init(role) - if fleet.is_worker(): - fleet.stop_worker() + with self.assertRaises(ValueError): + if fleet.is_worker(): + fleet.stop_worker() def test_distributed_optimizer(self): role = role_maker.PaddleCloudRoleMaker(is_collective=True) @@ -125,5 +142,110 @@ class TestFleetBase(unittest.TestCase): self.assertRaises(Exception, fleet.init_worker) +class TestFleetDygraph(unittest.TestCase): + def setUp(self): + os.environ[ + "PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36213,127.0.0.1:36214" + os.environ["PADDLE_CURRENT_ENDPOINTS"] = "127.0.0.1:36213" + os.environ["PADDLE_TRAINERS_NUM"] = "2" + os.environ["PADDLE_TRAINER_ID"] = "0" + + def test_dygraph_method(self): + paddle.disable_static() + value = np.arange(26).reshape(2, 
13).astype("float32") + a = fluid.dygraph.to_variable(value) + layer = paddle.nn.Linear(13, 5) + adam = paddle.optimizer.Adam( + learning_rate=0.01, parameters=layer.parameters()) + # remove init cause this UT cannot launch distributed task + adam = fleet.distributed_optimizer(adam) + dp_layer = fleet.distributed_model(layer) + lr = 0.001 + adam.set_lr(lr) + cur_lr = adam.get_lr() + assert (lr == cur_lr) + state_dict = adam.state_dict() + adam.set_state_dict(state_dict) + + +class TestFleetBaseSingleRunCollective(unittest.TestCase): + def setUp(self): + os.environ.pop("PADDLE_TRAINER_ENDPOINTS") + + def gen_data(self): + return { + "x": np.random.random(size=(128, 32)).astype('float32'), + "y": np.random.randint( + 2, size=(128, 1)).astype('int64') + } + + def test_single_run_collective_minimize(self): + input_x = paddle.static.data(name="x", shape=[-1, 32], dtype='float32') + input_y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64') + + fc_1 = fluid.layers.fc(input=input_x, size=64, act='tanh') + prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') + cost = fluid.layers.cross_entropy(input=prediction, label=input_y) + avg_cost = paddle.mean(x=cost) + + fleet.init(is_collective=True) + optimizer = fluid.optimizer.SGD(learning_rate=0.001) + optimizer = fleet.distributed_optimizer(optimizer) + optimizer.minimize(avg_cost) + + place = fluid.CUDAPlace(0) if paddle.fluid.is_compiled_with_cuda( + ) else fluid.CPUPlace() + + exe = fluid.Executor(place) + exe.run(paddle.static.default_startup_program()) + + for i in range(10): + cost_val = exe.run(feed=self.gen_data(), fetch_list=[avg_cost.name]) + print("cost of step[{}] = {}".format(i, cost_val)) + + +class TestFleetBaseSingleRunPS(unittest.TestCase): + def setUp(self): + os.environ.pop("PADDLE_PSERVERS_IP_PORT_LIST") + + def gen_data(self): + return { + "x": np.random.random(size=(128, 32)).astype('float32'), + "y": np.random.randint( + 2, size=(128, 1)).astype('int64') + } + + def 
test_single_run_ps_minimize(self): + input_x = paddle.static.data(name="x", shape=[-1, 32], dtype='float32') + input_y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64') + + fc_1 = fluid.layers.fc(input=input_x, size=64, act='tanh') + prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') + cost = fluid.layers.cross_entropy(input=prediction, label=input_y) + avg_cost = paddle.mean(x=cost) + + fleet.init() + strategy = paddle.distributed.fleet.DistributedStrategy() + optimizer = fluid.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + if fleet.is_server(): + fleet.init_server() + fleet.run_server() + elif fleet.is_worker(): + place = fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(paddle.static.default_startup_program()) + step = 100 + for i in range(step): + cost_val = exe.run(program=fluid.default_main_program(), + feed=self.gen_data(), + fetch_list=[avg_cost.name]) + print("worker_index: %d, step%d cost = %f" % + (fleet.worker_index(), i, cost_val[0])) + fleet.save_persistables(exe, "fleet_single_model/") + print("save fleet models done.") + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_checkpoint.py b/python/paddle/fluid/tests/unittests/test_fleet_checkpoint.py index 66baf8faac51ebd19689a5b22b87f3a454842fac..fc57602b445ddd6aa615b47ecce6bf993703d858 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_checkpoint.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_checkpoint.py @@ -21,8 +21,7 @@ from paddle.fluid.incubate.checkpoint.checkpoint_saver import CheckpointSaver import os import sys -from paddle.fluid.incubate.fleet.utils.fs import LocalFS -from paddle.fluid.incubate.fleet.utils.hdfs import HDFSClient +from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient from paddle.fluid.incubate.checkpoint.checkpoint_saver import CheckpointSaver diff --git 
a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py index 8d715674cc6c9ba4f8b5c1ff4fe0cbdbe7841643..b20f33e11b656f1296510df653309a3569d45043 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py @@ -81,9 +81,17 @@ class TestStrategyConfig(unittest.TestCase): def test_localsgd_configs(self): strategy = paddle.distributed.fleet.DistributedStrategy() - configs = {"k_steps": 4} + configs = {"k_steps": 4, "begin_step": 120} strategy.localsgd_configs = configs self.assertEqual(strategy.localsgd_configs["k_steps"], 4) + self.assertEqual(strategy.localsgd_configs["begin_step"], 120) + + def test_adaptive_localsgd_configs(self): + strategy = paddle.distributed.fleet.DistributedStrategy() + configs = {"init_k_steps": 1, "begin_step": 120} + strategy.adaptive_localsgd_configs = configs + self.assertEqual(strategy.adaptive_localsgd_configs["init_k_steps"], 1) + self.assertEqual(strategy.adaptive_localsgd_configs["begin_step"], 120) def test_dgc(self): strategy = paddle.distributed.fleet.DistributedStrategy() @@ -230,7 +238,7 @@ class TestStrategyConfig(unittest.TestCase): strategy.a_sync = True strategy.localsgd = True strategy.dgc = True - localsgd_configs = {"k_steps": 5} + localsgd_configs = {"k_steps": 5, "begin_step": 1} strategy.localsgd_configs = localsgd_configs build_strategy = paddle.fluid.BuildStrategy() build_strategy.enable_sequential_execution = True @@ -316,6 +324,14 @@ class TestStrategyConfig(unittest.TestCase): self.assertEqual(strategy.conv_workspace_size_limit, 1000) strategy._enable_env() + def test_distributed_strategy_repr(self): + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.recompute = True + strategy.recompute_configs = {"checkpoints": ["a1", "a2", "a3"]} + strategy.amp = True + strategy.localsgd = True + print(str(strategy)) + if 
__name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py index 9eec73116cc283b58d3ee39cefb9256e12d4ef15..f06f1eaefaeb3ee56b849e062dd4e3b0b581d119 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_graph_execution_meta_optimizer.py @@ -17,6 +17,8 @@ import paddle import os from launch_function_helper import launch_func, wait, _find_free_port +paddle.enable_static() + class TestFleetGraphExecutionMetaOptimizer(unittest.TestCase): def setUp(self): @@ -190,7 +192,7 @@ class TestFleetGraphExecutionMetaOptimizer(unittest.TestCase): avg_cost = paddle.fluid.layers.mean(x=cost) strategy = paddle.distributed.fleet.DistributedStrategy() - optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer( optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py index 3f140f53b043b1949572f3728ca8a0c556317783..ec055178d90c529080489218f3aca1a71311beea 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_lamb_meta_optimizer.py @@ -22,11 +22,9 @@ import paddle.distributed.fleet.base.role_maker as role_maker class TestFleetLambMetaOptimizer(unittest.TestCase): def setUp(self): - os.environ["POD_IP"] = "127.0.0.1" - os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001" - os.environ["PADDLE_TRAINERS_NUM"] = "2" - os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \ - "127.0.0.1:36001,127.0.0.2:36001" + os.environ["PADDLE_TRAINER_ID"] = "1" + os.environ[ + "PADDLE_TRAINER_ENDPOINTS"] = 
"127.0.0.1:36001,127.0.0.1:36002" def net(self, main_prog, startup_prog): with fluid.program_guard(main_prog, startup_prog): @@ -97,13 +95,54 @@ class TestFleetLambMetaOptimizer(unittest.TestCase): optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) - ops_with_bias = [ + ops_without_wd = [ op for op in avg_cost.block.ops if op.type == 'lamb' and op.attr('op_role_var')[0].endswith('.b_0') ] - for op in ops_with_bias: + for op in ops_without_wd: self.assertEqual(op.attr('weight_decay'), 0) + def test_lamb_apply_with_amp(self): + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + input_x = paddle.fluid.layers.data( + name="x", shape=[32], dtype='float32') + input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64') + + fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') + fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh') + prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') + cost = paddle.fluid.layers.cross_entropy( + input=prediction, label=input_y) + avg_cost = paddle.fluid.layers.mean(x=cost) + + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.amp = True + strategy.amp_configs = { + "init_loss_scaling": 32768, + "decr_every_n_nan_or_inf": 2, + "incr_every_n_steps": 1000, + "incr_ratio": 2.0, + "use_dynamic_loss_scaling": True, + "decr_ratio": 0.5, + "custom_white_list": ['softmax'], + "custom_black_list": ['tanh'], + } + strategy.lamb = True + strategy.lamb_configs = { + 'lamb_weight_decay': 0.01, + 'exclude_from_weight_decay': [], + } + + optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + + ops = [op.type for op in avg_cost.block.ops] + self.assertIn('lamb', ops) + self.assertIn('cast', ops) + self.assertIn('check_finite_and_unscale', ops) + if __name__ == "__main__": unittest.main() diff --git 
a/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py index 3caa1a4eac0bf191b13e6708b1a9adffdb111ca7..0a70710b4590e253463640634615c2d11ff36e9f 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_lars_meta_optimizer.py @@ -22,11 +22,9 @@ import paddle.distributed.fleet.base.role_maker as role_maker class TestFleetLarsMetaOptimizer(unittest.TestCase): def setUp(self): - os.environ["POD_IP"] = "127.0.0.1" - os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001" - os.environ["PADDLE_TRAINERS_NUM"] = "2" - os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \ - "127.0.0.1:36001,127.0.0.2:36001" + os.environ["PADDLE_TRAINER_ID"] = "1" + os.environ[ + "PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001,127.0.0.1:36002" def net(self, main_prog, startup_prog): with fluid.program_guard(main_prog, startup_prog): @@ -52,6 +50,8 @@ class TestFleetLarsMetaOptimizer(unittest.TestCase): strategy.lars_configs = { "lars_coeff": 0.001, "lars_weight_decay": 0.0005, + "epsilon": 0, + "exclude_from_weight_decay": ["batch_norm", ".b"], } return avg_cost, strategy @@ -83,6 +83,70 @@ class TestFleetLarsMetaOptimizer(unittest.TestCase): ops = [op.type for op in avg_cost.block.ops] self.assertNotIn('lars_momentum', ops) + def test_lars_exclude_fn(self): + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + startup_prog = fluid.Program() + train_prog = fluid.Program() + avg_cost, strategy = self.net(train_prog, startup_prog) + optimizer = paddle.fluid.optimizer.Momentum( + learning_rate=0.01, momentum=0.9) + + optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + + ops_without_wd = [ + op for op in avg_cost.block.ops + if op.type == 'lars_momentum' and ("batch_norm" in op.attr( + 'op_role_var')[0] or ".b" in op.attr('op_role_var')[0]) + ] + for op in 
ops_without_wd: + self.assertEqual(op.attr('lars_weight_decay'), 0) + + def test_lars_apply_with_amp(self): + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + input_x = paddle.fluid.layers.data( + name="x", shape=[32], dtype='float32') + input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64') + + fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') + fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh') + prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') + cost = paddle.fluid.layers.cross_entropy( + input=prediction, label=input_y) + avg_cost = paddle.fluid.layers.mean(x=cost) + + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.amp = True + strategy.amp_configs = { + "init_loss_scaling": 32768, + "decr_every_n_nan_or_inf": 2, + "incr_every_n_steps": 1000, + "incr_ratio": 2.0, + "use_dynamic_loss_scaling": True, + "decr_ratio": 0.5, + "custom_white_list": ['softmax'], + "custom_black_list": ['tanh'], + } + strategy.lars = True + strategy.lars_configs = { + "lars_coeff": 0.001, + "lars_weight_decay": 0.0005, + "epsilon": 0, + "exclude_from_weight_decay": ["batch_norm", ".b"], + } + + optimizer = paddle.fluid.optimizer.Momentum( + learning_rate=0.01, momentum=0.9) + optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + + ops = [op.type for op in avg_cost.block.ops] + self.assertIn('lars_momentum', ops) + self.assertIn('cast', ops) + self.assertIn('check_finite_and_unscale', ops) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_launch.sh b/python/paddle/fluid/tests/unittests/test_fleet_launch.sh index c5edc96963408bf1fad793f7271d75159934f019..e717962ead2e2da30092b12379bf36f368e8a735 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_launch.sh +++ b/python/paddle/fluid/tests/unittests/test_fleet_launch.sh @@ -79,9 +79,9 @@ if [ -f $file_1 ]; then 
rm $file_1 fi - +# test use DISTRIBUTED_TRAINER_ENDPOINTS env in paddlecloud unset PADDLE_PORT -unset TRAINER_PORTS_NUM +export DISTRIBUTED_TRAINER_ENDPOINTS=127.0.0.1:6170,127.0.0.1:6171,127.0.0.2:6170,127.0.0.2:6171 echo "" echo "paddle.distributed.launch async poll process test" diff --git a/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py index 07b988bf8752057e68925bc42f564a72d466361d..f5347b0c665e2a162f7f8210171ec415afee4599 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_localsgd_meta_optimizer.py @@ -44,6 +44,7 @@ class TestFleetLocalSGDMetaOptimizer(unittest.TestCase): strategy.auto = True config = strategy.localsgd_configs config['k_steps'] = 1 + config['begin_step'] = 1 strategy.localsgd_configs = config optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) @@ -51,5 +52,36 @@ class TestFleetLocalSGDMetaOptimizer(unittest.TestCase): optimizer.minimize(avg_cost) +class TestFleetAdaptiveLocalSGDMetaOptimizer(unittest.TestCase): + def setUp(self): + os.environ["PADDLE_TRAINER_ID"] = "1" + os.environ[ + "PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001,127.0.0.1:36002" + + def test_adaptive_localsgd_optimizer(self): + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + input_x = paddle.fluid.layers.data( + name="x", shape=[32], dtype='float32') + input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64') + + fc = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') + prediction = paddle.fluid.layers.fc(input=[fc], size=2, act='softmax') + cost = paddle.fluid.layers.cross_entropy( + input=prediction, label=input_y) + avg_cost = paddle.fluid.layers.mean(x=cost) + + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.adaptive_localsgd = True + config = strategy.adaptive_localsgd_configs + 
config['init_k_steps'] = 1 + config['begin_step'] = 1 + strategy.adaptive_localsgd_configs = config + + optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_2.py b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_2.py index eb5d9eb66608dd397dad773158c337fc67be2dbb..dae7907161697107a50eaf1b1501881f74509b76 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_2.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_2.py @@ -87,7 +87,7 @@ class TestCloudRoleMaker2(unittest.TestCase): role2._all_gather(1) role2._all_gather(1) role2._barrier_server() - role2.all_gather(1) + role2._all_gather(1) role3 = GeneralRoleMaker(path="./test_gloo_3") role3._worker_gather(1) role3._worker_gather(1) @@ -163,10 +163,9 @@ class TestCloudRoleMaker2(unittest.TestCase): data = "1 1 1 1\n" f.write(data) - dataset = paddle.distributed.fleet.DatasetFactory().create_dataset( - "InMemoryDataset") + dataset = paddle.distributed.InMemoryDataset() dataset.set_filelist(["test_fleet_gloo_role_maker_1.txt"]) - dataset.set_use_var([show, label]) + dataset._set_use_var([show, label]) dataset.load_into_memory() dataset.get_memory_data_size(fleet) dataset.get_shuffle_data_size(fleet) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_4.py b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_4.py index 6414ef18d635aea4b73c17a4931c37e596ed6029..6cb40eef27e4d93606167232dba3fc181af3c17a 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_4.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_4.py @@ -40,9 +40,9 @@ class TestCloudRoleMaker(unittest.TestCase): from paddle.fluid.incubate.fleet.parameter_server.pslib import PSLib from paddle.fluid.incubate.fleet.base.role_maker import \ 
GeneralRoleMaker - from paddle.distributed.fleet.utils import KVHandler - from paddle.distributed.fleet.utils import KVServer - from paddle.distributed.fleet.utils import KVHTTPServer + from paddle.distributed.fleet.utils.http_server import KVHandler + from paddle.distributed.fleet.utils.http_server import KVServer + from paddle.distributed.fleet.utils.http_server import KVHTTPServer except: print("warning: no fleet, skip test_pslib_4") return diff --git a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py index cf9b3e1e9a1605a714b47d99183511b24c903722..4dd254af251ae955878f9846e0f0e06f65c3ec90 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py @@ -15,7 +15,11 @@ from __future__ import print_function import os +import platform +import shutil +import tempfile import unittest +import paddle import paddle.distributed.fleet.base.role_maker as role_maker @@ -26,25 +30,25 @@ class TestRoleMakerBase(unittest.TestCase): def test_rolemaker_base(self): role = role_maker.RoleMakerBase() - self.assertRaises(Exception, role.is_worker) - self.assertRaises(Exception, role.is_server) - self.assertRaises(Exception, role.is_first_worker) - self.assertRaises(Exception, role.worker_num) - self.assertRaises(Exception, role.server_num) - self.assertRaises(Exception, role.worker_index) - self.assertRaises(Exception, role.server_index) - self.assertRaises(Exception, role.role_id) - self.assertRaises(Exception, role.node_num) - - trainer_endpoints = role.get_trainer_endpoints() + self.assertRaises(Exception, role._is_worker) + self.assertRaises(Exception, role._is_server) + self.assertRaises(Exception, role._is_first_worker) + self.assertRaises(Exception, role._worker_num) + self.assertRaises(Exception, role._server_num) + self.assertRaises(Exception, role._worker_index) + self.assertRaises(Exception, 
role._server_index) + self.assertRaises(Exception, role._role_id) + self.assertRaises(Exception, role._node_num) + + trainer_endpoints = role._get_trainer_endpoints() self.assertTrue(len(trainer_endpoints) == 0) - pserver_endpoints = role.get_pserver_endpoints() + pserver_endpoints = role._get_pserver_endpoints() self.assertTrue(len(pserver_endpoints) == 0) print(role.to_string()) - self.assertTrue(role._all_gather(role._node_type_comm, 1) is None) - self.assertTrue(role._all_reduce(role._node_type_comm, 1) is None) - role._barrier(role._node_type_comm) + self.assertTrue(role._all_gather(1, "worker") is None) + self.assertTrue(role._all_reduce(1, "sum", "worker") is None) + role._barrier("worker") class TestCloudRoleMaker(unittest.TestCase): @@ -72,21 +76,33 @@ class TestCloudRoleMaker(unittest.TestCase): print("warning: no netifaces, skip test_tr_rolemaker") return - ro = role_maker.PaddleCloudRoleMaker( - is_collective=False, init_gloo=False) - self.assertTrue(ro.is_worker()) - self.assertFalse(ro.is_server()) - self.assertEqual(ro.worker_num(), 2) - self.assertTrue(ro.is_first_worker()) - worker_endpoints = ro.get_trainer_endpoints() + ro = role_maker.PaddleCloudRoleMaker(is_collective=False) + self.assertTrue(ro._is_worker()) + ro = role_maker.PaddleCloudRoleMaker(is_collective=False) + self.assertFalse(ro._is_server()) + ro = role_maker.PaddleCloudRoleMaker(is_collective=False) + self.assertEqual(ro._worker_num(), 2) + ro = role_maker.PaddleCloudRoleMaker(is_collective=False) + self.assertTrue(ro._is_first_worker()) + ro = role_maker.PaddleCloudRoleMaker(is_collective=False) + worker_endpoints = ro._get_trainer_endpoints() self.assertEqual(worker_endpoints[0], '127.0.0.1:36001') - self.assertEqual(ro.role_id(), 0) - self.assertEqual(ro.node_num(), 2) + ro = role_maker.PaddleCloudRoleMaker(is_collective=False) + self.assertEqual(ro._role_id(), 0) + ro = role_maker.PaddleCloudRoleMaker(is_collective=False) + self.assertEqual(ro._node_num(), 2) + ro = 
role_maker.PaddleCloudRoleMaker(is_collective=False) + self.assertFalse(ro._is_non_distributed()) + ro = role_maker.PaddleCloudRoleMaker(is_collective=False) + self.assertEqual(ro._heter_worker_num(), 0) + ro = role_maker.PaddleCloudRoleMaker(is_collective=False) + self.assertFalse(ro._is_heter_worker()) def test_tr_rolemaker_collective(self): ro = role_maker.PaddleCloudRoleMaker(is_collective=True) - self.assertEqual(ro.worker_num(), 2) - self.assertEqual(ro.node_num(), 2) + self.assertEqual(ro._worker_num(), 2) + ro = role_maker.PaddleCloudRoleMaker(is_collective=True) + self.assertEqual(ro._node_num(), 2) def test_ps_rolemaker(self): """Test ps rolemaker.""" @@ -102,14 +118,15 @@ class TestCloudRoleMaker(unittest.TestCase): ro = role_maker.PaddleCloudRoleMaker( is_collective=False, init_gloo=False) - self.assertEqual(ro.server_index(), 0) - self.assertFalse(ro.is_worker()) - self.assertTrue(ro.is_server()) - self.assertEqual(ro.server_num(), 2) - pserver_endpoints = ro.get_pserver_endpoints() + self.assertEqual(ro._server_index(), 0) + self.assertFalse(ro._is_worker()) + self.assertTrue(ro._is_server()) + self.assertEqual(ro._server_num(), 2) + pserver_endpoints = ro._get_pserver_endpoints() self.assertEqual(pserver_endpoints[0], '127.0.0.1:36001') - self.assertTrue(ro._all_gather(ro._all_comm, 1) is None) - self.assertTrue(ro._all_reduce(ro._all_comm, 1) is None) + + self.assertEqual(ro._all_gather(1, "worker"), 1) + self.assertEqual(ro._all_reduce(1, "sum", "worker"), 1) def test_traing_role(self): """Test training role.""" @@ -121,7 +138,7 @@ class TestCloudRoleMaker(unittest.TestCase): return ro = role_maker.PaddleCloudRoleMaker(is_collective=False) - self.assertRaises(ValueError, ro.generate_role) + self.assertRaises(ValueError, ro._generate_role) class TestUserDefinedRoleMaker(unittest.TestCase): @@ -142,14 +159,14 @@ class TestUserDefinedRoleMaker(unittest.TestCase): ro = role_maker.UserDefinedRoleMaker( is_collective=False, init_gloo=False, - 
server_endpoints="127.0.0.1:36001,127.0.0.1:36001", + server_endpoints=["127.0.0.1:36001", "127.0.0.1:36001"], role=role_maker.Role.SERVER, current_id=0, worker_num=2) - self.assertEqual(ro.server_num(), 2) - ro.generate_role() - self.assertTrue(ro.is_server()) - self.assertEqual(ro.role_id(), 0) + self.assertEqual(ro._server_num(), 2) + ro._generate_role() + self.assertTrue(ro._is_server()) + self.assertEqual(ro._role_id(), 0) def test_tr_rolemaker(self): try: @@ -161,13 +178,589 @@ class TestUserDefinedRoleMaker(unittest.TestCase): ro = role_maker.UserDefinedRoleMaker( is_collective=False, init_gloo=False, - server_endpoints="127.0.0.1:36001,127.0.0.1:36001", + server_endpoints=["127.0.0.1:36001", "127.0.0.1:36001"], role=role_maker.Role.WORKER, current_id=0, worker_num=2) - self.assertIn("127.0.0.1:36001", ro.get_pserver_endpoints()) - self.assertTrue(ro.is_worker()) - self.assertEqual(ro.role_id(), 0) + + self.assertIn("127.0.0.1:36001", ro._get_pserver_endpoints()) + self.assertTrue(ro._is_worker()) + self.assertEqual(ro._role_id(), 0) + + +class TestGlooWithCloudRoleMaker(unittest.TestCase): + def setUp(self): + os.environ["PADDLE_TRAINERS_NUM"] = "1" + os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001" + os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001" + os.environ["POD_IP"] = "127.0.0.1" + os.environ["PADDLE_TRAINER_ID"] = "0" + + def case(self, role, comm_world): + role._barrier(comm_world) + + gather = role._all_gather(1, comm_world) + self.assertEqual(gather[0], 1) + + all_reduce = role._all_reduce(1, "sum", comm_world) + self.assertEqual(1, all_reduce) + + def mkdir(self): + tmp = tempfile.mkdtemp() + return tmp + + def clean(self, tmp): + shutil.rmtree(tmp) + + def test_hdfs_gloo(self): + plats = platform.platform() + if 'Linux' not in plats: + print("skip gloo UT on MacOS/Win") + return + + tmp = self.mkdir() + os.environ["TRAINING_ROLE"] = "TRAINER" + os.environ["SYS_JOB_ID"] = "gloo_for_cluster" + os.environ["PADDLE_WITH_GLOO"] 
= "1" + os.environ["PADDLE_GLOO_RENDEZVOUS"] = "1" + os.environ["PADDLE_GLOO_FS_NAME"] = "NULL" + os.environ["PADDLE_GLOO_FS_UGI"] = "NULL" + os.environ["PADDLE_GLOO_FS_PATH"] = tmp + + role = role_maker.PaddleCloudRoleMaker() + role._generate_role() + self.case(role, "worker") + self.clean(tmp) + + def test_fs_gloo(self): + plats = platform.platform() + if 'Linux' not in plats: + print("skip gloo UT on MacOS/Win") + return + + tmp = self.mkdir() + os.environ["TRAINING_ROLE"] = "TRAINER" + os.environ["SYS_JOB_ID"] = "gloo_for_cluster" + os.environ["PADDLE_WITH_GLOO"] = "1" + os.environ["PADDLE_GLOO_RENDEZVOUS"] = "2" + os.environ["PADDLE_GLOO_FS_PATH"] = tmp + + role = role_maker.PaddleCloudRoleMaker() + role._generate_role() + self.case(role, "worker") + self.clean(tmp) + + def test_fs_gloo2(self): + plats = platform.platform() + if 'Linux' not in plats: + print("skip gloo UT on MacOS/Win") + return + + tmp = self.mkdir() + os.environ["TRAINING_ROLE"] = "PSERVER" + os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001" + os.environ["POD_IP"] = "127.0.0.1" + os.environ["PADDLE_PORT"] = "36001" + + os.environ["SYS_JOB_ID"] = "gloo_for_cluster" + os.environ["PADDLE_WITH_GLOO"] = "1" + os.environ["PADDLE_GLOO_RENDEZVOUS"] = "2" + os.environ["PADDLE_GLOO_FS_PATH"] = tmp + + role = role_maker.PaddleCloudRoleMaker() + role._generate_role() + self.case(role, "server") + self.clean(tmp) + + def test_fs_gloo3(self): + plats = platform.platform() + if 'Linux' not in plats: + print("skip gloo UT on MacOS/Win") + return + + tmp = self.mkdir() + os.environ["TRAINING_ROLE"] = "PSERVER" + os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001" + os.environ["POD_IP"] = "127.0.0.1" + os.environ["PADDLE_PORT"] = "36001" + + os.environ["SYS_JOB_ID"] = "gloo_for_cluster" + os.environ["PADDLE_WITH_GLOO"] = "1" + os.environ["PADDLE_GLOO_RENDEZVOUS"] = "1" + os.environ["PADDLE_GLOO_FS_NAME"] = "NULL" + os.environ["PADDLE_GLOO_FS_UGI"] = "NULL" + 
os.environ["PADDLE_GLOO_FS_PATH"] = tmp + + role = role_maker.PaddleCloudRoleMaker() + role._generate_role() + self.case(role, "server") + self.clean(tmp) + + def test_fs_gloo4(self): + plats = platform.platform() + if 'Linux' not in plats: + print("skip gloo UT on MacOS/Win") + return + + os.environ["TRAINING_ROLE"] = "PSERVER" + os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001" + os.environ["POD_IP"] = "127.0.0.1" + os.environ["PADDLE_PORT"] = "36001" + + os.environ["SYS_JOB_ID"] = "gloo_for_cluster" + os.environ["PADDLE_WITH_GLOO"] = "1" + os.environ["PADDLE_GLOO_RENDEZVOUS"] = "3" + os.environ["PADDLE_GLOO_HTTP_HOST"] = "127.0.0.1" + os.environ["PADDLE_GLOO_HTTP_PORT"] = "30019" + + role = role_maker.PaddleCloudRoleMaker() + role._generate_role() + import time + time.sleep(3) + + def test_fs_gloo5(self): + plats = platform.platform() + if 'Linux' not in plats: + print("skip gloo UT on MacOS/Win") + return + + tmp = self.mkdir() + + os.environ["TRAINING_ROLE"] = "PSERVER" + os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001" + os.environ["POD_IP"] = "127.0.0.1" + os.environ["PADDLE_PORT"] = "36001" + os.environ["PADDLE_TRAINERS_NUM"] = "0" + + os.environ["SYS_JOB_ID"] = "gloo_for_cluster" + os.environ["PADDLE_WITH_GLOO"] = "2" + os.environ["PADDLE_GLOO_RENDEZVOUS"] = "2" + os.environ["PADDLE_GLOO_FS_PATH"] = tmp + + role = role_maker.PaddleCloudRoleMaker() + role._generate_role() + self.case(role, "server") + self.case(role, "all") + self.clean(tmp) + + def test_fs_gloo6(self): + plats = platform.platform() + if 'Linux' not in plats: + print("skip gloo UT on MacOS/Win") + return + + tmp = self.mkdir() + + os.environ["TRAINING_ROLE"] = "PSERVER" + os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001" + os.environ["POD_IP"] = "127.0.0.1" + os.environ["PADDLE_PORT"] = "36001" + os.environ["PADDLE_TRAINERS_NUM"] = "0" + + os.environ["SYS_JOB_ID"] = "gloo_for_cluster" + + os.environ["PADDLE_WITH_GLOO"] = "2" + 
os.environ["PADDLE_GLOO_RENDEZVOUS"] = "1" + os.environ["PADDLE_GLOO_FS_NAME"] = "NULL" + os.environ["PADDLE_GLOO_FS_UGI"] = "NULL" + os.environ["PADDLE_GLOO_FS_PATH"] = tmp + + role = role_maker.PaddleCloudRoleMaker() + role._generate_role() + self.case(role, "server") + self.case(role, "all") + self.clean(tmp) + + def test_fs_gloo7(self): + plats = platform.platform() + if 'Linux' not in plats: + print("skip gloo UT on MacOS/Win") + return + + os.environ["TRAINING_ROLE"] = "PSERVER" + os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001" + os.environ["POD_IP"] = "127.0.0.1" + os.environ["PADDLE_PORT"] = "36001" + os.environ["PADDLE_TRAINERS_NUM"] = "0" + + os.environ["SYS_JOB_ID"] = "gloo_for_cluster" + + os.environ["PADDLE_WITH_GLOO"] = "1" + os.environ["PADDLE_GLOO_RENDEZVOUS"] = "5" + + role = role_maker.PaddleCloudRoleMaker() + self.assertRaises(ValueError, role._generate_role) + + def test_fs_gloo8(self): + plats = platform.platform() + if 'Linux' not in plats: + print("skip gloo UT on MacOS/Win") + return + + tmp = self.mkdir() + + os.environ["TRAINING_ROLE"] = "PSERVER" + os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001" + os.environ["POD_IP"] = "127.0.0.1" + os.environ["PADDLE_PORT"] = "36001" + os.environ["PADDLE_TRAINERS_NUM"] = "0" + + os.environ["SYS_JOB_ID"] = "gloo_for_cluster" + + os.environ["PADDLE_WITH_GLOO"] = "2" + os.environ["PADDLE_GLOO_RENDEZVOUS"] = "1" + os.environ["PADDLE_GLOO_FS_NAME"] = "NULL" + os.environ["PADDLE_GLOO_FS_UGI"] = "NULL" + os.environ["PADDLE_GLOO_FS_PATH"] = tmp + + def net(): + x = paddle.fluid.layers.data(name='x', shape=[13], dtype='float32') + y_predict = paddle.fluid.layers.fc(input=x, size=1, act=None) + y = paddle.fluid.layers.data(name='y', shape=[1], dtype='float32') + cost = paddle.fluid.layers.square_error_cost( + input=y_predict, label=y) + avg_cost = paddle.fluid.layers.mean(cost) + return avg_cost + + from paddle.distributed import fleet + + role = role_maker.PaddleCloudRoleMaker() + 
fleet.init(role) + avg_cost = net() + + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.a_sync = False + + optimizer = paddle.optimizer.SGD(0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy) + optimizer.minimize(avg_cost) + + comm_world = "server" + fleet.util().barrier(comm_world) + + gather = fleet.util().all_gather(1, comm_world) + self.assertEqual(gather[0], 1) + + all_reduce = fleet.util().all_reduce(1, "sum", comm_world) + self.assertEqual(1, all_reduce) + + self.clean(tmp) + + +class TestGlooWithCloudRoleMaker(unittest.TestCase): + def setUp(self): + os.environ["PADDLE_TRAINERS_NUM"] = "1" + os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001" + os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001" + os.environ["POD_IP"] = "127.0.0.1" + os.environ["PADDLE_TRAINER_ID"] = "0" + + def case(self, role, comm_world): + role._barrier(comm_world) + + gather = role._all_gather(1, comm_world) + self.assertEqual(gather[0], 1) + + all_reduce = role._all_reduce(1, "sum", comm_world) + self.assertEqual(1, all_reduce) + + def mkdir(self): + tmp = tempfile.mkdtemp() + return tmp + + def clean(self, tmp): + shutil.rmtree(tmp) + + def test_hdfs_gloo(self): + plats = platform.platform() + if 'Linux' not in plats: + print("skip gloo UT on MacOS/Win") + return + + tmp = self.mkdir() + os.environ["TRAINING_ROLE"] = "TRAINER" + os.environ["SYS_JOB_ID"] = "gloo_for_cluster" + os.environ["PADDLE_WITH_GLOO"] = "1" + os.environ["PADDLE_GLOO_RENDEZVOUS"] = "1" + os.environ["PADDLE_GLOO_FS_NAME"] = "NULL" + os.environ["PADDLE_GLOO_FS_UGI"] = "NULL" + os.environ["PADDLE_GLOO_FS_PATH"] = tmp + + role = role_maker.PaddleCloudRoleMaker() + role._generate_role() + self.case(role, "worker") + self.clean(tmp) + + def test_fs_gloo(self): + plats = platform.platform() + if 'Linux' not in plats: + print("skip gloo UT on MacOS/Win") + return + + tmp = self.mkdir() + os.environ["TRAINING_ROLE"] = "TRAINER" + os.environ["SYS_JOB_ID"] = 
"gloo_for_cluster" + os.environ["PADDLE_WITH_GLOO"] = "1" + os.environ["PADDLE_GLOO_RENDEZVOUS"] = "2" + os.environ["PADDLE_GLOO_FS_PATH"] = tmp + + role = role_maker.PaddleCloudRoleMaker() + role._generate_role() + self.case(role, "worker") + self.clean(tmp) + + def test_fs_gloo2(self): + plats = platform.platform() + if 'Linux' not in plats: + print("skip gloo UT on MacOS/Win") + return + + tmp = self.mkdir() + os.environ["TRAINING_ROLE"] = "PSERVER" + os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001" + os.environ["POD_IP"] = "127.0.0.1" + os.environ["PADDLE_PORT"] = "36001" + + os.environ["SYS_JOB_ID"] = "gloo_for_cluster" + os.environ["PADDLE_WITH_GLOO"] = "1" + os.environ["PADDLE_GLOO_RENDEZVOUS"] = "2" + os.environ["PADDLE_GLOO_FS_PATH"] = tmp + + role = role_maker.PaddleCloudRoleMaker() + role._generate_role() + self.case(role, "server") + self.clean(tmp) + + def test_fs_gloo3(self): + plats = platform.platform() + if 'Linux' not in plats: + print("skip gloo UT on MacOS/Win") + return + + tmp = self.mkdir() + os.environ["TRAINING_ROLE"] = "PSERVER" + os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001" + os.environ["POD_IP"] = "127.0.0.1" + os.environ["PADDLE_PORT"] = "36001" + + os.environ["SYS_JOB_ID"] = "gloo_for_cluster" + os.environ["PADDLE_WITH_GLOO"] = "1" + os.environ["PADDLE_GLOO_RENDEZVOUS"] = "1" + os.environ["PADDLE_GLOO_FS_NAME"] = "NULL" + os.environ["PADDLE_GLOO_FS_UGI"] = "NULL" + os.environ["PADDLE_GLOO_FS_PATH"] = tmp + + role = role_maker.PaddleCloudRoleMaker() + role._generate_role() + self.case(role, "server") + self.clean(tmp) + + def test_fs_gloo4(self): + plats = platform.platform() + if 'Linux' not in plats: + print("skip gloo UT on MacOS/Win") + return + + os.environ["TRAINING_ROLE"] = "PSERVER" + os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001" + os.environ["POD_IP"] = "127.0.0.1" + os.environ["PADDLE_PORT"] = "36001" + + os.environ["SYS_JOB_ID"] = "gloo_for_cluster" + 
os.environ["PADDLE_WITH_GLOO"] = "1" + os.environ["PADDLE_GLOO_RENDEZVOUS"] = "3" + os.environ["PADDLE_GLOO_HTTP_HOST"] = "127.0.0.1" + os.environ["PADDLE_GLOO_HTTP_PORT"] = "30019" + + role = role_maker.PaddleCloudRoleMaker() + role._generate_role() + import time + time.sleep(3) + + def test_fs_gloo5(self): + plats = platform.platform() + if 'Linux' not in plats: + print("skip gloo UT on MacOS/Win") + return + + tmp = self.mkdir() + + os.environ["TRAINING_ROLE"] = "PSERVER" + os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001" + os.environ["POD_IP"] = "127.0.0.1" + os.environ["PADDLE_PORT"] = "36001" + os.environ["PADDLE_TRAINERS_NUM"] = "0" + + os.environ["SYS_JOB_ID"] = "gloo_for_cluster" + os.environ["PADDLE_WITH_GLOO"] = "2" + os.environ["PADDLE_GLOO_RENDEZVOUS"] = "2" + os.environ["PADDLE_GLOO_FS_PATH"] = tmp + + role = role_maker.PaddleCloudRoleMaker() + role._generate_role() + self.case(role, "server") + self.case(role, "all") + self.clean(tmp) + + def test_fs_gloo6(self): + plats = platform.platform() + if 'Linux' not in plats: + print("skip gloo UT on MacOS/Win") + return + + tmp = self.mkdir() + + os.environ["TRAINING_ROLE"] = "PSERVER" + os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001" + os.environ["POD_IP"] = "127.0.0.1" + os.environ["PADDLE_PORT"] = "36001" + os.environ["PADDLE_TRAINERS_NUM"] = "0" + + os.environ["SYS_JOB_ID"] = "gloo_for_cluster" + + os.environ["PADDLE_WITH_GLOO"] = "2" + os.environ["PADDLE_GLOO_RENDEZVOUS"] = "1" + os.environ["PADDLE_GLOO_FS_NAME"] = "NULL" + os.environ["PADDLE_GLOO_FS_UGI"] = "NULL" + os.environ["PADDLE_GLOO_FS_PATH"] = tmp + + role = role_maker.PaddleCloudRoleMaker() + role._generate_role() + self.case(role, "server") + self.case(role, "all") + self.clean(tmp) + + def test_fs_gloo7(self): + plats = platform.platform() + if 'Linux' not in plats: + print("skip gloo UT on MacOS/Win") + return + + os.environ["TRAINING_ROLE"] = "PSERVER" + os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = 
"127.0.0.1:36001" + os.environ["POD_IP"] = "127.0.0.1" + os.environ["PADDLE_PORT"] = "36001" + os.environ["PADDLE_TRAINERS_NUM"] = "0" + + os.environ["SYS_JOB_ID"] = "gloo_for_cluster" + + os.environ["PADDLE_WITH_GLOO"] = "1" + os.environ["PADDLE_GLOO_RENDEZVOUS"] = "5" + + role = role_maker.PaddleCloudRoleMaker() + self.assertRaises(ValueError, role._generate_role) + + def test_hdfs_gloo_v2(self): + plats = platform.platform() + if 'Linux' not in plats: + print("skip gloo UT on MacOS/Win") + return + + os.environ["TRAINING_ROLE"] = "TRAINER" + os.environ["SYS_JOB_ID"] = "gloo_for_cluster" + os.environ["PADDLE_WITH_GLOO"] = "1" + os.environ["PADDLE_GLOO_RENDEZVOUS"] = "1" + os.environ["PADDLE_GLOO_FS_NAME"] = "" + os.environ["PADDLE_GLOO_FS_UGI"] = "" + os.environ["PADDLE_GLOO_FS_PATH"] = "" + + role = role_maker.PaddleCloudRoleMaker() + self.assertRaises(ValueError, role._generate_role) + + def test_fs_gloo_v2(self): + plats = platform.platform() + if 'Linux' not in plats: + print("skip gloo UT on MacOS/Win") + return + + os.environ["TRAINING_ROLE"] = "PSERVER" + os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001" + os.environ["POD_IP"] = "127.0.0.1" + os.environ["PADDLE_PORT"] = "36001" + os.environ["PADDLE_TRAINERS_NUM"] = "0" + + os.environ["SYS_JOB_ID"] = "gloo_for_cluster" + os.environ["PADDLE_WITH_GLOO"] = "1" + os.environ["PADDLE_GLOO_RENDEZVOUS"] = "2" + os.environ["PADDLE_GLOO_FS_PATH"] = "" + + role = role_maker.PaddleCloudRoleMaker() + self.assertRaises(ValueError, role._generate_role) + + def test_http_gloo_v2(self): + plats = platform.platform() + if 'Linux' not in plats: + print("skip gloo UT on MacOS/Win") + return + + os.environ["TRAINING_ROLE"] = "PSERVER" + os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001" + os.environ["POD_IP"] = "127.0.0.1" + os.environ["PADDLE_PORT"] = "36001" + + os.environ["SYS_JOB_ID"] = "gloo_for_cluster" + os.environ["PADDLE_WITH_GLOO"] = "1" + os.environ["PADDLE_GLOO_RENDEZVOUS"] = "3" + 
os.environ["PADDLE_GLOO_HTTP_HOST"] = "" + os.environ["PADDLE_GLOO_HTTP_PORT"] = "" + + role = role_maker.PaddleCloudRoleMaker() + self.assertRaises(ValueError, role._generate_role) + + def test_fs_gloo8(self): + plats = platform.platform() + if 'Linux' not in plats: + print("skip gloo UT on MacOS/Win") + return + + tmp = self.mkdir() + + os.environ["TRAINING_ROLE"] = "PSERVER" + os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001" + os.environ["POD_IP"] = "127.0.0.1" + os.environ["PADDLE_PORT"] = "36001" + os.environ["PADDLE_TRAINERS_NUM"] = "0" + + os.environ["SYS_JOB_ID"] = "gloo_for_cluster" + + os.environ["PADDLE_WITH_GLOO"] = "2" + os.environ["PADDLE_GLOO_RENDEZVOUS"] = "1" + os.environ["PADDLE_GLOO_FS_NAME"] = "NULL" + os.environ["PADDLE_GLOO_FS_UGI"] = "NULL" + os.environ["PADDLE_GLOO_FS_PATH"] = tmp + + def net(): + x = paddle.fluid.layers.data(name='x', shape=[13], dtype='float32') + y_predict = paddle.fluid.layers.fc(input=x, size=1, act=None) + y = paddle.fluid.layers.data(name='y', shape=[1], dtype='float32') + cost = paddle.fluid.layers.square_error_cost( + input=y_predict, label=y) + avg_cost = paddle.fluid.layers.mean(cost) + return avg_cost + + from paddle.distributed import fleet + + role = role_maker.PaddleCloudRoleMaker() + fleet.init(role) + avg_cost = net() + + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.a_sync = False + + optimizer = paddle.optimizer.SGD(0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy) + optimizer.minimize(avg_cost) + + comm_world = "server" + fleet.util().barrier(comm_world) + + gather = fleet.util().all_gather(1, comm_world) + self.assertEqual(gather[0], 1) + + all_reduce = fleet.util().all_reduce(1, "sum", comm_world) + self.assertEqual(1, all_reduce) + + self.clean(tmp) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_fleet_util.py b/python/paddle/fluid/tests/unittests/test_fleet_util.py index 
dde36e073fb20eed3b17c79a886739f59ecb185d..1570912e7406f930212eead64305e1e35e1b8ac0 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_util.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_util.py @@ -59,7 +59,7 @@ class TestFleetUtil(unittest.TestCase): import paddle.distributed.fleet.base.role_maker as role_maker role = role_maker.PaddleCloudRoleMaker(is_collective=True) fleet.init(role) - default_util = fleet.util + default_util = fleet.util() self.assertEqual(default_util, None) def test_set_user_defined_util(self): @@ -76,108 +76,17 @@ class TestFleetUtil(unittest.TestCase): role = role_maker.PaddleCloudRoleMaker(is_collective=True) fleet.init(role) my_util = UserDefinedUtil() - fleet.util = my_util - user_id = fleet.util.get_user_id() + fleet.set_util(my_util) + user_id = fleet.util().get_user_id() self.assertEqual(user_id, 10) def test_fs(self): - from paddle.distributed.fleet.utils import LocalFS + from paddle.distributed.fleet.utils.fs import LocalFS fs = LocalFS() dirs, files = fs.ls_dir("test_tmp") dirs, files = fs.ls_dir("./") self.assertFalse(fs.need_upload_download()) - fleet_util.set_file_system(fs) - - def test_barrier(self): - try: - import netifaces - except: - print("warning: no netifaces, skip test_barrier") - return - - gloo = fluid.core.Gloo() - gloo.set_rank(0) - gloo.set_size(1) - gloo.set_prefix("123") - gloo.set_iface("lo") - gloo.set_hdfs_store("./tmp_test_fleet_barrier", "", "") - gloo.init() - - role = role_maker.UserDefinedRoleMaker( - is_collective=False, - init_gloo=False, - current_id=0, - role=role_maker.Role.SERVER, - worker_endpoints=["127.0.0.1:6003"], - server_endpoints=["127.0.0.1:6001"]) - role._node_type_comm = gloo - role._role_is_generated = True - fleet_util._set_role_maker(role) - - fleet_util.barrier("worker") - - def test_all_reduce(self): - try: - import netifaces - except: - print("warning: no netifaces, skip test_all_reduce") - return - - gloo = fluid.core.Gloo() - gloo.set_rank(0) - gloo.set_size(1) 
- gloo.set_prefix("123") - gloo.set_iface("lo") - gloo.set_hdfs_store("./tmp_test_fleet_reduce", "", "") - gloo.init() - - role = role_maker.UserDefinedRoleMaker( - is_collective=False, - init_gloo=False, - current_id=0, - role=role_maker.Role.WORKER, - worker_endpoints=["127.0.0.1:6003"], - server_endpoints=["127.0.0.1:6001"]) - role._node_type_comm = gloo - role._role_is_generated = True - fleet_util._set_role_maker(role) - - output = fleet_util.all_reduce(1, "sum", comm_world="server") - print(output) - - # self.assertEqual(output, 1) - - def test_all_gather(self): - try: - import netifaces - except: - print("warning: no netifaces, skip test_all_gather") - return - - gloo = fluid.core.Gloo() - gloo.set_rank(0) - gloo.set_size(1) - gloo.set_prefix("123") - gloo.set_iface("lo") - gloo.set_hdfs_store("./tmp_test_fleet_reduce", "", "") - gloo.init() - - role = role_maker.UserDefinedRoleMaker( - is_collective=False, - init_gloo=False, - current_id=0, - role=role_maker.Role.SERVER, - worker_endpoints=["127.0.0.1:6003"], - server_endpoints=["127.0.0.1:6001"]) - role._node_type_comm = gloo - role._all_comm = gloo - role._role_is_generated = True - fleet_util._set_role_maker(role) - - output = fleet_util.all_gather(1, comm_world="all") - print(output) - # self.assertTrue(len(output) == 1 and output[0] == 1) - self.assertRaises(Exception, fleet_util.all_gather, 1, "test") + fleet_util._set_file_system(fs) def download_files(self): path = download(self.proto_data_url, self.module_name, diff --git a/python/paddle/fluid/tests/unittests/test_fs_interface.py b/python/paddle/fluid/tests/unittests/test_fs_interface.py index c01876531c99c610706265ff646d93c4a197a26e..581fa9738116dc5c737fd3264528b991e210888b 100644 --- a/python/paddle/fluid/tests/unittests/test_fs_interface.py +++ b/python/paddle/fluid/tests/unittests/test_fs_interface.py @@ -20,7 +20,7 @@ import os import sys import inspect -from paddle.distributed.fleet.utils import LocalFS, FS, HDFSClient, FSTimeOut, 
FSFileExistsError, FSFileNotExistsError +from paddle.distributed.fleet.utils.fs import LocalFS, FS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError class FSTest(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py index 921dbdbc6d4e1b169c2c8aa199ea15f886bd0128..5bcfc8720ddd2a8b495c50f886c03047c9abdb32 100644 --- a/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py +++ b/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py @@ -25,7 +25,7 @@ class TestFuseBatchNormActPass(unittest.TestCase): hidden1 = fluid.layers.conv2d( input=x, filter_size=3, - num_filters=32, + num_filters=16, stride=1, padding=1, act=None, @@ -43,7 +43,7 @@ class TestFuseBatchNormActPass(unittest.TestCase): bias_attr=bias_attr, act='relu', data_layout='NHWC') - hidden3 = fluid.layers.fc(input=hidden2, size=128, act='relu') + hidden3 = fluid.layers.fc(input=hidden2, size=32, act='relu') hidden4 = fluid.layers.batch_norm( input=hidden3, act='relu', data_layout='NHWC') prediction = fluid.layers.fc(input=hidden4, size=10, act='softmax') @@ -63,7 +63,7 @@ class TestFuseBatchNormActPass(unittest.TestCase): startup_program = fluid.Program() x, y, loss = self.build_program(main_program, startup_program, use_cuda) exe = fluid.Executor(place) - iters = 10 + iters = 8 batch_size = 16 feeder = fluid.DataFeeder(feed_list=[x, y], place=place) diff --git a/python/paddle/fluid/tests/unittests/test_fused_bn_add_act.py b/python/paddle/fluid/tests/unittests/test_fused_bn_add_act.py new file mode 100644 index 0000000000000000000000000000000000000000..1bc305cd1f4dcd3faaaf8ccbe813bdf08e966d6e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fused_bn_add_act.py @@ -0,0 +1,215 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest +import paddle +import paddle.fluid as fluid +from paddle.fluid import core + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "Paddle core is not compiled with CUDA") +class TestFusedBnAddActAPI(unittest.TestCase): + def setUp(self): + self.conv_param_attr1 = fluid.ParamAttr( + name='conv2d_1.weight', + initializer=fluid.initializer.Xavier(uniform=False), + learning_rate=0.001) + self.conv_param_attr2 = fluid.ParamAttr( + name='conv2d_2.weight', + initializer=fluid.initializer.Xavier(uniform=False), + learning_rate=0.001) + self.bn_param_attr1 = fluid.ParamAttr( + name='batch_norm_w_1', + initializer=fluid.initializer.Constant(value=1.0)) + self.bn_bias_attr1 = fluid.ParamAttr( + name='batch_norm_b_1', + initializer=fluid.initializer.Constant(value=0.0)) + self.bn_param_attr2 = fluid.ParamAttr( + name='batch_norm_w_2', + initializer=fluid.initializer.Constant(value=1.0)) + self.bn_bias_attr2 = fluid.ParamAttr( + name='batch_norm_b_2', + initializer=fluid.initializer.Constant(value=0.0)) + self.fc_param_attr = fluid.ParamAttr( + name='fc.weight', + initializer=fluid.initializer.Xavier(uniform=False)) + + def build_fused_program(self, + main_program, + startup_program, + use_cuda, + seed=1): + with fluid.program_guard(main_program, startup_program): + x = fluid.layers.data(name='x', shape=[1, 28, 28], 
dtype='float32') + y = fluid.layers.data(name="y", shape=[1], dtype='int64') + conv1_1 = fluid.layers.conv2d( + input=x, + filter_size=3, + num_filters=32, + stride=1, + padding=1, + act=None, + param_attr=self.conv_param_attr1, + bias_attr=False, + data_format='NHWC') + conv1_2 = fluid.layers.conv2d( + input=x, + filter_size=3, + num_filters=32, + stride=1, + padding=1, + act=None, + param_attr=self.conv_param_attr2, + bias_attr=False, + data_format='NHWC') + bn = fluid.layers.batch_norm( + input=conv1_1, + param_attr=self.bn_param_attr1, + bias_attr=self.bn_bias_attr1, + act=None, + data_layout='NHWC') + fused_bn_add_act = fluid.contrib.layers.fused_bn_add_act( + conv1_2, + bn, + param_attr=self.bn_param_attr2, + bias_attr=self.bn_bias_attr2) + prediction = fluid.layers.fc(input=fused_bn_add_act, + size=10, + act='softmax', + param_attr=self.fc_param_attr) + loss = fluid.layers.cross_entropy(input=prediction, label=y) + loss = fluid.layers.mean(loss) + sgd = fluid.optimizer.SGD(learning_rate=0.001) + sgd = fluid.contrib.mixed_precision.decorate( + sgd, use_dynamic_loss_scaling=True, init_loss_scaling=128.0) + sgd.minimize(loss) + + return x, y, loss + + def build_origin_program(self, + main_program, + startup_program, + use_cuda, + seed=1): + with fluid.program_guard(main_program, startup_program): + x = fluid.layers.data(name='x', shape=[1, 28, 28], dtype='float32') + y = fluid.layers.data(name="y", shape=[1], dtype='int64') + conv1_1 = fluid.layers.conv2d( + input=x, + filter_size=3, + num_filters=32, + stride=1, + padding=1, + act=None, + param_attr=self.conv_param_attr1, + bias_attr=False, + data_format='NHWC') + conv1_2 = fluid.layers.conv2d( + input=x, + filter_size=3, + num_filters=32, + stride=1, + padding=1, + act=None, + param_attr=self.conv_param_attr2, + bias_attr=False, + data_format='NHWC') + bn1 = fluid.layers.batch_norm( + input=conv1_1, + param_attr=self.bn_param_attr1, + bias_attr=self.bn_bias_attr1, + act=None, + data_layout='NHWC') + bn2 = 
fluid.layers.batch_norm( + input=conv1_2, + param_attr=self.bn_param_attr2, + bias_attr=self.bn_bias_attr2, + act=None, + data_layout='NHWC') + out = bn1 + bn2 + out = fluid.layers.relu(out) + prediction = fluid.layers.fc(input=out, + size=10, + act='softmax', + param_attr=self.fc_param_attr) + loss = fluid.layers.cross_entropy(input=prediction, label=y) + loss = fluid.layers.mean(loss) + sgd = fluid.optimizer.SGD(learning_rate=0.001) + sgd = fluid.contrib.mixed_precision.decorate( + sgd, use_dynamic_loss_scaling=True, init_loss_scaling=128.0) + sgd.minimize(loss) + + return x, y, loss + + def check(self, place, use_cuda): + paddle.manual_seed(1) + paddle.framework.random._manual_program_seed(1) + iters = 5 + batch_size = 16 + + # build_fused_program + main_program = fluid.Program() + startup_program = fluid.Program() + x, y, loss = self.build_fused_program(main_program, startup_program, + use_cuda) + feeder = fluid.DataFeeder(feed_list=[x, y], place=place) + train_reader = paddle.batch( + paddle.dataset.mnist.train(), batch_size=batch_size) + exe = fluid.Executor(place) + loss_vals_fused = [] + scope = fluid.Scope() + with fluid.scope_guard(scope): + exe.run(startup_program) + for _ in range(iters): + data = next(train_reader()) + loss_v = exe.run(main_program, + feed=feeder.feed(data), + fetch_list=[loss]) + loss_vals_fused.append(loss_v[0][0]) + + # build_origin_program + main_program = fluid.Program() + startup_program = fluid.Program() + x, y, loss = self.build_origin_program(main_program, startup_program, + use_cuda) + feeder = fluid.DataFeeder(feed_list=[x, y], place=place) + train_reader = paddle.batch( + paddle.dataset.mnist.train(), batch_size=batch_size) + loss_vals = [] + scope = fluid.Scope() + with fluid.scope_guard(scope): + exe.run(startup_program) + for _ in range(iters): + data = next(train_reader()) + loss_v = exe.run(main_program, + feed=feeder.feed(data), + fetch_list=[loss]) + loss_vals.append(loss_v[0][0]) + + # check loss + for i in 
range(iters): + self.assertAlmostEqual(loss_vals[i], loss_vals_fused[i], delta=1e-5) + + def test_fuse_bn_add_act(self): + place = fluid.CUDAPlace(0) + self.check(place, use_cuda=True) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_gast_with_compatibility.py b/python/paddle/fluid/tests/unittests/test_gast_with_compatibility.py index c7476a8a74256d8eb656778c945c96ee0aa88df4..c176ff09e024db90ea5a81bcf2afe18939c4f538 100644 --- a/python/paddle/fluid/tests/unittests/test_gast_with_compatibility.py +++ b/python/paddle/fluid/tests/unittests/test_gast_with_compatibility.py @@ -16,6 +16,7 @@ from __future__ import print_function import ast import gast +import sys import textwrap import unittest @@ -143,47 +144,60 @@ class TestPythonCompatibility(unittest.TestCase): """ self._check_compatibility(source, target) - def test_with(self): - """ - The fileds `context_expr/optional_vars` of `ast.With` in PY2 - is moved into `ast.With.items.withitem` in PY3. - """ - source = """ - with guard(): - a = 1 - """ - target = """ - with guard_new(): - a = 1 - """ - self._check_compatibility(source, target) - - def test_subscript_Index(self): - source = """ - x = y()[10] - """ - target = """ - x = y()[20] - """ - self._check_compatibility(source, target) - - def test_subscript_Slice(self): - source = """ - x = y()[10:20] - """ - target = """ - x = y()[20:40] - """ - self._check_compatibility(source, target) - - def test_call(self): - source = """ - y = foo(*arg) - """ - target = """ - y = foo(*arg_new) - """ - self._check_compatibility(source, target) + # The 0.3.3 version of gast has a bug in python3.8 that + # would cause the following tests to fail. But this + # problem doesn't affect the use of Paddle's related + # functions, therefore, the following tests would be + # disable in python3.8. + # + # This problem had been fixed and updated to version + # 0.4.1 of gast. 
+ # + # More information please refer to: + # https://github.com/serge-sans-paille/gast/issues/49 + if sys.version_info < (3, 8): + + def test_with(self): + """ + The fileds `context_expr/optional_vars` of `ast.With` in PY2 + is moved into `ast.With.items.withitem` in PY3. + """ + source = """ + with guard(): + a = 1 + """ + target = """ + with guard_new(): + a = 1 + """ + self._check_compatibility(source, target) + + def test_subscript_Index(self): + source = """ + x = y()[10] + """ + target = """ + x = y()[20] + """ + self._check_compatibility(source, target) + + def test_subscript_Slice(self): + source = """ + x = y()[10:20] + """ + target = """ + x = y()[20:40] + """ + self._check_compatibility(source, target) + + def test_call(self): + source = """ + y = foo(*arg) + """ + target = """ + y = foo(*arg_new) + """ + self._check_compatibility(source, target) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_gather_op.py b/python/paddle/fluid/tests/unittests/test_gather_op.py index 1f6e522d2668b5dcd2075ff7af6b4b1ee674632d..5dcce88acf16b96fc26cd56c2e5e034d19405b6c 100644 --- a/python/paddle/fluid/tests/unittests/test_gather_op.py +++ b/python/paddle/fluid/tests/unittests/test_gather_op.py @@ -216,7 +216,7 @@ class API_TestGather(unittest.TestCase): "index": index_np, 'axis': axis_np}, fetch_list=[out]) - expected_output = gather_numpy(x_np, index_np, axis_np) + expected_output = gather_numpy(x_np, index_np, axis_np[0]) self.assertTrue(np.allclose(result, expected_output)) diff --git a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py index fc668ce3493e96e0790af522a439367fe10455f3..dddc6811ef08bdf8504cb6b4fe09813336875b10 100644 --- a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py +++ b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py @@ -239,24 +239,24 @@ class TestGaussianRandomAPI(unittest.TestCase): def 
test_default_dtype(self): paddle.disable_static() - def test_default_fp_16(): + def test_default_fp16(): paddle.framework.set_default_dtype('float16') - paddle.tensor.random.gaussian_random([2, 3]) + paddle.tensor.random.gaussian([2, 3]) - self.assertRaises(TypeError, test_default_fp_16) + self.assertRaises(TypeError, test_default_fp16) - def test_default_fp_32(): + def test_default_fp32(): paddle.framework.set_default_dtype('float32') - out = paddle.tensor.random.gaussian_random([2, 3]) + out = paddle.tensor.random.gaussian([2, 3]) self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP32) - def test_default_fp_64(): + def test_default_fp64(): paddle.framework.set_default_dtype('float64') - out = paddle.tensor.random.gaussian_random([2, 3]) + out = paddle.tensor.random.gaussian([2, 3]) self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP64) - test_default_fp_64() - test_default_fp_32() + test_default_fp64() + test_default_fp32() paddle.enable_static() @@ -265,24 +265,24 @@ class TestStandardNormalDtype(unittest.TestCase): def test_default_dtype(self): paddle.disable_static() - def test_default_fp_16(): + def test_default_fp16(): paddle.framework.set_default_dtype('float16') paddle.tensor.random.standard_normal([2, 3]) - self.assertRaises(TypeError, test_default_fp_16) + self.assertRaises(TypeError, test_default_fp16) - def test_default_fp_32(): + def test_default_fp32(): paddle.framework.set_default_dtype('float32') out = paddle.tensor.random.standard_normal([2, 3]) self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP32) - def test_default_fp_64(): + def test_default_fp64(): paddle.framework.set_default_dtype('float64') out = paddle.tensor.random.standard_normal([2, 3]) self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP64) - test_default_fp_64() - test_default_fp_32() + test_default_fp64() + test_default_fp32() paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py 
b/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py index ce561cd317c48228c4877a2b65b67fe049a0d84a..26fc01ca04506758599ac5d6fe6842984a8d7a9c 100644 --- a/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py +++ b/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py @@ -34,18 +34,18 @@ def generate_proposals_in_python(scores, bbox_deltas, im_info, anchors, rpn_rois = [] rpn_roi_probs = [] - lod = [] + rois_num = [] num_images = scores.shape[0] for img_idx in range(num_images): img_i_boxes, img_i_probs = proposal_for_one_image( im_info[img_idx, :], all_anchors, variances, bbox_deltas[img_idx, :, :, :], scores[img_idx, :, :, :], pre_nms_topN, post_nms_topN, nms_thresh, min_size, eta) - lod.append(img_i_probs.shape[0]) + rois_num.append(img_i_probs.shape[0]) rpn_rois.append(img_i_boxes) rpn_roi_probs.append(img_i_probs) - return rpn_rois, rpn_roi_probs, lod + return rpn_rois, rpn_roi_probs, rois_num def proposal_for_one_image(im_info, all_anchors, variances, bbox_deltas, scores, @@ -87,6 +87,10 @@ def proposal_for_one_image(im_info, all_anchors, variances, bbox_deltas, scores, proposals = clip_tiled_boxes(proposals, im_info[:2]) # remove predicted boxes with height or width < min_size keep = filter_boxes(proposals, min_size, im_info) + if len(keep) == 0: + proposals = np.zeros((1, 4)).astype('float32') + scores = np.zeros((1, 1)).astype('float32') + return proposals, scores proposals = proposals[keep, :] scores = scores[keep, :] @@ -280,8 +284,8 @@ class TestGenerateProposalsOp(OpTest): } self.outputs = { - 'RpnRois': (self.rpn_rois[0], [self.lod]), - 'RpnRoiProbs': (self.rpn_roi_probs[0], [self.lod]), + 'RpnRois': (self.rpn_rois[0], [self.rois_num]), + 'RpnRoiProbs': (self.rpn_roi_probs[0], [self.rois_num]), } def test_check_output(self): @@ -320,7 +324,7 @@ class TestGenerateProposalsOp(OpTest): (batch_size, num_anchors * 4, layer_h, layer_w)).astype('float32') def init_test_output(self): - self.rpn_rois, self.rpn_roi_probs, 
self.lod = generate_proposals_in_python( + self.rpn_rois, self.rpn_roi_probs, self.rois_num = generate_proposals_in_python( self.scores, self.bbox_deltas, self.im_info, self.anchors, self.variances, self.pre_nms_topN, self.post_nms_topN, self.nms_thresh, self.min_size, self.eta) @@ -349,12 +353,21 @@ class TestGenerateProposalsOutLodOp(TestGenerateProposalsOp): } self.outputs = { - 'RpnRois': (self.rpn_rois[0], [self.lod]), - 'RpnRoiProbs': (self.rpn_roi_probs[0], [self.lod]), - 'RpnRoisLod': (np.asarray( - self.lod, dtype=np.int32)) + 'RpnRois': (self.rpn_rois[0], [self.rois_num]), + 'RpnRoiProbs': (self.rpn_roi_probs[0], [self.rois_num]), + 'RpnRoisNum': (np.asarray( + self.rois_num, dtype=np.int32)) } +class TestGenerateProposalsOpNoBoxLeft(TestGenerateProposalsOp): + def init_test_params(self): + self.pre_nms_topN = 12000 # train 12000, test 2000 + self.post_nms_topN = 5000 # train 6000, test 1000 + self.nms_thresh = 0.7 + self.min_size = 1000.0 + self.eta = 1. + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_grid_sample_function.py b/python/paddle/fluid/tests/unittests/test_grid_sample_function.py index 4a33f32a0b6977716d8065419f8e0f88d6c4f44a..ea94a8ba69a784efb1a2a12f6f251316553cab50 100644 --- a/python/paddle/fluid/tests/unittests/test_grid_sample_function.py +++ b/python/paddle/fluid/tests/unittests/test_grid_sample_function.py @@ -100,7 +100,7 @@ def add_cases(suite): GridSampleTestCase( methodName='runTest', mode='bilinear', - padding_mode='reflect', + padding_mode='reflection', align_corners=True)) suite.addTest( GridSampleTestCase( diff --git a/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py b/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py index 4d1ed5aeb96ebbe064e35c1bee9d5775812440f7..bf2f9518fb0c720556b7eecdf5b286dea0fff96c 100644 --- a/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py +++ b/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py @@ -73,7 +73,7 
@@ def unnormalizeAndClip(grid_slice, max_val, align_corners, padding_mode): if padding_mode == "border": grid_slice = clip(grid_slice, 0, max_val) - elif padding_mode == "reflect": + elif padding_mode == "reflection": double_range = 2 * max_val if align_corners else (max_val + 1) * 2 grid_abs = np.abs(grid_slice) if align_corners else np.abs(grid_slice + 0.5) @@ -211,7 +211,7 @@ class Case2(TestGridSamplerOp): self.grid_shape = (2, 8, 9, 2) self.theta_shape = (2, 2, 3) self.align_corners = False - self.padding_mode = "reflect" + self.padding_mode = "reflection" self.mode = "bilinear" @@ -221,7 +221,7 @@ class Case3(TestGridSamplerOp): self.grid_shape = (2, 8, 9, 2) self.theta_shape = (2, 2, 3) self.align_corners = True - self.padding_mode = "reflect" + self.padding_mode = "reflection" self.mode = "bilinear" @@ -231,7 +231,7 @@ class Case4(TestGridSamplerOp): self.grid_shape = (2, 8, 9, 2) self.theta_shape = (2, 2, 3) self.align_corners = False - self.padding_mode = "reflect" + self.padding_mode = "reflection" self.mode = "nearest" self.numeric_grad_delta = 0.0001 diff --git a/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py index 654e8d6f129e1ffe0dce59113ca88a16d348f210..a46b9b0ca78bf37e1c421a08a6fa8c5353c6d45d 100644 --- a/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py +++ b/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py @@ -35,24 +35,33 @@ class TestDygraphGroupNormv2(unittest.TestCase): def compute_v1(x): with fluid.dygraph.guard(p): - gn = fluid.dygraph.GroupNorm(channels=2, groups=2) + gn = fluid.dygraph.GroupNorm(channels=6, groups=2) y = gn(fluid.dygraph.to_variable(x)) return y.numpy() def compute_v2(x): with fluid.dygraph.guard(p): - gn = paddle.nn.GroupNorm(num_channels=2, num_groups=2) + gn = paddle.nn.GroupNorm(num_channels=6, num_groups=2) y = gn(fluid.dygraph.to_variable(x)) return y.numpy() + def test_weight_bias_false(): + with 
fluid.dygraph.guard(p): + gn = paddle.nn.GroupNorm( + num_channels=6, + num_groups=2, + weight_attr=False, + bias_attr=False) + x = np.random.randn(*shape).astype("float32") y1 = compute_v1(x) y2 = compute_v2(x) self.assertTrue(np.allclose(y1, y2)) + test_weight_bias_false() def test_static(self): places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda() and core.op_support_gpu("layer_norm"): + if core.is_compiled_with_cuda() and core.op_support_gpu("group_norm"): places.append(fluid.CUDAPlace(0)) for p in places: exe = fluid.Executor(p) @@ -60,7 +69,7 @@ class TestDygraphGroupNormv2(unittest.TestCase): def compute_v1(x_np): with program_guard(Program(), Program()): - gn = fluid.dygraph.GroupNorm(channels=2, groups=2) + gn = fluid.dygraph.GroupNorm(channels=6, groups=2) x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype) y = gn(x) exe.run(fluid.default_startup_program()) @@ -69,7 +78,7 @@ class TestDygraphGroupNormv2(unittest.TestCase): def compute_v2(x_np): with program_guard(Program(), Program()): - gn = paddle.nn.GroupNorm(num_channels=2, num_groups=2) + gn = paddle.nn.GroupNorm(num_channels=6, num_groups=2) x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype) y = gn(x) exe.run(fluid.default_startup_program()) diff --git a/python/paddle/fluid/tests/unittests/test_hdfs1.py b/python/paddle/fluid/tests/unittests/test_hdfs1.py index 430ed1abe860869d791f0eac17accc8416db1eca..1aac1236156ca159e9b285611a55f38925be22c2 100644 --- a/python/paddle/fluid/tests/unittests/test_hdfs1.py +++ b/python/paddle/fluid/tests/unittests/test_hdfs1.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from paddle.fluid.tests.unittests.hdfs_test_utils import FSTestBase import unittest import paddle.fluid as fluid import paddle.fluid.incubate.fleet.base.role_maker as role_maker @@ -19,12 +20,10 @@ from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet import os import sys -from paddle.distributed.fleet.utils import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError +from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError java_home = os.environ["JAVA_HOME"] -from paddle.fluid.tests.unittests.hdfs_test_utils import FSTestBase - class FSTest1(FSTestBase): def test_timeout(self): diff --git a/python/paddle/fluid/tests/unittests/test_hdfs2.py b/python/paddle/fluid/tests/unittests/test_hdfs2.py index 7754f89e3c901ac14cb102881e8d338442038559..1fa019bb9cd02c5f9a7181bc321618d5c37a8755 100644 --- a/python/paddle/fluid/tests/unittests/test_hdfs2.py +++ b/python/paddle/fluid/tests/unittests/test_hdfs2.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from paddle.fluid.tests.unittests.hdfs_test_utils import FSTestBase import unittest import paddle.fluid as fluid import paddle.fluid.incubate.fleet.base.role_maker as role_maker @@ -19,12 +20,10 @@ from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet import os import sys -from paddle.distributed.fleet.utils import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError +from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError java_home = os.environ["JAVA_HOME"] -from paddle.fluid.tests.unittests.hdfs_test_utils import FSTestBase - class FSTest2(FSTestBase): def test_hdfs(self): diff --git a/python/paddle/fluid/tests/unittests/test_hdfs3.py b/python/paddle/fluid/tests/unittests/test_hdfs3.py index 1a045f4b17fc9b8b68ccf81a23cb953db58a9db7..218bf12ca608a1fd908b3c6e2533517a960e6a23 100644 --- a/python/paddle/fluid/tests/unittests/test_hdfs3.py +++ b/python/paddle/fluid/tests/unittests/test_hdfs3.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from paddle.fluid.tests.unittests.hdfs_test_utils import FSTestBase import unittest import paddle.fluid as fluid import paddle.fluid.incubate.fleet.base.role_maker as role_maker @@ -19,12 +20,10 @@ from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet import os import sys -from paddle.distributed.fleet.utils import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError +from paddle.distributed.fleet.utils.fs import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError java_home = os.environ["JAVA_HOME"] -from paddle.fluid.tests.unittests.hdfs_test_utils import FSTestBase - class FSTest3(FSTestBase): def test_hdfs(self): diff --git a/python/paddle/fluid/tests/unittests/test_imperative_basic.py b/python/paddle/fluid/tests/unittests/test_imperative_basic.py index 74cfeab601b04d9624a5f6e48fd06c6cbf3715f8..22f16287c33f96a43361b5fe4ed5d0fe3edbb1bc 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_basic.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_basic.py @@ -652,7 +652,7 @@ class TestDygraphUtils(unittest.TestCase): a_np = np.random.uniform(-2, 2, (10, 20, 30)).astype(np.float32) helper = LayerHelper(fluid.unique_name.generate("test"), act="relu") func = helper.append_activation - with fluid.dygraph.guard(): + with fluid.dygraph.guard(fluid.core.CPUPlace()): a = fluid.dygraph.to_variable(a_np) fluid.set_flags({'FLAGS_use_mkldnn': True}) try: diff --git a/python/paddle/fluid/tests/unittests/test_imperative_data_loader_process.py b/python/paddle/fluid/tests/unittests/test_imperative_data_loader_process.py index 7fb2cb0090da57ae837d1f774518dd90a41df56c..9b2d71c9f907779bc9b27b51e21056496f8d4dd5 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_data_loader_process.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_data_loader_process.py @@ -18,6 +18,7 @@ import multiprocessing import numpy as np import paddle.fluid as fluid from paddle.fluid import core +from 
paddle.fluid.reader import _reader_process_loop if sys.version_info[0] == 2: import Queue as queue @@ -66,7 +67,7 @@ class TestDygraphDataLoaderProcess(unittest.TestCase): batch_generator_creator(self.batch_size, self.batch_num), places=fluid.CPUPlace()) loader._data_queue = queue.Queue(self.batch_num + 1) - loader._reader_process_loop() + _reader_process_loop(loader._batch_reader, loader._data_queue) # For clean memory mapped files util_queue = multiprocessing.Queue(self.batch_num + 1) for _ in range(self.batch_num): @@ -94,7 +95,7 @@ class TestDygraphDataLoaderProcess(unittest.TestCase): loader._data_queue = queue.Queue(self.batch_num + 1) exception = None try: - loader._reader_process_loop() + _reader_process_loop(loader._batch_reader, loader._data_queue) except core.EnforceNotMet as ex: exception = ex self.assertIsNotNone(exception) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_debug_string.py b/python/paddle/fluid/tests/unittests/test_imperative_debug_string.py deleted file mode 100644 index 171687283bc5db709501ae33d131470582f4d106..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/unittests/test_imperative_debug_string.py +++ /dev/null @@ -1,75 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import unittest -import paddle.fluid as fluid -import numpy as np - - -class MLP(fluid.Layer): - def __init__(self, input_size): - super(MLP, self).__init__() - self._linear1 = fluid.dygraph.Linear( - input_size, - 3, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.1)), - bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.1))) - self._linear2 = fluid.dygraph.Linear( - 3, - 4, - param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.1)), - bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=0.1))) - - def forward(self, inputs): - x = self._linear1(inputs) - x = self._linear2(x) - x = fluid.layers.reduce_sum(x) - return x - - -class TestDygraphDebugString(unittest.TestCase): - def test_dygraph_debug_string(self): - np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) - unique_name = 0 - trace_var = 0 - alive_var = 0 - with fluid.dygraph.guard(): - mlp = MLP(input_size=2) - for i in range(10): - var_inp = fluid.dygraph.base.to_variable(np_inp) - out = mlp(var_inp) - out.backward() - mlp.clear_gradients() - unique_name_tmp, trace_var_tmp, alive_var_tmp = fluid.dygraph.base._print_debug_msg( - mlp.parameters(), is_test=True) - if i > 0: - self.assertGreaterEqual(unique_name, unique_name_tmp) - self.assertGreaterEqual(trace_var, trace_var_tmp) - self.assertGreaterEqual(alive_var, alive_var_tmp) - else: - unique_name = unique_name_tmp - trace_var = trace_var_tmp - alive_var = alive_var_tmp - try: - fluid.dygraph.base._print_debug_msg(mlp.parameters()) - except Exception as e: - raise RuntimeError( - "No Exception is accepted in _print_debug_msg, but we got: {}". 
- format(e)) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py b/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py index 720c9f95c251ec54c7e7fa74c8e59e135a8c6be7..39c6fca89ccbef8c61055cd7d1547d3450ae96cb 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py @@ -346,7 +346,7 @@ class TestRaiseNoDoubleGradOp(TestCase): with fluid.dygraph.guard(): x = fluid.layers.ones(shape=[2, 3, 2, 2], dtype='float32') x.stop_gradient = False - y = paddle.fluid.layers.batch_norm(x) + y = paddle.fluid.layers.group_norm(x, groups=1) dx = fluid.dygraph.grad( outputs=[y], inputs=[x], create_graph=True, diff --git a/python/paddle/fluid/tests/unittests/test_imperative_numpy_bridge.py b/python/paddle/fluid/tests/unittests/test_imperative_numpy_bridge.py index da01be8159a5c5d277a22134eb60ef37ef85fc4f..772dd913e4d20ccf51601ea620822c250cb45320 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_numpy_bridge.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_numpy_bridge.py @@ -15,18 +15,26 @@ import unittest import numpy as np import paddle.fluid as fluid +import warnings class TestImperativeNumpyBridge(unittest.TestCase): def test_tensor_from_numpy(self): data_np = np.array([[2, 3, 1]]).astype('float32') with fluid.dygraph.guard(fluid.CPUPlace()): - var = fluid.dygraph.to_variable(data_np, zero_copy=True) - self.assertTrue(np.array_equal(var.numpy(), data_np)) - data_np[0][0] = 4 - self.assertEqual(data_np[0][0], 4) - self.assertEqual(var[0][0].numpy()[0], 4) - self.assertTrue(np.array_equal(var.numpy(), data_np)) + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + var = fluid.dygraph.to_variable(data_np, zero_copy=True) + assert "Currently, zero_copy is not supported, and it will be discarded." 
in str( + w[-1].message) + # Temporally diable zero_copy + # var = fluid.dygraph.to_variable(data_np, zero_copy=True) + # self.assertTrue(np.array_equal(var.numpy(), data_np)) + # data_np[0][0] = 4 + # self.assertEqual(data_np[0][0], 4) + # self.assertEqual(var[0][0].numpy()[0], 4) + # self.assertTrue(np.array_equal(var.numpy(), data_np)) + var2 = fluid.dygraph.to_variable(data_np, zero_copy=False) self.assertTrue(np.array_equal(var2.numpy(), data_np)) data_np[0][0] = -1 diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py index 48aea3a584dd25667704b22d99d1074c481bb76c..22e19efcb58d19c41835565de2c8c01fe253702a 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py @@ -374,8 +374,7 @@ class TestDygraphPtbRnn(unittest.TestCase): adam._learning_rate.step_num = 0 para_state_dict, opti_state_dict = paddle.load("./test_dy") - print(opti_state_dict['LR_Scheduler']) - adam.set_dict(opti_state_dict) + adam.set_state_dict(opti_state_dict) opti_dict = adam.state_dict() for k, v in opti_dict.items(): @@ -393,7 +392,7 @@ class TestDygraphPtbRnn(unittest.TestCase): var.set(np.zeros_like(np_t), place) - ptb_model.set_dict(para_state_dict) + ptb_model.set_state_dict(stat_dict=para_state_dict) state_dict = ptb_model.state_dict() @@ -483,7 +482,7 @@ class TestDygraphPtbRnn(unittest.TestCase): if isinstance(adam._learning_rate, LearningRateDecay): adam._learning_rate.step_num = 0 - adam.set_dict(self.opti_dict) + adam.set_state_dict(self.opti_dict) opti_dict = adam.state_dict() for k, v in opti_dict.items(): if isinstance(v, core.VarBase): @@ -500,7 +499,7 @@ class TestDygraphPtbRnn(unittest.TestCase): var.set(np.zeros_like(np_t), place) - ptb_model.set_dict(self.state_dict) + ptb_model.set_state_dict(self.state_dict) state_dict = ptb_model.state_dict() @@ -593,7 +592,7 @@ class 
TestDygraphPtbRnn(unittest.TestCase): if isinstance(adam._learning_rate, LearningRateDecay): adam._learning_rate.step_num = 0 - adam.set_dict(np_opti_dict) + adam.set_state_dict(np_opti_dict) opti_dict = adam.state_dict() for k, v in opti_dict.items(): @@ -613,7 +612,7 @@ class TestDygraphPtbRnn(unittest.TestCase): var.set(np.zeros_like(np_t), place) - ptb_model.set_dict(np_state_dict) + ptb_model.set_state_dict(np_state_dict) state_dict = ptb_model.state_dict() @@ -656,8 +655,8 @@ class TestDygraphPtbRnn(unittest.TestCase): last_hidden = None last_cell = None - adam.set_dict(self.opti_dict) - ptb_model.set_dict(self.state_dict) + adam.set_state_dict(self.opti_dict) + ptb_model.set_state_dict(self.state_dict) for i in range(1): x_data = np.arange(12).reshape(4, 3).astype('int64') @@ -745,8 +744,8 @@ class TestDygraphPtbRnn(unittest.TestCase): last_cell = None state_dict, opti_dict = fluid.load_dygraph("./test_dy") - adam.set_dict(opti_dict) - ptb_model.set_dict(state_dict) + adam.set_state_dict(opti_dict) + ptb_model.set_state_dict(state_dict) for i in range(1): x_data = np.arange(12).reshape(4, 3).astype('int64') @@ -849,8 +848,8 @@ class TestDygraphPtbRnn(unittest.TestCase): for k, v in self.state_dict.items(): np_state_dict[k] = v.numpy() - adam.set_dict(np_opti_dict) - ptb_model.set_dict(np_state_dict) + adam.set_state_dict(np_opti_dict) + ptb_model.set_state_dict(np_state_dict) for i in range(1): x_data = np.arange(12).reshape(4, 3).astype('int64') y_data = np.arange(1, 13).reshape(4, 3).astype('int64') @@ -912,6 +911,22 @@ class TestDygraphPtbRnn(unittest.TestCase): para_state_dict, opti_state_dict = paddle.load( os.path.join('saved_dy', 'emb_dy.pdopt')) + def test_load_compatible_with_keep_name_table(self): + with fluid.dygraph.guard(): + emb = fluid.dygraph.Embedding([10, 10]) + state_dict = emb.state_dict() + paddle.save(state_dict, os.path.join('saved_dy', 'emb_dy')) + + para_state_dict, opti_state_dict = paddle.load( + os.path.join('saved_dy', 'emb_dy'), 
True) + self.assertTrue(para_state_dict != None) + self.assertTrue(opti_state_dict == None) + + para_state_dict, opti_state_dict = paddle.load( + os.path.join('saved_dy', 'emb_dy'), keep_name_table=True) + self.assertTrue(para_state_dict != None) + self.assertTrue(opti_state_dict == None) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py index 3ccd1dbda3a443d50e43ba498cb3d5b529318c32..3eb413a62664057c56567d5834b216110fac04fb 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py @@ -918,6 +918,29 @@ class TestDygraphPtbRnn(unittest.TestCase): para_state_dict, opti_state_dict = paddle.load( os.path.join('saved_dy', 'emb_dy.pdopt')) + def test_no_state_in_input_dict(self): + with fluid.dygraph.guard(): + emb = fluid.dygraph.Embedding([10, 10]) + state_dict = emb.state_dict() + paddle.save(state_dict, os.path.join('saved_dy', 'emb_dy')) + + para_state_dict, _ = paddle.load(os.path.join('saved_dy', 'emb_dy')) + para_state_dict.pop('weight') + + emb.set_state_dict(para_state_dict) + + def test_state_shape_mismatch(self): + with fluid.dygraph.guard(): + emb = fluid.dygraph.Embedding([10, 10]) + state_dict = emb.state_dict() + paddle.save(state_dict, os.path.join('saved_dy', 'emb_dy')) + + para_state_dict, _ = paddle.load(os.path.join('saved_dy', 'emb_dy')) + para_state_dict['weight'] = np.expand_dims( + para_state_dict['weight'], axis=-1) + + emb.set_state_dict(para_state_dict) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py b/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py index e94157fa047eef065bc4bd0bfb3d6b6c778ea7b9..1ab37aaed23530f7cd886193dbf02d0a94fa61e2 100644 --- 
a/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py @@ -592,7 +592,7 @@ class TestStarGANWithGradientPenalty(unittest.TestCase): cfg = Config(place) dataset = create_mnist_dataset(cfg) - dataset = fluid.io.cache(dataset) + dataset = paddle.reader.cache(dataset) static_graph_model = StaticGraphTrainModel(cfg) static_loss = [] diff --git a/python/paddle/fluid/tests/unittests/test_imperative_using_non_zero_gpu.py b/python/paddle/fluid/tests/unittests/test_imperative_using_non_zero_gpu.py index 0af8132acfd26917f83dda6350583c7a06858a7d..f2dfaef397797a9d1b7e2900cdb2cc3617bcb933 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_using_non_zero_gpu.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_using_non_zero_gpu.py @@ -21,7 +21,6 @@ import numpy as np class TestImperativeUsingNonZeroGpu(unittest.TestCase): def run_main(self, np_arr, place): with guard(place): - embedding = Embedding(size=[10, 10]) var = to_variable(np_arr) self.assertTrue(np.array_equal(np_arr, var.numpy())) @@ -30,7 +29,6 @@ class TestImperativeUsingNonZeroGpu(unittest.TestCase): return np_arr = np.random.random([11, 13]).astype('float32') - self.run_main(np_arr, fluid.CUDAPlace(1)) self.run_main(np_arr, fluid.CUDAPlace(0)) diff --git a/python/paddle/fluid/tests/unittests/test_inplace_addto_strategy.py b/python/paddle/fluid/tests/unittests/test_inplace_addto_strategy.py new file mode 100644 index 0000000000000000000000000000000000000000..c75acd7c15b1e96c49fba61b9f8348b62ab73894 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_inplace_addto_strategy.py @@ -0,0 +1,114 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest + +import paddle +import paddle.fluid as fluid +import paddle.fluid.layers as layers +from paddle.fluid.backward import calc_gradient +import numpy as np + + +class ConvBNLayer(fluid.Layer): + def __init__(self, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + act=None, + use_cudnn=False): + super(ConvBNLayer, self).__init__() + + self._conv = fluid.dygraph.Conv2D( + num_channels=num_channels, + num_filters=num_filters, + filter_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + act=None, + bias_attr=False, + use_cudnn=use_cudnn) + + self._batch_norm = fluid.dygraph.BatchNorm(num_filters, act=act) + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + return y + + +def create_program(): + main = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(main, startup): + x = fluid.data(name='img', shape=[-1, 3, 224, 224]) + x.stop_gradient = False + x = fluid.layers.prelu(x, mode="channel") + conv = ConvBNLayer( + num_channels=3, + num_filters=3, + filter_size=1, + act='relu', + use_cudnn=True) + y = conv(x) + x + + loss = fluid.layers.reduce_sum(y) + + sgd = fluid.optimizer.SGD(learning_rate=0.01) + sgd.minimize(loss) + + return loss, main, startup, conv._conv.weight + + +class TestInplaceAddto(unittest.TestCase): + def test_result(self): + def run_program(enable_addto): + np.random.seed(10) + paddle.manual_seed(10) + paddle.framework.random._manual_program_seed(10) + if 
fluid.core.is_compiled_with_cuda(): + fluid.set_flags({"FLAGS_cudnn_deterministic": True}) + fluid.set_flags({"FLAGS_max_inplace_grad_add": 2}) + loss, main, startup, w = create_program() + place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + exe = fluid.Executor(place) + + strategy = fluid.BuildStrategy() + strategy.enable_addto = enable_addto + compiled = fluid.CompiledProgram(main).with_data_parallel( + loss_name=loss.name, build_strategy=strategy) + + exe.run(startup) + img = np.random.uniform(-128, 128, + [8, 3, 224, 224]).astype(np.float32) + for i in range(2): + res = exe.run(compiled, + feed={'img': img}, + fetch_list=[loss.name, w.name]) + return res + + res1, w1 = run_program(True) + res2, w2 = run_program(False) + print(res1, res2) + self.assertTrue(np.array_equal(res1, res2)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py index b02ba1a584b52dbbc99fcc8ed7bad438e7a9dd46..c45c144e3ad44c5781ea1f1d7d61028b56d8a254 100644 --- a/python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py +++ b/python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py @@ -48,7 +48,13 @@ class TestInstanceNorm(unittest.TestCase): instance_norm3d = paddle.nn.BatchNorm3d(1) instance_norm3d(fluid.dygraph.to_variable(x_data_4)) + def weight_bias_false(): + x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32') + instance_norm3d = paddle.nn.BatchNorm3d( + 1, weight_attr=False, bias_attr=False) + with fluid.dygraph.guard(p): + weight_bias_false() self.assertRaises(ValueError, error1d) self.assertRaises(ValueError, error2d) self.assertRaises(ValueError, error3d) diff --git a/python/paddle/fluid/tests/unittests/test_jit_save_load.py b/python/paddle/fluid/tests/unittests/test_jit_save_load.py index 87b6e76a6d0ab7f5fba7c4526734d81475e1540e..7e6ca8076de5186def1229b58bd23df73021430e 100644 
--- a/python/paddle/fluid/tests/unittests/test_jit_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_jit_save_load.py @@ -23,7 +23,7 @@ from paddle.static import InputSpec import paddle.fluid as fluid from paddle.fluid.dygraph import Linear from paddle.fluid.dygraph import declarative, ProgramTranslator -from paddle.fluid.dygraph.io import EXTRA_VAR_INFO_FILENAME +from paddle.fluid.dygraph.io import EXTRA_VAR_INFO_FILENAME, VARIABLE_FILENAME BATCH_SIZE = 32 BATCH_NUM = 10 @@ -56,6 +56,16 @@ class LinearNet(fluid.dygraph.Layer): return self._linear(x) +class LinearNetWithInputSpec(fluid.dygraph.Layer): + def __init__(self, in_size, out_size): + super(LinearNetWithInputSpec, self).__init__() + self._linear = Linear(in_size, out_size) + + @declarative(input_spec=[InputSpec(shape=[None, 784], dtype='float32')]) + def forward(self, x): + return self._linear(x) + + class LinearNetNotDeclarative(fluid.dygraph.Layer): def __init__(self, in_size, out_size): super(LinearNetNotDeclarative, self).__init__() @@ -65,6 +75,23 @@ class LinearNetNotDeclarative(fluid.dygraph.Layer): return self._linear(x) +class LinerNetWithLabel(paddle.nn.Layer): + def __init__(self, in_size, out_size): + super(LinerNetWithLabel, self).__init__() + self._linear = Linear(in_size, out_size) + + @declarative(input_spec=[ + InputSpec( + shape=[None, 784], dtype='float32', name="image"), InputSpec( + shape=[None, 1], dtype='int64', name="label") + ]) + def forward(self, x, label): + out = self._linear(x) + loss = fluid.layers.cross_entropy(out, label) + avg_loss = fluid.layers.mean(loss) + return out, avg_loss + + class LinearNetReturnLoss(fluid.dygraph.Layer): def __init__(self, in_size, out_size): super(LinearNetReturnLoss, self).__init__() @@ -78,6 +105,72 @@ class LinearNetReturnLoss(fluid.dygraph.Layer): return z, loss +class LinearNetMultiInput(fluid.dygraph.Layer): + def __init__(self, in_size, out_size): + super(LinearNetMultiInput, self).__init__() + self._linear1 = Linear(in_size, 
out_size) + self._linear2 = Linear(in_size, out_size) + + @declarative(input_spec=[ + InputSpec( + [None, 8], dtype='float32'), InputSpec( + [None, 8], dtype='float32') + ]) + def forward(self, x, y): + x_out = self._linear1(x) + y_out = self._linear2(y) + loss = fluid.layers.mean(x_out + y_out) + return x_out, y_out, loss + + +class MultiLoadingLinearNet(fluid.dygraph.Layer): + def __init__(self, size, model_path): + super(MultiLoadingLinearNet, self).__init__() + self._linear = Linear(size, size) + self._load_linear1 = fluid.dygraph.jit.load(model_path) + self._load_linear2 = fluid.dygraph.jit.load(model_path) + + @declarative + def forward(self, x): + tmp1 = self._linear(x) + tmp2 = self._load_linear1(tmp1) + tmp3 = self._load_linear2(tmp2) + y = self._linear(tmp3) + return y + + +class LinearNetReturnHidden(fluid.dygraph.Layer): + def __init__(self, in_size, out_size): + super(LinearNetReturnHidden, self).__init__() + self._linear_1 = Linear(in_size, out_size) + self._linear_2 = Linear(in_size, out_size) + + @declarative + def forward(self, x): + y = self._linear_1(x) + z = self._linear_2(y) + loss = fluid.layers.mean(z) + return y, loss + + +class EmptyLayer(paddle.nn.Layer): + def __init__(self): + super(EmptyLayer, self).__init__() + + @paddle.jit.to_static + def forward(self, x): + return x + + +class NoParamLayer(paddle.nn.Layer): + def __init__(self): + super(NoParamLayer, self).__init__() + + @paddle.jit.to_static + def forward(self, x, y): + return x + y + + def train(layer, input_size=784, label_size=1): # create optimizer sgd = fluid.optimizer.SGDOptimizer( @@ -102,6 +195,27 @@ def train(layer, input_size=784, label_size=1): return [img], layer, avg_loss +def train_with_label(layer, input_size=784, label_size=1): + # create optimizer + sgd = fluid.optimizer.SGDOptimizer( + learning_rate=0.01, parameter_list=layer.parameters()) + # create data loader + train_loader = fluid.io.DataLoader.from_generator(capacity=5) + train_loader.set_batch_generator( + 
random_batch_reader(input_size, label_size)) + # train + for data in train_loader(): + img, label = data + label.stop_gradient = True + + out, avg_loss = layer(img, label) + + avg_loss.backward() + sgd.minimize(avg_loss) + layer.clear_gradients() + return out + + class TestJitSaveLoad(unittest.TestCase): def setUp(self): self.model_path = "model.test_jit_save_load" @@ -159,8 +273,11 @@ class TestJitSaveLoad(unittest.TestCase): train_layer.eval() # construct new model new_layer = LinearNet(784, 1) - model_dict, _ = fluid.dygraph.load_dygraph(self.model_path) - new_layer.set_dict(model_dict) + orig_state_dict = new_layer.state_dict() + load_state_dict, _ = fluid.dygraph.load_dygraph(self.model_path) + for structured_name in orig_state_dict: + self.assertTrue(structured_name in load_state_dict) + new_layer.set_state_dict(load_state_dict) new_layer.eval() # inference & compare x = fluid.dygraph.to_variable( @@ -168,57 +285,20 @@ class TestJitSaveLoad(unittest.TestCase): self.assertTrue( np.array_equal(train_layer(x).numpy(), new_layer(x).numpy())) - def test_save_get_program_failed(self): - layer = LinearNetNotDeclarative(784, 1) - example_inputs, layer, _ = train(layer) - with self.assertRaises(RuntimeError): - fluid.dygraph.jit.save( - layer=layer, - model_path=self.model_path, - input_spec=example_inputs) - def test_load_dygraph_no_path(self): model_path = "model.test_jit_save_load.no_path" new_layer = LinearNet(784, 1) with self.assertRaises(ValueError): model_dict, _ = fluid.dygraph.load_dygraph(model_path) - def test_load_dygraph_no_var_info(self): - model_path = "model.test_jit_save_load.no_var_info" + def test_jit_load_model_incomplete(self): + model_path = "model.test_jit_save_load.remove_variables" self.train_and_save_model(model_path=model_path) - # remove `__variables.info__` - var_info_path = os.path.join(model_path, EXTRA_VAR_INFO_FILENAME) - os.remove(var_info_path) - new_layer = LinearNet(784, 1) - with self.assertRaises(RuntimeError): - model_dict, _ = 
fluid.dygraph.load_dygraph(model_path) - - def test_load_dygraph_not_var_file(self): - model_path = "model.test_jit_save_load.no_var_file" - configs = fluid.dygraph.jit.SaveLoadConfig() - configs.params_filename = "__params__" - self.train_and_save_model(model_path=model_path, configs=configs) - new_layer = LinearNet(784, 1) - with self.assertRaises(RuntimeError): - model_dict, _ = fluid.dygraph.load_dygraph(model_path) - - -class LinearNetMultiInput(fluid.dygraph.Layer): - def __init__(self, in_size, out_size): - super(LinearNetMultiInput, self).__init__() - self._linear1 = Linear(in_size, out_size) - # self._linear2 = Linear(in_size, out_size) - - @declarative(input_spec=[ - InputSpec( - [None, 8], dtype='float32'), InputSpec( - [None, 8], dtype='float32') - ]) - def forward(self, x, y): - x_out = self._linear1(x) - y_out = self._linear1(y) - loss = fluid.layers.mean(x_out + y_out) - return x_out, y_out, loss + # remove `__variables__` + var_path = os.path.join(model_path, VARIABLE_FILENAME) + os.remove(var_path) + with self.assertRaises(ValueError): + paddle.jit.load(model_path) class TestSaveLoadWithInputSpec(unittest.TestCase): @@ -364,22 +444,6 @@ class TestJitSaveLoadConfig(unittest.TestCase): np.array_equal(train_layer(x)[0].numpy(), infer_layer(x).numpy())) -class MultiLoadingLinearNet(fluid.dygraph.Layer): - def __init__(self, size, model_path): - super(MultiLoadingLinearNet, self).__init__() - self._linear = Linear(size, size) - self._load_linear1 = fluid.dygraph.jit.load(model_path) - self._load_linear2 = fluid.dygraph.jit.load(model_path) - - @declarative - def forward(self, x): - tmp1 = self._linear(x) - tmp2 = self._load_linear1(tmp1) - tmp3 = self._load_linear2(tmp2) - y = self._linear(tmp3) - return y - - class TestJitMultipleLoading(unittest.TestCase): def setUp(self): self.linear_size = 4 @@ -408,20 +472,6 @@ class TestJitMultipleLoading(unittest.TestCase): name_set.add(var.name) -class LinearNetReturnHidden(fluid.dygraph.Layer): - def 
__init__(self, in_size, out_size): - super(LinearNetReturnHidden, self).__init__() - self._linear_1 = Linear(in_size, out_size) - self._linear_2 = Linear(in_size, out_size) - - @declarative - def forward(self, x): - y = self._linear_1(x) - z = self._linear_2(y) - loss = fluid.layers.mean(z) - return y, loss - - class TestJitPruneModelAndLoad(unittest.TestCase): def setUp(self): self.linear_size = 4 @@ -480,5 +530,230 @@ class TestJitPruneModelAndLoad(unittest.TestCase): fluid.dygraph.jit.load(self.model_path) +class TestJitSaveMultiCases(unittest.TestCase): + def setUp(self): + # enable dygraph mode + fluid.enable_dygraph() + # config seed + paddle.manual_seed(SEED) + paddle.framework.random._manual_program_seed(SEED) + + def verify_inference_correctness(self, layer, model_path, with_label=False): + layer.eval() + loaded_layer = paddle.jit.load(model_path) + loaded_layer.eval() + # inference & compare + x = paddle.to_variable(np.random.random((1, 784)).astype('float32')) + if with_label: + y = paddle.to_variable(np.random.random((1, 1)).astype('int64')) + pred, _ = layer(x, y) + pred = pred.numpy() + else: + pred = layer(x).numpy() + loaded_pred = loaded_layer(x).numpy() + self.assertTrue( + np.array_equal(pred, loaded_pred), + msg="Result diff when load and inference:\nlayer result:\n{}\n" \ + "loaded layer result:\n{}".format(pred, loaded_pred)) + + def test_no_prune_to_static_after_train(self): + layer = LinearNet(784, 1) + + train(layer) + + model_path = "test_no_prune_to_static_after_train" + paddle.jit.save(layer, model_path) + + self.verify_inference_correctness(layer, model_path) + + def test_no_prune_to_static_no_train(self): + layer = LinearNetWithInputSpec(784, 1) + + model_path = "test_no_prune_to_static_no_train" + paddle.jit.save(layer, model_path) + + self.verify_inference_correctness(layer, model_path) + + def test_no_prune_no_to_static_after_train(self): + layer = LinearNetNotDeclarative(784, 1) + + train(layer) + + model_path = 
"test_no_prune_no_to_static_after_train" + paddle.jit.save( + layer, + model_path, + input_spec=[InputSpec( + shape=[None, 784], dtype='float32')]) + + self.verify_inference_correctness(layer, model_path) + + def test_no_prune_no_to_static_after_train_with_examples(self): + layer = LinearNetNotDeclarative(784, 1) + + example_inputs, _, _ = train(layer) + + model_path = "test_no_prune_no_to_static_after_train_with_examples" + fluid.dygraph.jit.save( + layer=layer, model_path=model_path, input_spec=example_inputs) + + self.verify_inference_correctness(layer, model_path) + + def test_no_prune_no_to_static_no_train(self): + layer = LinearNetNotDeclarative(784, 1) + + model_path = "test_no_prune_no_to_static_no_train" + paddle.jit.save( + layer, + model_path, + input_spec=[InputSpec( + shape=[None, 784], dtype='float32')]) + + self.verify_inference_correctness(layer, model_path) + + def test_prune_to_static_after_train(self): + layer = LinerNetWithLabel(784, 1) + + out = train_with_label(layer) + + model_path = "test_prune_to_static_after_train" + configs = paddle.SaveLoadConfig() + configs.output_spec = [out] + paddle.jit.save( + layer, + model_path, + input_spec=[ + InputSpec( + shape=[None, 784], dtype='float32', name="image") + ], + configs=configs) + + self.verify_inference_correctness(layer, model_path, True) + + def test_prune_to_static_no_train(self): + layer = LinerNetWithLabel(784, 1) + + model_path = "test_prune_to_static_no_train" + configs = paddle.SaveLoadConfig() + # TODO: no train, cannot get output_spec var here + # now only can use index + configs.output_spec = layer.forward.outputs[:1] + paddle.jit.save( + layer, + model_path, + input_spec=[ + InputSpec( + shape=[None, 784], dtype='float32', name="image") + ], + configs=configs) + + self.verify_inference_correctness(layer, model_path, True) + + def test_no_prune_input_spec_name_warning(self): + layer = LinearNetWithInputSpec(784, 1) + + train(layer) + + model_path = 
"test_no_prune_input_spec_name_warning" + paddle.jit.save( + layer, + model_path, + input_spec=[InputSpec( + shape=[None, 784], dtype='float32')]) + paddle.jit.save( + layer, + model_path, + input_spec=[ + InputSpec( + shape=[None, 784], dtype='float32', name='feed_input') + ]) + + self.verify_inference_correctness(layer, model_path) + + def test_not_prune_output_spec_name_warning(self): + layer = LinearNet(784, 1) + + train(layer) + + model_path = "test_not_prune_output_spec_name_warning" + configs = paddle.SaveLoadConfig() + out = paddle.to_variable(np.random.random((1, 1)).astype('float')) + configs.output_spec = [out] + paddle.jit.save(layer, model_path, configs=configs) + + self.verify_inference_correctness(layer, model_path) + + def test_prune_input_spec_name_error(self): + layer = LinerNetWithLabel(784, 1) + + model_path = "test_prune_input_spec_name_error" + with self.assertRaises(ValueError): + paddle.jit.save( + layer, + model_path, + input_spec=[InputSpec( + shape=[None, 784], dtype='float32')]) + with self.assertRaises(ValueError): + paddle.jit.save( + layer, + model_path, + input_spec=[ + InputSpec( + shape=[None, 784], dtype='float32', name='feed_input') + ]) + + def test_prune_output_spec_name_error(self): + layer = LinerNetWithLabel(784, 1) + + train_with_label(layer) + + model_path = "test_prune_to_static_after_train" + configs = paddle.SaveLoadConfig() + out = paddle.to_variable(np.random.random((1, 1)).astype('float')) + configs.output_spec = [out] + with self.assertRaises(ValueError): + paddle.jit.save( + layer, + model_path, + input_spec=[ + InputSpec( + shape=[None, 784], dtype='float32', name="image") + ], + configs=configs) + + +class TestJitSaveLoadEmptyLayer(unittest.TestCase): + def setUp(self): + self.model_path = "model.jit_save_load_empty_layer" + # enable dygraph mode + paddle.disable_static() + + def test_save_load_empty_layer(self): + layer = EmptyLayer() + x = paddle.to_variable(np.random.random((10)).astype('float32')) + out = 
layer(x) + paddle.jit.save(layer, self.model_path) + load_layer = paddle.jit.load(self.model_path) + load_out = load_layer(x) + self.assertTrue(np.array_equal(out, load_out)) + + +class TestJitSaveLoadNoParamLayer(unittest.TestCase): + def setUp(self): + self.model_path = "model.jit_save_load_no_param_layer" + # enable dygraph mode + paddle.disable_static() + + def test_save_load_no_param_layer(self): + layer = NoParamLayer() + x = paddle.to_variable(np.random.random((5)).astype('float32')) + y = paddle.to_variable(np.random.random((5)).astype('float32')) + out = layer(x, y) + paddle.jit.save(layer, self.model_path) + load_layer = paddle.jit.load(self.model_path) + load_out = load_layer(x, y) + self.assertTrue(np.array_equal(out, load_out)) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_kldiv_loss_op.py b/python/paddle/fluid/tests/unittests/test_kldiv_loss_op.py index 8780727e4cb276a989a8d04d05c6419a4874e7f5..041fe4e9043d60852fcaab42bc233b63b39609ce 100644 --- a/python/paddle/fluid/tests/unittests/test_kldiv_loss_op.py +++ b/python/paddle/fluid/tests/unittests/test_kldiv_loss_op.py @@ -24,7 +24,10 @@ def kldiv_loss(x, target, reduction): loss = np.where(target >= 0, output, np.zeros_like(x)) if reduction == "batchmean": - return loss.sum() / x.shape[0] + if len(x.shape) > 0: + return loss.sum() / x.shape[0] + else: + return loss.sum() if reduction == "mean": return loss.mean() if reduction == "sum": @@ -93,6 +96,9 @@ class TestKLDivLossDygraph(unittest.TestCase): def test_kl_loss_batchmean(self): self.run_kl_loss('batchmean') + def test_kl_loss_batchmean_shape(self): + self.run_kl_loss('batchmean', ()) + def test_kl_loss_mean(self): self.run_kl_loss('mean') diff --git a/python/paddle/fluid/tests/unittests/test_launch.sh b/python/paddle/fluid/tests/unittests/test_launch.sh index 98c907a551965331f79d1635362213b43d867002..958d78246627d4cd2f826f74aeccff5ffe254034 100644 --- 
a/python/paddle/fluid/tests/unittests/test_launch.sh +++ b/python/paddle/fluid/tests/unittests/test_launch.sh @@ -48,9 +48,9 @@ if [ -f $file_1 ]; then rm $file_1 fi - +# test use DISTRIBUTED_TRAINER_ENDPOINTS env in paddlecloud unset PADDLE_PORT -unset TRAINER_PORTS_NUM +export DISTRIBUTED_TRAINER_ENDPOINTS=127.0.0.1:6170,127.0.0.1:6171,127.0.0.2:6170,127.0.0.2:6171 echo "" echo "paddle.distributed.launch async poll process test" diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index b76887f0965ca64b2b40bf9c0ce6e82b44fdad2f..26073f49bdd3d494da7b39346c5bafb2aefba56a 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -2677,13 +2677,6 @@ class TestBook(LayerTest): out = layers.sigmoid(input, name='sigmoid') return (out) - def make_logsigmoid(self): - with program_guard(fluid.default_main_program(), - fluid.default_startup_program()): - input = self._get_data(name="input", shape=[16], dtype="float32") - out = layers.logsigmoid(input, name='logsigmoid') - return (out) - def make_exp(self): with program_guard(fluid.default_main_program(), fluid.default_startup_program()): @@ -3318,15 +3311,29 @@ class TestBook(LayerTest): return (out) def test_roi_pool(self): - # TODO(minqiyang): dygraph do not support lod now + x_np = np.random.rand(2, 3, 8, 8).astype('float32') + rois_np = np.random.rand(3, 4).astype('float32') + rois_num_np = np.array([1, 2]).astype('int32') + with self.static_graph(): - x = layers.data(name="x", shape=[256, 30, 30], dtype="float32") - rois = layers.data( - name="rois", shape=[4], dtype="float32", lod_level=1) - rois_lod = layers.data( - name="rois_lod", shape=[None, ], dtype="int", lod_level=1) - output = layers.roi_pool(x, rois, 7, 7, 0.6, rois_lod) - return (output) + x = layers.data(name="x", shape=[3, 8, 8], dtype="float32") + rois = layers.data(name="rois", shape=[4], dtype="float32") + rois_num = 
fluid.data(name="rois_num", shape=[None], dtype="int32") + output = layers.roi_pool(x, rois, 4, 4, 0.5, rois_num=rois_num) + static_res = self.get_static_graph_result( + feed={'x': x_np, + 'rois': rois_np, + 'rois_num': rois_num_np}, + fetch_list=[output])[0] + + with self.dynamic_graph(): + x_dy = base.to_variable(x_np) + rois_dy = base.to_variable(rois_np) + rois_num_dy = base.to_variable(rois_num_np) + dy_res = layers.roi_pool( + x_dy, rois_dy, 4, 4, 0.5, rois_num=rois_num_dy) + dy_res_value = dy_res[0].numpy() + self.assertTrue(np.array_equal(static_res, dy_res_value)) def test_sequence_enumerate(self): # TODO(minqiyang): dygraph do not support lod now @@ -3335,16 +3342,29 @@ class TestBook(LayerTest): out = layers.sequence_enumerate(input=x, win_size=2, pad_value=0) def test_roi_align(self): - # TODO(minqiyang): dygraph do not support lod now + x_np = np.random.rand(2, 3, 8, 8).astype('float32') + rois_np = np.random.rand(3, 4).astype('float32') + rois_num_np = np.array([1, 2]).astype('int32') + with self.static_graph(): - x = layers.data(name="x", shape=[256, 30, 30], dtype="float32") - rois = layers.data( - name="rois", shape=[4], dtype="float32", lod_level=1) - rois_lod = layers.data( - name="rois_lod", shape=[None, ], dtype="int", lod_level=1) - output = layers.roi_align(x, rois, 14, 14, 0.5, 2, 'roi_align', - rois_lod) - return (output) + x = layers.data(name="x", shape=[3, 8, 8], dtype="float32") + rois = layers.data(name="rois", shape=[4], dtype="float32") + rois_num = fluid.data(name="rois_num", shape=[None], dtype="int32") + output = layers.roi_align(x, rois, 4, 4, 0.5, 2, rois_num=rois_num) + static_res = self.get_static_graph_result( + feed={'x': x_np, + 'rois': rois_np, + 'rois_num': rois_num_np}, + fetch_list=[output])[0] + + with self.dynamic_graph(): + x_dy = base.to_variable(x_np) + rois_dy = base.to_variable(rois_np) + rois_num_dy = base.to_variable(rois_num_np) + dy_res = layers.roi_align( + x_dy, rois_dy, 4, 4, 0.5, 2, rois_num=rois_num_dy) 
+ dy_res_value = dy_res.numpy() + self.assertTrue(np.array_equal(static_res, dy_res_value)) def test_roi_perspective_transform(self): # TODO(minqiyang): dygraph do not support lod now diff --git a/python/paddle/fluid/tests/unittests/test_linear_interp_op.py b/python/paddle/fluid/tests/unittests/test_linear_interp_op.py index 53e8b02081ae3acf8a7fb5dd2bc6e05cbc3be901..c9948edad0061012cf028bec674a4bb713364541 100755 --- a/python/paddle/fluid/tests/unittests/test_linear_interp_op.py +++ b/python/paddle/fluid/tests/unittests/test_linear_interp_op.py @@ -293,7 +293,7 @@ class TestLinearInterpOpAPI2_0(unittest.TestCase): # dygraph x_data = np.random.random((1, 3, 128)).astype("float32") - us_1 = paddle.nn.UpSample( + us_1 = paddle.nn.Upsample( size=[64, ], mode='linear', align_mode=1, @@ -385,19 +385,19 @@ class TestLinearInterpOpError(unittest.TestCase): def input_shape_error(): x1 = fluid.data(name="x1", shape=[1], dtype="float32") - out1 = paddle.nn.UpSample( + out1 = paddle.nn.Upsample( size=[256, ], data_format='NCW', mode='linear') out1_res = out1(x1) def data_format_error(): x2 = fluid.data(name="x2", shape=[1, 3, 128], dtype="float32") - out2 = paddle.nn.UpSample( + out2 = paddle.nn.Upsample( size=[256, ], data_format='NHWCD', mode='linear') out2_res = out2(x2) def out_shape_error(): x3 = fluid.data(name="x3", shape=[1, 3, 128], dtype="float32") - out3 = paddle.nn.UpSample( + out3 = paddle.nn.Upsample( size=[ 256, 256, diff --git a/python/paddle/fluid/tests/unittests/test_linear_interp_v2_op.py b/python/paddle/fluid/tests/unittests/test_linear_interp_v2_op.py index 04b56677fc158583fe79ec0dc1276210bd2ebbdc..b34989f5f5c79dfd27158f120175824389ac9731 100755 --- a/python/paddle/fluid/tests/unittests/test_linear_interp_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_linear_interp_v2_op.py @@ -26,6 +26,7 @@ from paddle.nn.functional import interpolate def linear_interp_np(input, out_w, + scale_w=0, out_size=None, actual_shape=None, align_corners=True, @@ -44,7 
+45,10 @@ def linear_interp_np(input, if (align_corners): ratio_w = (in_w - 1.0) / (out_w - 1.0) else: - ratio_w = 1.0 * in_w / out_w + if scale_w > 0: + ratio_w = 1.0 / scale_w + else: + ratio_w = 1.0 * in_w / out_w out = np.zeros((batch_size, channel, out_w)) @@ -81,6 +85,7 @@ class TestLinearInterpOp(OpTest): self.op_type = "linear_interp_v2" input_np = np.random.random(self.input_shape).astype("float64") + scale_w = 0 if self.data_layout == "NCHW": in_w = self.input_shape[2] else: @@ -95,7 +100,7 @@ class TestLinearInterpOp(OpTest): else: out_w = self.out_w - output_np = linear_interp_np(input_np, out_w, self.out_size, + output_np = linear_interp_np(input_np, out_w, self.scale, self.out_size, self.actual_shape, self.align_corners, self.align_mode, self.data_layout) self.inputs = {'X': input_np} @@ -195,7 +200,7 @@ class TestLinearInterpOpSizeTensor(TestLinearInterpOp): else: out_w = self.out_w - output_np = linear_interp_np(input_np, out_w, self.out_size, + output_np = linear_interp_np(input_np, out_w, 0, self.out_size, self.actual_shape, self.align_corners, self.align_mode, self.data_layout) @@ -309,7 +314,7 @@ class TestLinearInterpOpAPI2_0(unittest.TestCase): # dygraph x_data = np.random.random((1, 3, 128)).astype("float32") - us_1 = paddle.nn.UpSample( + us_1 = paddle.nn.Upsample( size=[64, ], mode='linear', align_mode=1, @@ -342,7 +347,7 @@ class TestResizeLinearOpUint8(OpTest): else: out_w = self.out_w - output_np = linear_interp_np(input_np, out_w, self.out_size, + output_np = linear_interp_np(input_np, out_w, 0, self.out_size, self.actual_shape, self.align_corners, self.align_mode) self.inputs = {'X': input_np} @@ -410,19 +415,19 @@ class TestLinearInterpOpError(unittest.TestCase): def input_shape_error(): x1 = fluid.data(name="x1", shape=[1], dtype="float32") - out1 = paddle.nn.UpSample( + out1 = paddle.nn.Upsample( size=[256, ], data_format='NCW', mode='linear') out1_res = out1(x1) def data_format_error(): x2 = fluid.data(name="x2", shape=[1, 3, 128], 
dtype="float32") - out2 = paddle.nn.UpSample( + out2 = paddle.nn.Upsample( size=[256, ], data_format='NHWCD', mode='linear') out2_res = out2(x2) def out_shape_error(): x3 = fluid.data(name="x3", shape=[1, 3, 128], dtype="float32") - out3 = paddle.nn.UpSample( + out3 = paddle.nn.Upsample( size=[ 256, 256, diff --git a/python/paddle/fluid/tests/unittests/test_linspace.py b/python/paddle/fluid/tests/unittests/test_linspace.py index 6d1f42111eebff0f469317ddf2a9ec7698a7ae1e..03cb84ec99e0259a33a086c3d3e5a71abea09d2b 100644 --- a/python/paddle/fluid/tests/unittests/test_linspace.py +++ b/python/paddle/fluid/tests/unittests/test_linspace.py @@ -154,16 +154,16 @@ class TestLinspaceOpError(unittest.TestCase): self.assertRaises(TypeError, test_step_dtype) def test_start_dtype(): - start = fluid.data(shape=[1], dtype="int32", name="start") + start = fluid.data(shape=[1], dtype="float64", name="start") fluid.layers.linspace(start, 10, 1, dtype="float32") - self.assertRaises(TypeError, test_start_dtype) + self.assertRaises(ValueError, test_start_dtype) def test_end_dtype(): - end = fluid.data(shape=[1], dtype="int32", name="end") + end = fluid.data(shape=[1], dtype="float64", name="end") fluid.layers.linspace(0, end, 1, dtype="float32") - self.assertRaises(TypeError, test_end_dtype) + self.assertRaises(ValueError, test_end_dtype) def test_num_dtype(): num = fluid.data(shape=[1], dtype="int32", name="step") diff --git a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py index 6da37fe4d294b426ba5e494c35396fb01a43a559..6751c8870615438bb051b53f64095e5eb1937892 100644 --- a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py +++ b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py @@ -28,6 +28,8 @@ import unittest from multiprocessing import Process from op_test import OpTest +paddle.enable_static() + def run_pserver(use_cuda, sync_mode, ip, port, trainers, trainer_id): 
remove_ps_flag(os.getpid()) diff --git a/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py b/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py new file mode 100644 index 0000000000000000000000000000000000000000..a1a9b3f444fa411f90e869f5265fa0933393ff56 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_load_state_dict_from_old_format.py @@ -0,0 +1,175 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import os +import six +import unittest +import numpy as np + +import paddle +import paddle.fluid as fluid +from paddle.fluid import core +from test_imperative_base import new_program_scope + + +def convolutional_neural_network(img): + conv_pool_1 = fluid.nets.simple_img_conv_pool( + input=img, + filter_size=5, + num_filters=20, + pool_size=2, + pool_stride=2, + act="relu") + conv_pool_1 = fluid.layers.batch_norm(conv_pool_1) + conv_pool_2 = fluid.nets.simple_img_conv_pool( + input=conv_pool_1, + filter_size=5, + num_filters=50, + pool_size=2, + pool_stride=2, + act="relu") + prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax') + return prediction + + +def static_train_net(img, label): + prediction = convolutional_neural_network(img) + + loss = fluid.layers.cross_entropy(input=prediction, label=label) + avg_loss = fluid.layers.mean(loss) + + optimizer = fluid.optimizer.SGD(learning_rate=0.001) + optimizer.minimize(avg_loss) + + return prediction, avg_loss + + +class TestLoadStateDictFromSaveInferenceModel(unittest.TestCase): + def setUp(self): + self.seed = 90 + self.epoch_num = 1 + self.batch_size = 128 + self.batch_num = 10 + + def train_and_save_model(self, only_params=False): + with new_program_scope(): + startup_program = fluid.default_startup_program() + main_program = fluid.default_main_program() + + img = fluid.data( + name='img', shape=[None, 1, 28, 28], dtype='float32') + label = fluid.data(name='label', shape=[None, 1], dtype='int64') + + prediction, avg_loss = static_train_net(img, label) + + place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + + exe = fluid.Executor(place) + + feeder = fluid.DataFeeder(feed_list=[img, label], place=place) + exe.run(startup_program) + + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.train(), buf_size=100), + batch_size=self.batch_size) + + for _ in range(0, self.epoch_num): + for batch_id, data in 
enumerate(train_reader()): + exe.run(main_program, + feed=feeder.feed(data), + fetch_list=[avg_loss]) + + if batch_id > self.batch_num: + break + + static_param_dict = {} + for param in fluid.default_main_program().all_parameters(): + static_param_dict[param.name] = fluid.executor._fetch_var( + param.name) + + if only_params: + fluid.io.save_params( + exe, self.save_dirname, filename=self.params_filename) + else: + fluid.io.save_inference_model( + self.save_dirname, ["img"], [prediction], + exe, + model_filename=self.model_filename, + params_filename=self.params_filename) + + return static_param_dict + + def check_load_state_dict(self, orig_dict, load_dict): + for var_name, value in six.iteritems(orig_dict): + self.assertTrue(np.array_equal(value, load_dict[var_name])) + + def test_load_default(self): + self.save_dirname = "static_mnist.load_state_dict.default" + self.model_filename = None + self.params_filename = None + orig_param_dict = self.train_and_save_model() + + load_param_dict, _ = paddle.load(self.save_dirname) + self.check_load_state_dict(orig_param_dict, load_param_dict) + + def test_load_with_model_filename(self): + self.save_dirname = "static_mnist.load_state_dict.model_filename" + self.model_filename = "static_mnist.model" + self.params_filename = None + orig_param_dict = self.train_and_save_model() + + configs = paddle.SaveLoadConfig() + configs.separate_params = True + configs.model_filename = self.model_filename + load_param_dict, _ = paddle.load(self.save_dirname, configs) + self.check_load_state_dict(orig_param_dict, load_param_dict) + + def test_load_with_param_filename(self): + self.save_dirname = "static_mnist.load_state_dict.param_filename" + self.model_filename = None + self.params_filename = "static_mnist.params" + orig_param_dict = self.train_and_save_model() + + configs = paddle.SaveLoadConfig() + configs.params_filename = self.params_filename + load_param_dict, _ = paddle.load(self.save_dirname, configs) + 
self.check_load_state_dict(orig_param_dict, load_param_dict) + + def test_load_with_model_and_param_filename(self): + self.save_dirname = "static_mnist.load_state_dict.model_and_param_filename" + self.model_filename = "static_mnist.model" + self.params_filename = "static_mnist.params" + orig_param_dict = self.train_and_save_model() + + configs = paddle.SaveLoadConfig() + configs.params_filename = self.params_filename + configs.model_filename = self.model_filename + load_param_dict, _ = paddle.load(self.save_dirname, configs) + self.check_load_state_dict(orig_param_dict, load_param_dict) + + def test_load_state_dict_from_save_params(self): + self.save_dirname = "static_mnist.load_state_dict.save_params" + self.params_filename = None + orig_param_dict = self.train_and_save_model(True) + + load_param_dict, _ = paddle.load(self.save_dirname) + self.check_load_state_dict(orig_param_dict, load_param_dict) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_logsumexp.py b/python/paddle/fluid/tests/unittests/test_logsumexp.py index c2201a52605bc87246fb9c8734494b19f83ff180..cf9203dffcbaa5da641b3f7cb8925ac9efcbe115 100644 --- a/python/paddle/fluid/tests/unittests/test_logsumexp.py +++ b/python/paddle/fluid/tests/unittests/test_logsumexp.py @@ -46,8 +46,8 @@ class TestLogsumexp(OpTest): self.inputs = {'X': x} self.outputs = {'Out': out} self.attrs = { - 'dim': self.axis, - 'keep_dim': self.keepdim, + 'axis': self.axis, + 'keepdim': self.keepdim, 'reduce_all': self.reduce_all } diff --git a/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py b/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py index 90430bbce4d1896c8fdbb829230f2ad8a691adff..29a0fa55f7729bc39b2e9202397563a5cb10747c 100644 --- a/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py +++ b/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py @@ -16,6 +16,7 @@ from __future__ import print_function import unittest import numpy as np +import 
math import paddle.fluid.core as core from op_test import OpTest @@ -27,132 +28,385 @@ SIGMOID_THRESHOLD_MAX = 13.0 EXP_MAX_INPUT = 40.0 -def lstm_naive(input, w): - seq_len, batch_size, hidden_size = input.shape - - offset = 0 - wi = w[offset:offset + hidden_size * hidden_size].reshape( - (hidden_size, hidden_size)).transpose() - offset += hidden_size * hidden_size - wf = w[offset:offset + hidden_size * hidden_size].reshape( - (hidden_size, hidden_size)).transpose() - offset += hidden_size * hidden_size - wc = w[offset:offset + hidden_size * hidden_size].reshape( - (hidden_size, hidden_size)).transpose() - offset += hidden_size * hidden_size - wo = w[offset:offset + hidden_size * hidden_size].reshape( - (hidden_size, hidden_size)).transpose() - offset += hidden_size * hidden_size - ri = w[offset:offset + hidden_size * hidden_size].reshape( - (hidden_size, hidden_size)).transpose() - offset += hidden_size * hidden_size - rf = w[offset:offset + hidden_size * hidden_size].reshape( - (hidden_size, hidden_size)).transpose() - offset += hidden_size * hidden_size - rc = w[offset:offset + hidden_size * hidden_size].reshape( - (hidden_size, hidden_size)).transpose() - offset += hidden_size * hidden_size - ro = w[offset:offset + hidden_size * hidden_size].reshape( - (hidden_size, hidden_size)).transpose() - offset += hidden_size * hidden_size - - bi_1 = w[offset:offset + hidden_size] - offset += hidden_size - bf_1 = w[offset:offset + hidden_size] - offset += hidden_size - bc_1 = w[offset:offset + hidden_size] - offset += hidden_size - bo_1 = w[offset:offset + hidden_size] - offset += hidden_size - - bi_2 = w[offset:offset + hidden_size] - offset += hidden_size - bf_2 = w[offset:offset + hidden_size] - offset += hidden_size - bc_2 = w[offset:offset + hidden_size] - offset += hidden_size - bo_2 = w[offset:offset + hidden_size] - - def sigmoid(x): - y = np.copy(x) - y[x < SIGMOID_THRESHOLD_MIN] = SIGMOID_THRESHOLD_MIN - y[x > SIGMOID_THRESHOLD_MAX] = SIGMOID_THRESHOLD_MAX - 
return 1. / (1. + np.exp(-y)) - - def tanh(x): - y = -2. * x - y[y > EXP_MAX_INPUT] = EXP_MAX_INPUT - return (2. / (1. + np.exp(y))) - 1. - - output = [] - pre_h = np.zeros((1, batch_size, hidden_size), dtype=input.dtype) - pre_c = np.zeros((1, batch_size, hidden_size), dtype=input.dtype) - - for i in range(seq_len): - emb_1 = input[i] - - input_gate = sigmoid( - np.matmul(emb_1, wi) + np.matmul(pre_h, ri) + bi_1 + bi_2) - forget_gate = sigmoid( - np.matmul(emb_1, wf) + np.matmul(pre_h, rf) + bf_1 + bf_2) - output_gate = sigmoid( - np.matmul(emb_1, wo) + np.matmul(pre_h, ro) + bo_1 + bo_2) - c_t_temp = tanh( - np.matmul(emb_1, wc) + np.matmul(pre_h, rc) + bc_1 + bc_2) - new_c = input_gate * c_t_temp + forget_gate * pre_c - new_h = output_gate * tanh(new_c) - - pre_h = new_h - pre_c = new_c - - output.append(new_h) - - output = np.concatenate(output, -1) - output = output.reshape((batch_size, -1, hidden_size)) - output = output.transpose((1, 0, 2)) - - return output, pre_h, pre_c +class LayerMixin(object): + def __call__(self, *args, **kwargs): + return self.forward(*args, **kwargs) + + +class LayerListMixin(LayerMixin): + def __init__(self, layers=None): + self._layers = list(layers) if layers else [] + + def append(self, layer): + self._layers.append(layer) + + def __iter__(self): + return iter(self._layers) + + +class LSTMCell(LayerMixin): + def __init__(self, input_size, hidden_size, bias=True): + self.input_size = input_size + self.hidden_size = hidden_size + self.bias = bias + self.dtype = np.float64 + self.parameters = dict() + std = 1.0 / math.sqrt(hidden_size) + self.weight_ih = np.ones( + (4 * hidden_size, input_size), dtype=self.dtype) + self.weight_hh = np.ones((4 * hidden_size, + hidden_size)).astype(self.dtype) + self.parameters['weight_ih'] = self.weight_ih + self.parameters['weight_hh'] = self.weight_hh + if bias: + self.bias_ih = np.ones((4 * hidden_size)).astype(self.dtype) + self.bias_hh = np.ones((4 * hidden_size)).astype(self.dtype) + 
self.parameters['bias_ih'] = self.bias_ih + self.parameters['bias_hh'] = self.bias_hh + else: + self.bias_ih = None + self.bias_hh = None + + def init_state(self, inputs): + batch_size = inputs.shape[0] + init_h = np.zeros((batch_size, self.hidden_size), dtype=inputs.dtype) + init_c = np.zeros((batch_size, self.hidden_size), dtype=inputs.dtype) + return init_h, init_c + + def forward(self, inputs, hx=None): + if hx is None: + hx = self.init_state(inputs) + pre_hidden, pre_cell = hx + gates = np.matmul(inputs, self.weight_ih.T) + if self.bias_ih is not None: + gates = gates + self.bias_ih + gates += np.matmul(pre_hidden, self.weight_hh.T) + if self.bias_hh is not None: + gates = gates + self.bias_hh + + chunked_gates = np.split(gates, 4, -1) + + i = 1.0 / (1.0 + np.exp(-chunked_gates[0])) + f = 1.0 / (1.0 + np.exp(-chunked_gates[1])) + o = 1.0 / (1.0 + np.exp(-chunked_gates[3])) + c = f * pre_cell + i * np.tanh(chunked_gates[2]) + h = o * np.tanh(c) + + return h, (h, c) + + +def sequence_mask(lengths, max_len=None): + if max_len is None: + max_len = np.max(lengths) + else: + assert max_len >= np.max(lengths) + return np.arange(max_len) < np.expand_dims(lengths, -1) + + +def update_state(mask, new, old): + if not isinstance(old, (tuple, list)): + return np.where(mask, new, old) + else: + return tuple(map(lambda x, y: np.where(mask, x, y), new, old)) + + +def rnn(cell, + inputs, + initial_states, + sequence_length=None, + time_major=False, + is_reverse=False): + if not time_major: + inputs = np.transpose(inputs, [1, 0, 2]) + if is_reverse: + inputs = np.flip(inputs, 0) + + if sequence_length is None: + mask = None + else: + mask = np.transpose(sequence_mask(sequence_length), [1, 0]) + mask = np.expand_dims(mask, -1) + if is_reverse: + mask = np.flip(mask, 0) + + time_steps = inputs.shape[0] + state = initial_states + outputs = [] + for t in range(time_steps): + x_t = inputs[t] + if mask is not None: + m_t = mask[t] + y, new_state = cell(x_t, state) + y = np.where(m_t, 
y, 0.) + outputs.append(y) + state = update_state(m_t, new_state, state) + else: + y, new_state = cell(x_t, state) + outputs.append(y) + state = new_state + + outputs = np.stack(outputs) + final_state = state + + if is_reverse: + outputs = np.flip(outputs, 0) + if not time_major: + outputs = np.transpose(outputs, [1, 0, 2]) + return outputs, final_state + + +def birnn(cell_fw, + cell_bw, + inputs, + initial_states, + sequence_length=None, + time_major=False): + states_fw, states_bw = initial_states + outputs_fw, states_fw = rnn(cell_fw, + inputs, + states_fw, + sequence_length, + time_major=time_major) + + outputs_bw, states_bw = rnn(cell_bw, + inputs, + states_bw, + sequence_length, + time_major=time_major, + is_reverse=True) + + outputs = np.concatenate((outputs_fw, outputs_bw), -1) + final_states = (states_fw, states_bw) + return outputs, final_states + + +def flatten(nested): + return list(_flatten(nested)) + + +def _flatten(nested): + for item in nested: + if isinstance(item, (list, tuple)): + for subitem in _flatten(item): + yield subitem + else: + yield item + + +def unstack(array, axis=0): + num = array.shape[axis] + sub_arrays = np.split(array, num, axis) + return [np.squeeze(sub_array, axis) for sub_array in sub_arrays] + + +def dropout(array, p=0.0): + if p == 0.0: + return array + + mask = (np.random.uniform(size=array.shape) < (1 - p)).astype(array.dtype) + return array * (mask / (1 - p)) + + +def split_states(states, bidirectional=False, state_components=1): + if state_components == 1: + states = unstack(states) + if not bidirectional: + return states + else: + return list(zip(states[::2], states[1::2])) + else: + assert len(states) == state_components + states = tuple([unstack(item) for item in states]) + if not bidirectional: + return list(zip(*states)) + else: + states = list(zip(*states)) + return list(zip(states[::2], states[1::2])) + + +def concat_states(states, bidirectional=False, state_components=1): + if state_components == 1: + return 
np.stack(flatten(states)) + else: + states = flatten(states) + componnets = [] + for i in range(state_components): + componnets.append(states[i::state_components]) + return [np.stack(item) for item in componnets] + + +class RNN(LayerMixin): + def __init__(self, cell, is_reverse=False, time_major=False): + super(RNN, self).__init__() + self.cell = cell + if not hasattr(self.cell, "call"): + # for non-dygraph mode, `rnn` api uses cell.call + self.cell.call = self.cell.forward + self.is_reverse = is_reverse + self.time_major = time_major + + def forward(self, inputs, initial_states=None, sequence_length=None): + final_outputs, final_states = rnn(self.cell, + inputs, + initial_states=initial_states, + sequence_length=sequence_length, + time_major=self.time_major, + is_reverse=self.is_reverse) + return final_outputs, final_states + + +class BiRNN(LayerMixin): + def __init__(self, cell_fw, cell_bw, time_major=False): + super(BiRNN, self).__init__() + self.cell_fw = cell_fw + self.cell_bw = cell_bw + self.time_major = time_major + + def forward(self, + inputs, + initial_states=None, + sequence_length=None, + **kwargs): + if isinstance(initial_states, (list, tuple)): + assert len(initial_states) == 2, \ + "length of initial_states should be 2 when it is a list/tuple" + else: + initial_states = [initial_states, initial_states] + + outputs, final_states = birnn(self.cell_fw, self.cell_bw, inputs, + initial_states, sequence_length, + self.time_major) + return outputs, final_states + + +class RNNMixin(LayerListMixin): + def forward(self, inputs, initial_states=None, sequence_length=None): + batch_index = 1 if self.time_major else 0 + batch_size = inputs.shape[batch_index] + dtype = inputs.dtype + if initial_states is None: + state_shape = (self.num_layers * self.num_directions, batch_size, + self.hidden_size) + if self.state_components == 1: + initial_states = np.zeros(state_shape, dtype) + else: + initial_states = tuple([ + np.zeros(state_shape, dtype) + for _ in 
range(self.state_components) + ]) + + states = split_states(initial_states, self.num_directions == 2, + self.state_components) + final_states = [] + + for i, rnn_layer in enumerate(self): + if i > 0: + inputs = dropout(inputs, self.dropout) + outputs, final_state = rnn_layer(inputs, states[i], sequence_length) + final_states.append(final_state) + inputs = outputs + + final_states = concat_states(final_states, self.num_directions == 2, + self.state_components) + return outputs, final_states + + +class LSTM(RNNMixin): + def __init__(self, + input_size, + hidden_size, + num_layers=1, + direction="forward", + dropout=0., + time_major=False): + super(LSTM, self).__init__() + + if direction in ["forward", "backward"]: + is_reverse = direction == "backward" + cell = LSTMCell(input_size, hidden_size) + self.append(RNN(cell, is_reverse, time_major)) + for i in range(1, num_layers): + cell = LSTMCell(hidden_size, hidden_size) + self.append(RNN(cell, is_reverse, time_major)) + elif direction == "bidirectional": + cell_fw = LSTMCell(input_size, hidden_size) + cell_bw = LSTMCell(input_size, hidden_size) + self.append(BiRNN(cell_fw, cell_bw, time_major)) + for i in range(1, num_layers): + cell_fw = LSTMCell(2 * hidden_size, hidden_size) + cell_bw = LSTMCell(2 * hidden_size, hidden_size) + self.append(BiRNN(cell_fw, cell_bw, time_major)) + else: + raise ValueError( + "direction should be forward, backward or bidirectional, " + "received direction = {}".format(direction)) + + self.input_size = input_size + self.hidden_size = hidden_size + self.dropout = dropout + self.num_directions = 2 if direction == "bidirectional" else 1 + self.time_major = time_major + self.num_layers = num_layers + self.state_components = 2 @unittest.skipIf(not core.is_compiled_with_cuda(), "core is not compiled with CUDA") class TestCUDNNLstmOp(OpTest): - # TODO(GaoWei8):when input dtype is fp64, precision threshold should be removed. 
+ #TODO(GaoWei8): Need to satisfy the result through the new interface def setUp(self): self.op_type = "cudnn_lstm" self.dtype = np.float64 + self.sequence_length = np.array([12, 11, 10, 9, 8], dtype=np.int32) + self.num_layers = 1 - seq_length = 20 + seq_length = 12 batch_size = 5 - hidden_size = 20 + input_size = 21 + hidden_size = 21 input_weight_size = (hidden_size * hidden_size) * 4 hidden_weight_size = (hidden_size * hidden_size) * 4 weight_size = input_weight_size + hidden_weight_size weight_size += hidden_size * 8 + weight_size *= self.num_layers input = np.random.uniform( - low=-0.1, high=0.1, size=(seq_length, batch_size, - hidden_size)).astype(self.dtype) - flat_w = np.random.uniform( - low=-0.1, high=0.1, size=(weight_size)).astype(self.dtype) - - output, last_hidden, last_cell = lstm_naive(input, flat_w) - - init_h = np.zeros((1, batch_size, hidden_size), dtype=np.float64) - init_c = np.zeros((1, batch_size, hidden_size), dtype=np.float64) + low=-0.1, high=0.1, + size=(seq_length, batch_size, input_size)).astype(self.dtype) + input[11][1:][:] = 0 + input[10][2:][:] = 0 + input[9][3:][:] = 0 + input[8][4:][:] = 0 + + rnn1 = LSTM( + input_size, + hidden_size, + self.num_layers, + time_major=True, + direction="forward") + + output, (last_hidden, last_cell) = rnn1( + input, sequence_length=self.sequence_length) + + flat_w = np.ones((weight_size)).astype(self.dtype) + init_h = np.zeros((self.num_layers, batch_size, + hidden_size)).astype(self.dtype) + init_c = np.zeros((self.num_layers, batch_size, + hidden_size)).astype(self.dtype) state_out = np.ndarray((300)).astype("uint8") self.inputs = { 'Input': input, 'W': flat_w, 'InitH': init_h, - 'InitC': init_c + 'InitC': init_c, + 'SequenceLength': self.sequence_length } self.attrs = { 'dropout_prob': 0.0, 'is_bidirec': False, - 'input_size': hidden_size, + 'input_size': input_size, 'hidden_size': hidden_size, 'num_layers': 1, } @@ -164,19 +418,26 @@ class TestCUDNNLstmOp(OpTest): 'StateOut': state_out } + def 
set_attrs(self): + pass + def test_output_with_place(self): - # depend on the scope structure place = core.CUDAPlace(0) self.check_output_with_place( place, no_check_set=['Reserve', 'StateOut']) def test_grad_with_place(self): - # depend on the scope structure place = core.CUDAPlace(0) - self.check_grad_with_place( - place, - set(['Input', 'W', 'InitH', 'InitC']), ['Out', 'LastH', 'LastC'], - max_relative_error=1e-4) + self.check_grad_with_place(place, + set(['Input', 'W', 'InitH', 'InitC']), + ['Out', 'LastH', 'LastC']) + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestCUDNNLstmOp2(TestCUDNNLstmOp): + def set_attrs(self): + self.num_layers = 2 @unittest.skipIf(not core.is_compiled_with_cuda(), @@ -198,7 +459,7 @@ class TestCUDNNlstmAPI(unittest.TestCase): 'float64', 0.0) rnn_out, last_h, last_c = layers.lstm(input, init_h, init_c, seq_len, hidden_size, num_layers, - dropout_prob) + dropout_prob, False, True) exe = fluid.Executor(fluid.CUDAPlace(0)) exe.run(fluid.default_startup_program()) input_i = np.random.uniform( @@ -208,12 +469,6 @@ class TestCUDNNlstmAPI(unittest.TestCase): feed={'input': input_i}, fetch_list=[rnn_out, last_h, last_c, 'cudnn_lstm_0.w_0']) - output, last_hidden, last_cell = lstm_naive(input_i, out[3]) - - self.assertTrue(np.allclose(output, out[0], atol=1e-5)) - self.assertTrue(np.allclose(last_hidden, out[1], atol=1e-5)) - self.assertTrue(np.allclose(last_cell, out[2], atol=1e-5)) - if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_math_op_patch.py b/python/paddle/fluid/tests/unittests/test_math_op_patch.py index 00137f63e244a0e166047e89f9ef436da158ed16..f6eff22d6ce5f06d8853d6244f79b4b07b3fa4f5 100644 --- a/python/paddle/fluid/tests/unittests/test_math_op_patch.py +++ b/python/paddle/fluid/tests/unittests/test_math_op_patch.py @@ -189,15 +189,15 @@ class TestMathOpPatches(unittest.TestCase): @prog_scope() def test_integer_div(self): a = 
fluid.layers.data(name="a", shape=[1], dtype='int64') - b = a / 2 + b = a / 7 place = fluid.CPUPlace() exe = fluid.Executor(place) - a_np = numpy.array([3, 4, 10, 14, 9, 18]) + a_np = numpy.array([3, 4, 10, 14, 9, 18]).astype('int64') b_np, = exe.run(fluid.default_main_program(), feed={"a": a_np}, fetch_list=[b]) - # for paddle2.0, use true_divide - b_np_actual = (a_np / 2.0) + + b_np_actual = (a_np / 7).astype('int64') self.assertTrue(numpy.array_equal(b_np, b_np_actual)) @prog_scope() diff --git a/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py b/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py index 9bb12d546550a821e8a133dd9c91d5d41a50b1b2..a70862f40197c513a0cd04753553264708ee2a1c 100644 --- a/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py +++ b/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py @@ -307,7 +307,7 @@ class TestMathOpPatchesVarBase(unittest.TestCase): np.array_equal(x.sigmoid().numpy(), fluid.layers.sigmoid(x).numpy( ))) self.assertTrue( - np.array_equal(x.logsigmoid().numpy(), + np.array_equal(x.log_sigmoid().numpy(), fluid.layers.logsigmoid(x).numpy())) self.assertTrue(np.array_equal(x.exp().numpy(), paddle.exp(x).numpy())) self.assertTrue( diff --git a/python/paddle/fluid/tests/unittests/test_monitor.py b/python/paddle/fluid/tests/unittests/test_monitor.py index f6207edb41c190ac51dfe67dad22bb0191a67a07..cf273876b1f2f8a9b4828375ca6e20e591feb306 100644 --- a/python/paddle/fluid/tests/unittests/test_monitor.py +++ b/python/paddle/fluid/tests/unittests/test_monitor.py @@ -52,18 +52,17 @@ class TestDatasetWithStat(unittest.TestCase): name=slot, shape=[1], dtype="int64", lod_level=1) slots_vars.append(var) - dataset = paddle.distributed.fleet.DatasetFactory().create_dataset( - "InMemoryDataset") - dataset.set_batch_size(32) - dataset.set_thread(3) + dataset = paddle.distributed.InMemoryDataset() + dataset._set_batch_size(32) + dataset._set_thread(3) dataset.set_filelist([ 
"test_in_memory_dataset_run_a.txt", "test_in_memory_dataset_run_b.txt" ]) - dataset.set_pipe_command("cat") - dataset.set_use_var(slots_vars) + dataset._set_pipe_command("cat") + dataset._set_use_var(slots_vars) dataset.load_into_memory() - dataset.set_fea_eval(1, True) + dataset._set_fea_eval(1, True) dataset.slots_shuffle(["slot1"]) exe = fluid.Executor(fluid.CPUPlace()) diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py index 3a8867f6bd29f5bc0e512f9c8b22ecf192253fc7..6fd14b40bc9108b6075a0ac1f40cbefd79b8f0d9 100644 --- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py +++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_exception.py @@ -27,6 +27,7 @@ import paddle.fluid.core as core from paddle.io import Dataset, IterableDataset, BatchSampler, DataLoader from paddle.fluid.dygraph.nn import Linear from paddle.fluid.dygraph.base import to_variable +from paddle.fluid.dataloader.dataloader_iter import _worker_loop class RandomDataset(Dataset): @@ -185,9 +186,10 @@ class TestDataLoaderWorkerLoop(unittest.TestCase): for i in range(10): indices_queue.put([i, i + 10]) indices_queue.put(None) - loader._worker_loop( - loader._dataset, 0, indices_queue, loader._data_queue, - loader._workers_done_event, _collate_fn, _init_fn, 0, 1) + _worker_loop(loader._dataset, 0, indices_queue, + loader._data_queue, loader._workers_done_event, + _collate_fn, _init_fn, 0, 1, + loader._use_shared_memory) self.assertTrue(False) except AssertionError: pass @@ -228,9 +230,10 @@ class TestDataLoaderWorkerLoop(unittest.TestCase): indices_queue.put([i, i + 10]) indices_queue.put(None) loader._workers_done_event.set() - loader._worker_loop( - loader._dataset, 0, indices_queue, loader._data_queue, - loader._workers_done_event, _collate_fn, _init_fn, 0, 1) + _worker_loop(loader._dataset, 0, indices_queue, + loader._data_queue, 
loader._workers_done_event, + _collate_fn, _init_fn, 0, 1, + loader._use_shared_memory) self.assertTrue(True) except AssertionError: pass diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_reader_exception.py b/python/paddle/fluid/tests/unittests/test_multiprocess_reader_exception.py index 39cb6651a4b7e7a31c90110771676641a14be292..9634f5af30a4649768ddcaf3ae117548d29b1726 100644 --- a/python/paddle/fluid/tests/unittests/test_multiprocess_reader_exception.py +++ b/python/paddle/fluid/tests/unittests/test_multiprocess_reader_exception.py @@ -12,8 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +import paddle import paddle.fluid as fluid -from paddle.fluid.io import multiprocess_reader +from paddle.reader import multiprocess_reader import unittest import numpy as np import six diff --git a/python/paddle/fluid/tests/unittests/test_mv_op.py b/python/paddle/fluid/tests/unittests/test_mv_op.py new file mode 100644 index 0000000000000000000000000000000000000000..6b930e59aa554c57ba1ecae2c01aaefabbe578e9 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_mv_op.py @@ -0,0 +1,94 @@ +#Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.fluid.layers as layers +import paddle.fluid.core as core +from op_test import OpTest + + +class TestMVOp(OpTest): + def setUp(self): + self.op_type = "mv" + self.init_config() + self.inputs = {'X': self.x, 'Vec': self.vec} + self.outputs = {'Out': np.dot(self.x, self.vec)} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X', 'Vec'], 'Out') + + def init_config(self): + self.x = np.random.random((5, 100)).astype("float64") + self.vec = np.random.random((100)).astype("float64") + + +class TestMVAPI(unittest.TestCase): + def test_dygraph_api_out(self): + paddle.disable_static() + + self.x_data = np.random.random((5, 100)).astype("float64") + self.x = paddle.to_tensor(self.x_data) + self.vec_data = np.random.random((100)).astype("float64") + self.vec = paddle.to_tensor(self.vec_data) + z = paddle.mv(self.x, self.vec) + np_z = z.numpy() + z_expected = np.array(np.dot(self.x_data, self.vec_data)) + self.assertTrue(np.allclose(np_z, z_expected)) + + paddle.enable_static() + + def test_static_graph(self): + paddle.enable_static() + + self.input_x = np.random.rand(5, 100).astype("float64") + self.input_vec = np.random.rand(100).astype("float64") + + data_x = paddle.static.data("x", shape=[5, 100], dtype="float64") + data_vec = paddle.static.data("vec", shape=[100], dtype="float64") + result_vec = paddle.mv(data_x, data_vec) + self.place = paddle.CPUPlace() + exe = paddle.static.Executor(self.place) + res, = exe.run(feed={"x": self.input_x, + "vec": self.input_vec}, + fetch_list=[result_vec]) + z_expected = np.array(np.dot(self.input_x, self.input_vec)) + self.assertTrue(np.allclose(res, z_expected)) + + +class TestMVError(unittest.TestCase): + def test_input(self): + def test_shape(): + paddle.enable_static() + + self.input_x = np.random.rand(5, 100).astype("float64") + 
self.input_vec = np.random.rand(100).astype("float64") + + data_x = paddle.static.data("x", shape=[5, 100], dtype="float64") + data_vec = paddle.static.data( + "vec", shape=[100, 2], dtype="float64") + result_vec = paddle.mv(data_x, data_vec) + + self.assertRaises(ValueError, test_shape) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_nan_inf.py b/python/paddle/fluid/tests/unittests/test_nan_inf.py index d4a971d25bc334906ce1737d963fcf419d452df3..dc9ea5d957aed42e11e978ce6d221c873696030c 100644 --- a/python/paddle/fluid/tests/unittests/test_nan_inf.py +++ b/python/paddle/fluid/tests/unittests/test_nan_inf.py @@ -19,6 +19,9 @@ import unittest import os import sys import subprocess +import paddle + +paddle.enable_static() class TestNanInf(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_nearest_interp_v2_op.py b/python/paddle/fluid/tests/unittests/test_nearest_interp_v2_op.py index 19da09a463f3cc6224a22eb90278abae9ec59b91..2feca1c30689cec20e1d696cc672516414786038 100755 --- a/python/paddle/fluid/tests/unittests/test_nearest_interp_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_nearest_interp_v2_op.py @@ -26,6 +26,8 @@ import paddle def nearest_neighbor_interp_np(X, out_h, out_w, + scale_h=0, + scale_w=0, out_size=None, actual_shape=None, align_corners=True, @@ -46,13 +48,18 @@ def nearest_neighbor_interp_np(X, if (align_corners): ratio_h = (in_h - 1.0) / (out_h - 1.0) else: - ratio_h = 1.0 * in_h / out_h + if scale_h > 0: + ratio_h = 1.0 / scale_h + else: + ratio_h = 1.0 * in_h / out_h if (out_w > 1): if (align_corners): ratio_w = (in_w - 1.0) / (out_w - 1.0) else: - ratio_w = 1.0 * in_w / out_w - + if scale_w > 0: + ratio_w = 1.0 / scale_w + else: + ratio_w = 1.0 * in_w / out_w out = np.zeros((n, c, out_h, out_w)) if align_corners: @@ -89,7 +96,8 @@ class TestNearestInterpOp(OpTest): else: in_h = self.input_shape[1] in_w = self.input_shape[2] - + scale_h = 0 + scale_w = 0 if 
self.scale: if isinstance(self.scale, float) or isinstance(self.scale, int): if self.scale > 0: @@ -106,8 +114,8 @@ class TestNearestInterpOp(OpTest): out_w = self.out_w output_np = nearest_neighbor_interp_np( - input_np, out_h, out_w, self.out_size, self.actual_shape, - self.align_corners, self.data_layout) + input_np, out_h, out_w, scale_h, scale_w, self.out_size, + self.actual_shape, self.align_corners, self.data_layout) self.inputs = {'X': input_np} if self.out_size is not None: self.inputs['OutSize'] = self.out_size @@ -265,7 +273,7 @@ class TestNearestInterpOpUint8(OpTest): out_h = self.out_h out_w = self.out_w - output_np = nearest_neighbor_interp_np(input_np, out_h, out_w, + output_np = nearest_neighbor_interp_np(input_np, out_h, out_w, 0, 0, self.out_size, self.actual_shape, self.align_corners) self.inputs = {'X': input_np} @@ -408,7 +416,7 @@ class TestNearestInterpOp_attr_tensor(OpTest): if isinstance(self.scale, list) and len(self.scale) == 1: self.scale = [self.scale[0], self.scale[0]] self.attrs['scale'] = self.scale - output_np = nearest_neighbor_interp_np(input_np, out_h, out_w, + output_np = nearest_neighbor_interp_np(input_np, out_h, out_w, 0, 0, self.out_size, self.actual_shape, self.align_corners) self.outputs = {'Out': output_np} diff --git a/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_dygraph.py b/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_dygraph.py index e0edf9019356f38eb3c74b9cadfa6ae575e9b823..43a0d481b28fdc47dec52fe9763dd920fd5a76a2 100644 --- a/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_dygraph.py +++ b/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_dygraph.py @@ -16,20 +16,49 @@ from __future__ import print_function import unittest +import paddle +import paddle.nn as nn +import numpy as np + +paddle.disable_static() + class EmbeddingDygraph(unittest.TestCase): def test_1(self): - import paddle - import paddle.nn as nn - import numpy as np - 
paddle.disable_static() + x_data = np.arange(3, 6).reshape((3, 1)).astype(np.int64) + y_data = np.arange(6, 12).reshape((3, 2)).astype(np.float32) + paddle.disable_static(paddle.CPUPlace()) + x = paddle.to_tensor(x_data, stop_gradient=False) + y = paddle.to_tensor(y_data, stop_gradient=False) + + embedding = paddle.nn.Embedding(10, 3, sparse=True) + + w0 = np.full(shape=(10, 3), fill_value=2).astype(np.float32) + embedding.weight.set_value(w0) + + adam = paddle.optimizer.Adam( + parameters=[embedding.weight], learning_rate=0.01) + adam.clear_grad() + + out = embedding(x) + out.backward() + adam.step() + + def test_2(self): + x_data = np.arange(3, 6).reshape((3, 1)).astype(np.int64) + y_data = np.arange(6, 12).reshape((3, 2)).astype(np.float32) + paddle.disable_static(paddle.CPUPlace()) + x = paddle.to_tensor(x_data, stop_gradient=False) + y = paddle.to_tensor(y_data, stop_gradient=False) + + with self.assertRaises(ValueError): + embedding = paddle.nn.Embedding(10, 3, padding_idx=11, sparse=True) - # example 1 - inp_word = np.array([[2, 3, 5], [4, 2, 1]]).astype('int64') - inp_word.shape # [2, 3] - dict_size = 20 + with self.assertRaises(ValueError): + embedding = paddle.nn.Embedding(-1, 3, sparse=True) - emb = nn.Embedding(dict_size, 32, weight_attr='emb.w', sparse=False) + with self.assertRaises(ValueError): + embedding = paddle.nn.Embedding(10, -3, sparse=True) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_static.py b/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_static.py index c9c91ceb39de42c44f9ce81658aa79b896999552..4af0cce12b7334857c54ae9e8e9418848275ff32 100644 --- a/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_static.py +++ b/python/paddle/fluid/tests/unittests/test_nn_functional_embedding_static.py @@ -73,8 +73,13 @@ class EmbeddingStatic(unittest.TestCase): dtype="int32") emb = functional.embedding( - x=label, weight=weight, sparse=True, name="embedding") + 
x=label, + weight=weight, + padding_idx=129, + sparse=True, + name="embedding") + with self.assertRaises(ValueError): test_bad_x() diff --git a/python/paddle/fluid/tests/unittests/test_nn_grad.py b/python/paddle/fluid/tests/unittests/test_nn_grad.py index c6cfe01dce40458684c7464ca5ebddd389c62cbe..5d1e016287e07a8505336e6cb447c0e1b29a2ec2 100644 --- a/python/paddle/fluid/tests/unittests/test_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_nn_grad.py @@ -17,6 +17,7 @@ from __future__ import print_function import unittest import numpy as np +import paddle import paddle.fluid as fluid import paddle.fluid.layers as layers import paddle.fluid.core as core @@ -101,6 +102,29 @@ class TestReduceMeanWithDimDoubleGradCheck(unittest.TestCase): self.func(p) +class TestReduceSumWithDimDoubleGradCheck(unittest.TestCase): + @prog_scope() + def func(self, place): + shape = [7, 11] + eps = 0.05 + dtype = np.float64 + + x = layers.data('x', shape, False, dtype) + x.persistable = True + y = layers.reduce_sum(x, dim=0) + x_arr = np.random.uniform(-1, 1, shape).astype(dtype) + + gradient_checker.double_grad_check( + [x], y, x_init=x_arr, place=place, eps=eps) + + def test_grad(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + class TestMulDoubleGradCheck(unittest.TestCase): @prog_scope() def func(self, place): @@ -130,6 +154,30 @@ class TestMulDoubleGradCheck(unittest.TestCase): class TestReshapeDoubleGradCheck(unittest.TestCase): + @prog_scope() + def func(self, place): + x_shape = [3, 12] + expand_times = [4, 9] + eps = 0.005 + dtype = np.float64 + + x = layers.data('x', x_shape, False, dtype) + x.persistable = True + out = layers.expand(x, expand_times) + x_arr = np.random.uniform(-1, 1, x_shape).astype(dtype) + + gradient_checker.double_grad_check( + [x], out, x_init=x_arr, place=place, eps=eps) + + def test_grad(self): + places = [fluid.CPUPlace()] + if 
core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + +class TestExpandDoubleGradCheck(unittest.TestCase): @prog_scope() def func(self, place): x_shape = [3, 12] @@ -153,5 +201,53 @@ class TestReshapeDoubleGradCheck(unittest.TestCase): self.func(p) +class TestTileDoubleGradCheck(unittest.TestCase): + @prog_scope() + def func(self, place): + x_shape = [3, 12] + repeat_times = [4, 9] + eps = 0.005 + dtype = np.float64 + + x = layers.data('x', x_shape, False, dtype) + x.persistable = True + out = paddle.tile(x, repeat_times) + x_arr = np.random.uniform(-1, 1, x_shape).astype(dtype) + + gradient_checker.double_grad_check( + [x], out, x_init=x_arr, place=place, eps=eps) + + def test_grad(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + +class TestExpandV2DoubleGradCheck(unittest.TestCase): + @prog_scope() + def func(self, place): + x_shape = [1, 12] + new_shape = [4, 12] + eps = 0.005 + dtype = np.float64 + + x = layers.data('x', x_shape, False, dtype) + x.persistable = True + out = paddle.expand(x, new_shape) + x_arr = np.random.uniform(-1, 1, x_shape).astype(dtype) + + gradient_checker.double_grad_check( + [x], out, x_init=x_arr, place=place, eps=eps) + + def test_grad(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_norm_all.py b/python/paddle/fluid/tests/unittests/test_norm_all.py index c047cf6ddff78641b918de75a284574175bb3bca..352089e1fb75fa4c3423d29012fd85c3d611c81b 100644 --- a/python/paddle/fluid/tests/unittests/test_norm_all.py +++ b/python/paddle/fluid/tests/unittests/test_norm_all.py @@ -26,11 +26,11 @@ def p_norm(x, axis, porder, keepdims=False): if axis is None: x = x.flatten() if porder == np.inf: - r = np.amax(np.abs(x)) 
+ r = np.amax(np.abs(x), keepdims=keepdims) elif porder == -np.inf: - r = np.amin(np.abs(x)) + r = np.amin(np.abs(x), keepdims=keepdims) else: - r = np.linalg.norm(x, ord=porder) + r = np.linalg.norm(x, ord=porder, keepdims=keepdims) elif isinstance(axis, list or tuple) and len(axis) == 2: if porder == np.inf: axis = tuple(axis) @@ -41,10 +41,10 @@ def p_norm(x, axis, porder, keepdims=False): elif porder == 0: axis = tuple(axis) r = x.astype(bool) - r = np.sum(r, axis) + r = np.sum(r, axis, keepdims=keepdims) elif porder == 1: axis = tuple(axis) - r = np.sum(np.abs(x), axis) + r = np.sum(np.abs(x), axis, keepdims=keepdims) else: axis = tuple(axis) xp = np.power(np.abs(x), porder) @@ -61,7 +61,7 @@ def p_norm(x, axis, porder, keepdims=False): def frobenius_norm(x, axis=None, keepdims=False): if isinstance(axis, list): axis = tuple(axis) - if axis is None: axis = (-2, -1) + if axis is None: x = x.reshape(1, x.size) r = np.linalg.norm( x, ord='fro', axis=axis, keepdims=keepdims).astype(x.dtype) return r @@ -217,28 +217,37 @@ class TestPnormOp5(TestPnormOp): self.check_grad(['X'], 'Out', user_defined_grads=self.gradient) -def run_fro(self, p, axis, shape_x, dtype): +def run_fro(self, p, axis, shape_x, dtype, keep_dim, check_dim=False): with fluid.program_guard(fluid.Program()): data = fluid.data(name="X", shape=shape_x, dtype=dtype) - out = paddle.norm(x=data, p=p, axis=axis) + out = paddle.norm(x=data, p=p, axis=axis, keepdim=keep_dim) place = fluid.CPUPlace() exe = fluid.Executor(place) np_input = (np.random.rand(*shape_x) + 1.0).astype(dtype) - expected_result = frobenius_norm(np_input, axis=axis) + expected_result = frobenius_norm(np_input, axis=axis, keepdims=keep_dim) result, = exe.run(feed={"X": np_input}, fetch_list=[out]) self.assertEqual((np.abs(result - expected_result) < 1e-6).all(), True) + if keep_dim and check_dim: + self.assertEqual( + (np.abs(np.array(result.shape) - np.array(expected_result.shape)) < + 1e-6).all(), True) -def run_pnorm(self, p, axis, 
shape_x, dtype): +def run_pnorm(self, p, axis, shape_x, dtype, keep_dim, check_dim=False): with fluid.program_guard(fluid.Program()): data = fluid.data(name="X", shape=shape_x, dtype=dtype) - out = paddle.norm(x=data, p=p, axis=axis) + out = paddle.norm(x=data, p=p, axis=axis, keepdim=keep_dim) place = fluid.CPUPlace() exe = fluid.Executor(place) np_input = (np.random.rand(*shape_x) + 1.0).astype(dtype) - expected_result = p_norm(np_input, porder=p, axis=axis).astype(dtype) + expected_result = p_norm( + np_input, porder=p, axis=axis, keepdims=keep_dim).astype(dtype) result, = exe.run(feed={"X": np_input}, fetch_list=[out]) - self.assertEqual((np.abs(result - expected_result) < 1e-6).all(), True) + self.assertEqual((np.abs(result - expected_result) < 1e-6).all(), True) + if keep_dim and check_dim: + self.assertEqual( + (np.abs(np.array(result.shape) - np.array(expected_result.shape)) < + 1e-6).all(), True) def run_graph(self, p, axis, shape_x, dtype): @@ -253,6 +262,7 @@ def run_graph(self, p, axis, shape_x, dtype): # compute frobenius norm along last two dimensions. out_fro = paddle.norm(x, p='fro') + out_fro = paddle.norm(x, p='fro', axis=0) out_fro = paddle.norm(x, p='fro', axis=[0, 1]) # compute 2-order norm along [0,1] dimension. 
out_pnorm = paddle.norm(x, p=2, axis=[0, 1]) @@ -274,27 +284,133 @@ def run_graph(self, p, axis, shape_x, dtype): class API_NormTest(unittest.TestCase): def test_basic(self): - run_fro(self, p='fro', axis=None, shape_x=[2, 3, 4], dtype="float32") - run_fro(self, p='fro', axis=[0, 1], shape_x=[2, 3, 4], dtype="float64") - run_pnorm(self, p=2, axis=None, shape_x=[3, 4], dtype="float32") - run_pnorm(self, p=2, axis=1, shape_x=[3, 4], dtype="float64") - run_pnorm(self, p=np.inf, axis=0, shape_x=[2, 3, 4], dtype="float32") - run_pnorm(self, p=np.inf, axis=None, shape_x=[2, 3, 4], dtype="float32") - run_pnorm(self, p=-np.inf, axis=0, shape_x=[2, 3, 4], dtype="float64") - run_pnorm( - self, p=-np.inf, axis=None, shape_x=[2, 3, 4], dtype="float64") - run_pnorm(self, p=0, axis=1, shape_x=[3, 4], dtype="float64") - - run_pnorm(self, p=1, axis=1, shape_x=[3, 4], dtype="float64") - run_pnorm(self, p=0, axis=None, shape_x=[3, 4], dtype="float64") - run_pnorm(self, p=2, axis=[0, 1], shape_x=[2, 3, 4], dtype="float64") - run_pnorm(self, p=2, axis=-1, shape_x=[2, 3, 4], dtype="float64") - run_pnorm(self, p=1, axis=[0, 1], shape_x=[2, 3, 4], dtype="float64") - run_pnorm(self, p=0, axis=[0, 1], shape_x=[2, 3, 4], dtype="float64") - run_pnorm( - self, p=np.inf, axis=[0, 1], shape_x=[2, 3, 4], dtype="float64") - run_pnorm( - self, p=-np.inf, axis=[0, 1], shape_x=[2, 3, 4], dtype="float64") + keep_dims = {False, True} + for keep in keep_dims: + run_fro( + self, + p='fro', + axis=None, + shape_x=[2, 3, 4], + dtype="float32", + keep_dim=keep) + run_fro( + self, + p='fro', + axis=[0, 1], + shape_x=[2, 3, 4], + dtype="float64", + keep_dim=keep, + check_dim=True) + run_pnorm( + self, + p=2, + axis=None, + shape_x=[3, 4], + dtype="float32", + keep_dim=keep) + run_pnorm( + self, + p=2, + axis=1, + shape_x=[3, 4], + dtype="float64", + keep_dim=keep, + check_dim=True) + run_pnorm( + self, + p=np.inf, + axis=0, + shape_x=[2, 3, 4], + dtype="float32", + keep_dim=keep, + check_dim=True) + 
run_pnorm( + self, + p=np.inf, + axis=None, + shape_x=[2, 3, 4], + dtype="float32", + keep_dim=keep) + run_pnorm( + self, + p=-np.inf, + axis=0, + shape_x=[2, 3, 4], + dtype="float64", + keep_dim=keep, + check_dim=True) + run_pnorm( + self, + p=-np.inf, + axis=None, + shape_x=[2, 3, 4], + dtype="float64", + keep_dim=keep) + run_pnorm( + self, + p=0, + axis=1, + shape_x=[3, 4], + dtype="float64", + keep_dim=keep, + check_dim=True) + + run_pnorm( + self, + p=1, + axis=1, + shape_x=[3, 4], + dtype="float64", + keep_dim=keep, + check_dim=True) + run_pnorm( + self, + p=0, + axis=None, + shape_x=[3, 4], + dtype="float64", + keep_dim=keep, + check_dim=True) + run_pnorm( + self, + p=2, + axis=[0, 1], + shape_x=[2, 3, 4], + dtype="float64", + keep_dim=keep, + check_dim=True) + run_pnorm( + self, + p=2, + axis=-1, + shape_x=[2, 3, 4], + dtype="float64", + keep_dim=keep, + check_dim=True) + run_pnorm( + self, + p=1, + axis=[0, 1], + shape_x=[2, 3, 4], + dtype="float64", + keep_dim=keep, + check_dim=True) + run_pnorm( + self, + p=np.inf, + axis=[0, 1], + shape_x=[2, 3, 4], + dtype="float64", + keep_dim=keep, + check_dim=True) + run_pnorm( + self, + p=-np.inf, + axis=[0, 1], + shape_x=[2, 3, 4], + dtype="float64", + keep_dim=keep, + check_dim=True) def test_dygraph(self): run_graph(self, p='fro', axis=None, shape_x=[2, 3, 4], dtype="float32") @@ -315,6 +431,7 @@ class API_NormTest(unittest.TestCase): paddle.norm(data, p=p, out=out) self.assertRaises(TypeError, err_dtype, "fro", [2, 2], "int64") + self.assertRaises(ValueError, paddle.norm, "inf", [2], "int64") out = fluid.data(name="out", shape=[1], dtype="int64") self.assertRaises(TypeError, err_dtype, "fro", [2, 2], "float64", out) @@ -325,6 +442,7 @@ class API_NormTest(unittest.TestCase): self.assertRaises(ValueError, paddle.norm, data, p="unsupport norm") self.assertRaises(ValueError, paddle.norm, data, p=[1]) self.assertRaises(ValueError, paddle.norm, data, p=[1], axis=-1) + self.assertRaises(ValueError, paddle.norm, 0, [1, 
0], "float64") data = fluid.data(name="data_3d", shape=[2, 2, 2], dtype="float64") self.assertRaises( ValueError, paddle.norm, data, p='unspport', axis=[-3, -2, -1]) diff --git a/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py b/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py index c44ea454271f3aa6cb12451cd85490b57284ea35..a89b9fde7f92de0d493ad87a2f0950548ba8ff98 100644 --- a/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py @@ -68,5 +68,67 @@ class TestInstanceNormDoubleGradCheckWithoutParamBias( [x], z, x_init=x_arr, atol=atol, place=place, eps=eps) +class TestBatchNormDoubleGradCheck(unittest.TestCase): + def setUp(self): + self.init_test() + + def init_test(self): + self.data_layout = 'NCHW' + self.use_global_stats = False + self.shape = [2, 3, 4, 5] + + @prog_scope() + def func(self, place): + prog = fluid.Program() + with fluid.program_guard(prog): + np.random.seed() + dtype = "float32" + eps = 0.005 + atol = 1e-4 + x = layers.create_parameter(dtype=dtype, shape=self.shape, name='x') + z = fluid.layers.batch_norm( + input=x, + data_layout=self.data_layout, + use_global_stats=self.use_global_stats) + x_arr = np.random.uniform(-1, 1, self.shape).astype(dtype) + gradient_checker.double_grad_check( + [x], z, x_init=x_arr, atol=atol, place=place, eps=eps) + + def test_grad(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + +class TestBatchNormDoubleGradCheckCase1(TestBatchNormDoubleGradCheck): + def init_test(self): + self.data_layout = 'NHWC' + self.use_global_stats = False + self.shape = [2, 3, 4, 5] + + +class TestBatchNormDoubleGradCheckCase2(TestBatchNormDoubleGradCheck): + def init_test(self): + self.data_layout = 'NCHW' + self.use_global_stats = True + self.shape = [2, 3, 4, 5] + + +class TestBatchNormDoubleGradCheckCase3(TestBatchNormDoubleGradCheck): + def init_test(self): + 
self.data_layout = 'NHWC' + self.use_global_stats = True + self.shape = [2, 3, 4, 5] + + +class TestBatchNormDoubleGradCheckCase4(TestBatchNormDoubleGradCheck): + def init_test(self): + self.data_layout = 'NCHW' + self.use_global_stats = False + self.shape = [2, 2, 3, 4, 5] + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_normal.py b/python/paddle/fluid/tests/unittests/test_normal.py index 3e6855feaf491727203063f5c75c68301abbe05e..995a1f26ff6eb86c9198a164bcef80bebe3a8e79 100644 --- a/python/paddle/fluid/tests/unittests/test_normal.py +++ b/python/paddle/fluid/tests/unittests/test_normal.py @@ -18,6 +18,7 @@ import paddle import copy np.random.seed(10) +paddle.manual_seed(10) class TestNormalAPI(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_optimizer.py b/python/paddle/fluid/tests/unittests/test_optimizer.py index 2e6e516aa2edde79e6524b4b35507ea95876ec53..91d705223316360b8c05954259724a5f7d246440 100644 --- a/python/paddle/fluid/tests/unittests/test_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_optimizer.py @@ -832,8 +832,8 @@ class TestRecomputeOptimizer(unittest.TestCase): recompute_optimizer = optimizer.RecomputeOptimizer(sgd_optimizer) recompute_optimizer._set_checkpoints([b1_out]) try: - stat_dict = {} - recompute_optimizer.load(stat_dict) + state_dict = {} + recompute_optimizer.load(state_dict) except NotImplementedError as e: self.assertEqual( "load function is not supported by Recompute Optimizer for now", diff --git a/python/paddle/fluid/tests/unittests/test_optimizer_in_control_flow.py b/python/paddle/fluid/tests/unittests/test_optimizer_in_control_flow.py index 4b2914c223a08c52444e085f0ef9e41518694593..c1992d0d539a5c6499b9b8d022b88997729ef782 100644 --- a/python/paddle/fluid/tests/unittests/test_optimizer_in_control_flow.py +++ b/python/paddle/fluid/tests/unittests/test_optimizer_in_control_flow.py @@ -261,7 +261,13 @@ class 
TestMultiOptimizersMultiCardsError(unittest.TestCase): exe.run(startup_program) np.random.seed(SEED) + + # NOTE(liym27): + # This test needs to run in multi cards to test NotImplementedError. + # Here, move this test from RUN_TYPE=DIST in tests/unittests/CMakeList.txt, + # to use multi cards ** only on CPU ** not GPU to reduce CI time. os.environ['CPU_NUM'] = str(2) + pe_exe = fluid.ParallelExecutor( use_cuda=use_cuda, main_program=main_program, diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py index bac196b1ab52b604a85321a5473d455d2616bf0d..9cc507aa9b7918e854d56f1c8482f1b875910fb4 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py @@ -47,5 +47,21 @@ class TestParallelDygraphMnistSpawn(TestDistSpawnRunner): self.check_dist_result_with_spawn(test_class=TestMnist, delta=1e-5) +class TestFleetDygraphMnist(TestDistBase): + def _setup_config(self): + self._sync_mode = False + self._nccl2_mode = True + self._dygraph = True + self._gpu_fleet_api = True + + def test_mnist(self): + if fluid.core.is_compiled_with_cuda(): + self.check_with_place( + "parallel_dygraph_mnist.py", + delta=1e-5, + check_error_log=True, + log_name=flag_name) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py index 6671a2def3cccd2acd76025e73486b06b4bb1471..ea59a7f584a2dd5a06d37ede160ace130fc93580 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py @@ -176,7 +176,7 @@ class TestCRFModel(unittest.TestCase): place=fluid.CPUPlace()) data = train_data() - for i in range(10): + for i in range(4): cur_batch = next(data) print(exe.run(train_cp, feed=feeder.feed(cur_batch), 
diff --git a/python/paddle/fluid/tests/unittests/test_pool1d_api.py b/python/paddle/fluid/tests/unittests/test_pool1d_api.py index 1c05b96f1fc61234028e940f6403ae08a0186027..25216175d59935535a352b02afc3c8f371cedd63 100644 --- a/python/paddle/fluid/tests/unittests/test_pool1d_api.py +++ b/python/paddle/fluid/tests/unittests/test_pool1d_api.py @@ -143,6 +143,27 @@ class TestPool1d_API(unittest.TestCase): result = avg_pool1d_dg(input) self.assertTrue(np.allclose(result.numpy(), result_np)) + def check_avg_dygraph_padding_results(self, place): + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32]).astype("float32") + input = fluid.dygraph.to_variable(input_np) + result = F.avg_pool1d( + input, + kernel_size=2, + stride=2, + padding=[1], + count_include_pad=True) + + result_np = avg_pool1D_forward_naive( + input_np, ksize=[2], strides=[2], paddings=[1], exclusive=False) + + self.assertTrue(np.allclose(result.numpy(), result_np)) + + avg_pool1d_dg = paddle.nn.AvgPool1d( + kernel_size=2, stride=None, padding=1, count_include_pad=True) + result = avg_pool1d_dg(input) + self.assertTrue(np.allclose(result.numpy(), result_np)) + def check_max_static_results(self, place): with fluid.program_guard(fluid.Program(), fluid.Program()): input = fluid.data(name="input", shape=[2, 3, 32], dtype="float32") diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_api.py b/python/paddle/fluid/tests/unittests/test_pool2d_api.py index 93a2be6de342efc4e8284e7c352137d0a3a1bcb9..91faf78418b0d3a92a3cb6a167b6024b1beb3898 100644 --- a/python/paddle/fluid/tests/unittests/test_pool2d_api.py +++ b/python/paddle/fluid/tests/unittests/test_pool2d_api.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from test_pool2d_op import adaptive_start_index, adaptive_end_index, pool2D_forward_naive +from test_pool2d_op import adaptive_start_index, adaptive_end_index, pool2D_forward_naive, avg_pool2D_forward_naive, max_pool2D_forward_naive import unittest from op_test import OpTest import numpy as np @@ -68,6 +68,47 @@ class TestPool2d_API(unittest.TestCase): result = avg_pool2d_dg(input) self.assertTrue(np.allclose(result.numpy(), result_np)) + def check_avg_dygraph_padding_results(self, place): + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32, 32]).astype("float32") + input = fluid.dygraph.to_variable(input_np) + result = avg_pool2d( + input, kernel_size=2, stride=2, padding=1, ceil_mode=False) + + result_np = avg_pool2D_forward_naive( + input_np, + ksize=[2, 2], + strides=[2, 2], + paddings=[1, 1], + ceil_mode=False, + exclusive=False) + self.assertTrue(np.allclose(result.numpy(), result_np)) + + avg_pool2d_dg = paddle.nn.layer.AvgPool2d( + kernel_size=2, stride=2, padding=1, ceil_mode=False) + result = avg_pool2d_dg(input) + self.assertTrue(np.allclose(result.numpy(), result_np)) + + def check_avg_dygraph_ceilmode_results(self, place): + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32, 32]).astype("float32") + input = fluid.dygraph.to_variable(input_np) + result = avg_pool2d( + input, kernel_size=2, stride=2, padding=0, ceil_mode=True) + + result_np = avg_pool2D_forward_naive( + input_np, + ksize=[2, 2], + strides=[2, 2], + paddings=[0, 0], + ceil_mode=True) + self.assertTrue(np.allclose(result.numpy(), result_np)) + + avg_pool2d_dg = paddle.nn.layer.AvgPool2d( + kernel_size=2, stride=2, padding=0, ceil_mode=True) + result = avg_pool2d_dg(input) + self.assertTrue(np.allclose(result.numpy(), result_np)) + def check_max_static_results(self, place): with fluid.program_guard(fluid.Program(), fluid.Program()): input = fluid.data( @@ -108,6 +149,70 @@ class TestPool2d_API(unittest.TestCase): result = max_pool2d_dg(input) 
self.assertTrue(np.allclose(result.numpy(), result_np)) + def check_max_dygraph_nhwc_results(self, place): + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32, 32]).astype("float32") + input = fluid.dygraph.to_variable( + np.transpose(input_np, [0, 2, 3, 1])) + result = max_pool2d( + input, + kernel_size=2, + stride=2, + padding=0, + return_indices=False, + data_format="NHWC") + + result_np = pool2D_forward_naive( + input_np, + ksize=[2, 2], + strides=[2, 2], + paddings=[0, 0], + pool_type='max') + self.assertTrue( + np.allclose( + np.transpose(result.numpy(), [0, 3, 1, 2]), result_np)) + + def check_max_dygraph_padding_results(self, place): + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32, 32]).astype("float32") + input = fluid.dygraph.to_variable(input_np) + result = max_pool2d( + input, kernel_size=2, stride=2, padding=1, ceil_mode=False) + + result_np = max_pool2D_forward_naive( + input_np, + ksize=[2, 2], + strides=[2, 2], + paddings=[1, 1], + ceil_mode=False, + exclusive=False) + self.assertTrue(np.allclose(result.numpy(), result_np)) + + max_pool2d_dg = paddle.nn.layer.MaxPool2d( + kernel_size=2, stride=2, padding=1, ceil_mode=False) + result = max_pool2d_dg(input) + self.assertTrue(np.allclose(result.numpy(), result_np)) + + def check_max_dygraph_ceilmode_results(self, place): + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32, 32]).astype("float32") + input = fluid.dygraph.to_variable(input_np) + result = max_pool2d( + input, kernel_size=2, stride=2, padding=0, ceil_mode=True) + + result_np = max_pool2D_forward_naive( + input_np, + ksize=[2, 2], + strides=[2, 2], + paddings=[0, 0], + ceil_mode=True) + self.assertTrue(np.allclose(result.numpy(), result_np)) + + max_pool2d_dg = paddle.nn.layer.MaxPool2d( + kernel_size=2, stride=2, padding=0, ceil_mode=True) + result = max_pool2d_dg(input) + self.assertTrue(np.allclose(result.numpy(), result_np)) + def 
check_max_dygraph_stride_is_none(self, place): with fluid.dygraph.guard(place): input_np = np.random.random([2, 3, 32, 32]).astype("float32") @@ -215,6 +320,9 @@ class TestPool2d_API(unittest.TestCase): self.check_avg_dygraph_stride_is_none(place) self.check_max_dygraph_padding(place) self.check_avg_divisor(place) + self.check_max_dygraph_padding_results(place) + self.check_max_dygraph_ceilmode_results(place) + self.check_max_dygraph_nhwc_results(place) class TestPool2dError_API(unittest.TestCase): @@ -370,6 +478,22 @@ class TestPool2dError_API(unittest.TestCase): self.assertRaises(ValueError, run8) + def run9(): + with fluid.dygraph.guard(): + input_np = np.random.uniform(-1, 1, + [2, 3, 32, 32]).astype(np.float32) + input_pd = fluid.dygraph.to_variable(input_np) + res_pd = max_pool2d( + input_pd, + kernel_size=2, + stride=2, + padding=0, + ceil_mode=False, + data_format='NHWC', + return_indices=True) + + self.assertRaises(ValueError, run9) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_pool3d_api.py b/python/paddle/fluid/tests/unittests/test_pool3d_api.py index cc078e9aae7aafe55e937b80270dd012fd64ff70..505a1c738384194032329f66c33fa27e3ed3045c 100644 --- a/python/paddle/fluid/tests/unittests/test_pool3d_api.py +++ b/python/paddle/fluid/tests/unittests/test_pool3d_api.py @@ -22,7 +22,7 @@ import paddle.fluid.core as core from op_test import OpTest import paddle.fluid as fluid from paddle.nn.functional import avg_pool3d, max_pool3d -from test_pool3d_op import adaptive_start_index, adaptive_end_index, pool3D_forward_naive +from test_pool3d_op import adaptive_start_index, adaptive_end_index, pool3D_forward_naive, avg_pool3D_forward_naive, max_pool3D_forward_naive class TestPool3d_API(unittest.TestCase): @@ -73,6 +73,58 @@ class TestPool3d_API(unittest.TestCase): result = avg_pool3d_dg(input) self.assertTrue(np.allclose(result.numpy(), result_np)) + def check_avg_dygraph_padding_results(self, place): + with 
fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32, 32, 32]).astype("float32") + input = fluid.dygraph.to_variable(input_np) + result = avg_pool3d( + input, + kernel_size=2, + stride=2, + padding=1, + ceil_mode=False, + count_include_pad=True) + + result_np = avg_pool3D_forward_naive( + input_np, + ksize=[2, 2, 2], + strides=[2, 2, 2], + paddings=[1, 1, 1], + ceil_mode=False, + exclusive=False) + + self.assertTrue(np.allclose(result.numpy(), result_np)) + + avg_pool3d_dg = paddle.nn.layer.AvgPool3d( + kernel_size=2, + stride=None, + padding=1, + ceil_mode=False, + count_include_pad=True) + result = avg_pool3d_dg(input) + self.assertTrue(np.allclose(result.numpy(), result_np)) + + def check_avg_dygraph_ceilmode_results(self, place): + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32, 32, 32]).astype("float32") + input = fluid.dygraph.to_variable(input_np) + result = avg_pool3d( + input, kernel_size=2, stride=2, padding=0, ceil_mode=True) + + result_np = avg_pool3D_forward_naive( + input_np, + ksize=[2, 2, 2], + strides=[2, 2, 2], + paddings=[0, 0, 0], + ceil_mode=True) + + self.assertTrue(np.allclose(result.numpy(), result_np)) + + avg_pool3d_dg = paddle.nn.layer.AvgPool3d( + kernel_size=2, stride=None, padding=0, ceil_mode=True) + result = avg_pool3d_dg(input) + self.assertTrue(np.allclose(result.numpy(), result_np)) + def check_max_static_results(self, place): with fluid.program_guard(fluid.Program(), fluid.Program()): input = fluid.data( @@ -112,6 +164,72 @@ class TestPool3d_API(unittest.TestCase): result = max_pool3d_dg(input) self.assertTrue(np.allclose(result.numpy(), result_np)) + def check_max_dygraph_ndhwc_results(self, place): + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32, 32, 32]).astype("float32") + input = fluid.dygraph.to_variable( + np.transpose(input_np, [0, 2, 3, 4, 1])) + result = max_pool3d( + input, + kernel_size=2, + stride=2, + padding=0, + data_format="NDHWC", + 
return_indices=False) + + result_np = pool3D_forward_naive( + input_np, + ksize=[2, 2, 2], + strides=[2, 2, 2], + paddings=[0, 0, 0], + pool_type='max') + + self.assertTrue( + np.allclose( + np.transpose(result.numpy(), [0, 4, 1, 2, 3]), result_np)) + + def check_max_dygraph_ceilmode_results(self, place): + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32, 32, 32]).astype("float32") + input = fluid.dygraph.to_variable(input_np) + result = max_pool3d( + input, kernel_size=2, stride=2, padding=0, ceil_mode=True) + + result_np = max_pool3D_forward_naive( + input_np, + ksize=[2, 2, 2], + strides=[2, 2, 2], + paddings=[0, 0, 0], + ceil_mode=True) + + self.assertTrue(np.allclose(result.numpy(), result_np)) + + max_pool3d_dg = paddle.nn.layer.MaxPool3d( + kernel_size=2, stride=None, padding=0, ceil_mode=True) + result = max_pool3d_dg(input) + self.assertTrue(np.allclose(result.numpy(), result_np)) + + def check_max_dygraph_padding_results(self, place): + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32, 32, 32]).astype("float32") + input = fluid.dygraph.to_variable(input_np) + result = max_pool3d( + input, kernel_size=2, stride=2, padding=1, ceil_mode=False) + + result_np = max_pool3D_forward_naive( + input_np, + ksize=[2, 2, 2], + strides=[2, 2, 2], + paddings=[1, 1, 1], + ceil_mode=False) + + self.assertTrue(np.allclose(result.numpy(), result_np)) + + max_pool3d_dg = paddle.nn.layer.MaxPool3d( + kernel_size=2, stride=None, padding=1, ceil_mode=False) + result = max_pool3d_dg(input) + self.assertTrue(np.allclose(result.numpy(), result_np)) + def check_max_dygraph_stride_is_none(self, place): with fluid.dygraph.guard(place): input_np = np.random.random([2, 3, 32, 32, 32]).astype("float32") @@ -205,6 +323,8 @@ class TestPool3d_API(unittest.TestCase): self.check_max_dygraph_stride_is_none(place) self.check_max_dygraph_padding(place) self.check_avg_divisor(place) + self.check_max_dygraph_ndhwc_results(place) + 
self.check_max_dygraph_ceilmode_results(place) class TestPool3dError_API(unittest.TestCase): @@ -336,6 +456,21 @@ class TestPool3dError_API(unittest.TestCase): self.assertRaises(ValueError, run9) + def run10(): + with fluid.dygraph.guard(): + input_np = np.random.uniform( + -1, 1, [2, 3, 32, 32, 32]).astype(np.float32) + input_pd = fluid.dygraph.to_variable(input_np) + res_pd = max_pool3d( + input_pd, + kernel_size=2, + stride=2, + padding=0, + data_format='NDHWC', + return_indices=True) + + self.assertRaises(ValueError, run10) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_py_reader_combination.py b/python/paddle/fluid/tests/unittests/test_py_reader_combination.py index 2d977caa03369840d4ac31344195878a9998f685..624927d809fba4f13e30a62748c8cb6747d4eda3 100644 --- a/python/paddle/fluid/tests/unittests/test_py_reader_combination.py +++ b/python/paddle/fluid/tests/unittests/test_py_reader_combination.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import paddle import paddle.fluid as fluid import unittest import numpy as np @@ -60,8 +61,8 @@ class TestPyReaderCombination(unittest.TestCase): py_reader2 = fluid.io.PyReader( feed_list=[image, label], capacity=16, iterable=True) - reader1 = fluid.io.cache(self.create_reader(self.n1)) - reader2 = fluid.io.cache(self.create_reader(self.n2)) + reader1 = paddle.reader.cache(self.create_reader(self.n1)) + reader2 = paddle.reader.cache(self.create_reader(self.n2)) py_reader1.decorate_batch_generator(reader1, places=place) py_reader2.decorate_batch_generator(reader2, places=place) diff --git a/python/paddle/fluid/tests/unittests/test_rand_op.py b/python/paddle/fluid/tests/unittests/test_rand_op.py index 1eceeaadfec651ade5031ddc7e6a012244050e84..4b8fe8c7e4786417de2f80dbb9953530781f9189 100644 --- a/python/paddle/fluid/tests/unittests/test_rand_op.py +++ b/python/paddle/fluid/tests/unittests/test_rand_op.py @@ -120,24 +120,24 @@ class TestRandDtype(unittest.TestCase): def test_default_dtype(self): paddle.disable_static() - def test_default_fp_16(): + def test_default_fp16(): paddle.framework.set_default_dtype('float16') paddle.tensor.random.rand([2, 3]) - self.assertRaises(TypeError, test_default_fp_16) + self.assertRaises(TypeError, test_default_fp16) - def test_default_fp_32(): + def test_default_fp32(): paddle.framework.set_default_dtype('float32') out = paddle.tensor.random.rand([2, 3]) self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP32) - def test_default_fp_64(): + def test_default_fp64(): paddle.framework.set_default_dtype('float64') out = paddle.tensor.random.rand([2, 3]) self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP64) - test_default_fp_64() - test_default_fp_32() + test_default_fp64() + test_default_fp32() paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_randint_op.py b/python/paddle/fluid/tests/unittests/test_randint_op.py index 88b07f5df83f8f967f8ba76e78b37ecfb2c54276..7880b48cd7d5a006d78b836be3d9d2f0b1e04c5e 
100644 --- a/python/paddle/fluid/tests/unittests/test_randint_op.py +++ b/python/paddle/fluid/tests/unittests/test_randint_op.py @@ -58,6 +58,11 @@ class TestRandintOpError(unittest.TestCase): self.assertRaises(TypeError, paddle.randint, 5, dtype='float32') self.assertRaises(ValueError, paddle.randint, 5, 5) self.assertRaises(ValueError, paddle.randint, -5) + self.assertRaises(TypeError, paddle.randint, 5, shape=['2']) + shape_tensor = paddle.static.data('X', [1]) + self.assertRaises(TypeError, paddle.randint, 5, shape=shape_tensor) + self.assertRaises( + TypeError, paddle.randint, 5, shape=[shape_tensor]) class TestRandintOp_attr_tensorlist(OpTest): diff --git a/python/paddle/fluid/tests/unittests/test_reduce_op.py b/python/paddle/fluid/tests/unittests/test_reduce_op.py index cf35f9dbcdaaae1357ccdfd6b5cba85ac98d2037..80b201d0842183750361d5e08bab5f78f40a858b 100644 --- a/python/paddle/fluid/tests/unittests/test_reduce_op.py +++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py @@ -67,6 +67,22 @@ class TestSumOp6D(OpTest): self.check_grad(['X'], 'Out') +class TestSumOp8D(OpTest): + def setUp(self): + self.op_type = "reduce_sum" + self.inputs = { + 'X': np.random.random((1, 3, 1, 2, 1, 4, 3, 10)).astype("float64") + } + self.attrs = {'dim': (0, 3)} + self.outputs = {'Out': self.inputs['X'].sum(axis=(0, 3))} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + @skip_check_grad_ci( reason="reduce_max is discontinuous non-derivable function," " its gradient check is not supported by unittest framework.") @@ -103,6 +119,40 @@ class TestMinOp(OpTest): self.check_output() +class TestMin6DOp(OpTest): + """Remove Min with subgradient from gradient check to confirm the success of CI.""" + + def setUp(self): + self.op_type = "reduce_min" + self.inputs = { + 'X': np.random.random((2, 4, 3, 5, 6, 10)).astype("float64") + } + self.attrs = {'dim': [2, 4]} + self.outputs = { + 'Out': 
self.inputs['X'].min(axis=tuple(self.attrs['dim'])) + } + + def test_check_output(self): + self.check_output() + + +class TestMin8DOp(OpTest): + """Remove Min with subgradient from gradient check to confirm the success of CI.""" + + def setUp(self): + self.op_type = "reduce_min" + self.inputs = { + 'X': np.random.random((2, 4, 3, 5, 6, 3, 2, 4)).astype("float64") + } + self.attrs = {'dim': [2, 3, 4]} + self.outputs = { + 'Out': self.inputs['X'].min(axis=tuple(self.attrs['dim'])) + } + + def test_check_output(self): + self.check_output() + + class TestProdOp(OpTest): def setUp(self): self.op_type = "reduce_prod" @@ -116,6 +166,42 @@ class TestProdOp(OpTest): self.check_grad(['X'], 'Out') +class TestProd6DOp(OpTest): + def setUp(self): + self.op_type = "reduce_prod" + self.inputs = { + 'X': np.random.random((5, 6, 2, 3, 4, 2)).astype("float64") + } + self.attrs = {'dim': [2, 3, 4]} + self.outputs = { + 'Out': self.inputs['X'].prod(axis=tuple(self.attrs['dim'])) + } + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +class TestProd8DOp(OpTest): + def setUp(self): + self.op_type = "reduce_prod" + self.inputs = { + 'X': np.random.random((2, 5, 3, 2, 2, 3, 4, 2)).astype("float64") + } + self.attrs = {'dim': [2, 3, 4]} + self.outputs = { + 'Out': self.inputs['X'].prod(axis=tuple(self.attrs['dim'])) + } + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + class TestAllOp(OpTest): def setUp(self): self.op_type = "reduce_all" @@ -127,12 +213,40 @@ class TestAllOp(OpTest): self.check_output() +class TestAll8DOp(OpTest): + def setUp(self): + self.op_type = "reduce_all" + self.inputs = { + 'X': np.random.randint(0, 2, + (2, 5, 3, 2, 2, 3, 4, 2)).astype("bool") + } + self.attrs = {'reduce_all': True, 'dim': (2, 3, 4)} + self.outputs = {'Out': self.inputs['X'].all(axis=self.attrs['dim'])} + + def test_check_output(self): + self.check_output() 
+ + class TestAllOpWithDim(OpTest): def setUp(self): self.op_type = "reduce_all" self.inputs = {'X': np.random.randint(0, 2, (5, 6, 10)).astype("bool")} - self.attrs = {'dim': [1]} - self.outputs = {'Out': self.inputs['X'].all(axis=1)} + self.attrs = {'dim': (1, )} + self.outputs = {'Out': self.inputs['X'].all(axis=self.attrs['dim'])} + + def test_check_output(self): + self.check_output() + + +class TestAll8DOpWithDim(OpTest): + def setUp(self): + self.op_type = "reduce_all" + self.inputs = { + 'X': np.random.randint(0, 2, + (2, 5, 3, 2, 2, 3, 4, 2)).astype("bool") + } + self.attrs = {'dim': (1, 3, 4)} + self.outputs = {'Out': self.inputs['X'].all(axis=self.attrs['dim'])} def test_check_output(self): self.check_output() @@ -152,6 +266,23 @@ class TestAllOpWithKeepDim(OpTest): self.check_output() +class TestAll8DOpWithKeepDim(OpTest): + def setUp(self): + self.op_type = "reduce_all" + self.inputs = { + 'X': np.random.randint(0, 2, + (2, 5, 3, 2, 2, 3, 4, 2)).astype("bool") + } + self.attrs = {'dim': (5, ), 'keep_dim': True} + self.outputs = { + 'Out': np.expand_dims( + self.inputs['X'].all(axis=self.attrs['dim']), axis=5) + } + + def test_check_output(self): + self.check_output() + + class TestAllOpError(unittest.TestCase): def test_errors(self): with program_guard(Program(), Program()): @@ -175,6 +306,20 @@ class TestAnyOp(OpTest): self.check_output() +class TestAny8DOp(OpTest): + def setUp(self): + self.op_type = "reduce_any" + self.inputs = { + 'X': np.random.randint(0, 2, + (2, 5, 3, 2, 2, 3, 4, 2)).astype("bool") + } + self.attrs = {'reduce_all': True, 'dim': (3, 5, 4)} + self.outputs = {'Out': self.inputs['X'].any(axis=self.attrs['dim'])} + + def test_check_output(self): + self.check_output() + + class TestAnyOpWithDim(OpTest): def setUp(self): self.op_type = "reduce_any" @@ -186,14 +331,45 @@ class TestAnyOpWithDim(OpTest): self.check_output() +class TestAny8DOpWithDim(OpTest): + def setUp(self): + self.op_type = "reduce_any" + self.inputs = { + 'X': 
np.random.randint(0, 2, + (2, 5, 3, 2, 2, 3, 4, 2)).astype("bool") + } + self.attrs = {'dim': (3, 6)} + self.outputs = {'Out': self.inputs['X'].any(axis=self.attrs['dim'])} + + def test_check_output(self): + self.check_output() + + class TestAnyOpWithKeepDim(OpTest): def setUp(self): self.op_type = "reduce_any" self.inputs = {'X': np.random.randint(0, 2, (5, 6, 10)).astype("bool")} - self.attrs = {'dim': [1], 'keep_dim': True} + self.attrs = {'dim': (1, ), 'keep_dim': True} + self.outputs = { + 'Out': np.expand_dims( + self.inputs['X'].any(axis=self.attrs['dim']), axis=1) + } + + def test_check_output(self): + self.check_output() + + +class TestAny8DOpWithKeepDim(OpTest): + def setUp(self): + self.op_type = "reduce_any" + self.inputs = { + 'X': np.random.randint(0, 2, + (2, 5, 3, 2, 2, 3, 4, 2)).astype("bool") + } + self.attrs = {'dim': (1, ), 'keep_dim': True} self.outputs = { 'Out': np.expand_dims( - self.inputs['X'].any(axis=1), axis=1) + self.inputs['X'].any(axis=self.attrs['dim']), axis=1) } def test_check_output(self): @@ -283,6 +459,18 @@ class Test3DReduce3(Test1DReduce): } +class Test8DReduce0(Test1DReduce): + def setUp(self): + self.op_type = "reduce_sum" + self.attrs = {'dim': (4, 2, 3)} + self.inputs = { + 'X': np.random.random((2, 5, 3, 2, 2, 3, 4, 2)).astype("float64") + } + self.outputs = { + 'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim'])) + } + + class TestKeepDimReduce(Test1DReduce): def setUp(self): self.op_type = "reduce_sum" @@ -294,6 +482,19 @@ class TestKeepDimReduce(Test1DReduce): } +class TestKeepDim8DReduce(Test1DReduce): + def setUp(self): + self.op_type = "reduce_sum" + self.inputs = { + 'X': np.random.random((2, 5, 3, 2, 2, 3, 4, 2)).astype("float64") + } + self.attrs = {'dim': (3, 4, 5), 'keep_dim': True} + self.outputs = { + 'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']), + keepdims=self.attrs['keep_dim']) + } + + class TestReduceAll(Test1DReduce): def setUp(self): self.op_type = "reduce_sum" @@ -302,6 +503,16 @@ 
class TestReduceAll(Test1DReduce): self.outputs = {'Out': self.inputs['X'].sum()} +class TestReduceAll(Test1DReduce): + def setUp(self): + self.op_type = "reduce_sum" + self.inputs = { + 'X': np.random.random((2, 5, 3, 2, 2, 3, 4, 2)).astype("float64") + } + self.attrs = {'reduce_all': True, 'dim': (3, 4, 5)} + self.outputs = {'Out': self.inputs['X'].sum(axis=self.attrs['dim'])} + + @skip_check_grad_ci( reason="reduce_max is discontinuous non-derivable function," " its gradient check is not supported by unittest framework.") @@ -475,87 +686,71 @@ class API_TestSumOpError(unittest.TestCase): def test_errors(self): def test_dtype1(): with fluid.program_guard(fluid.Program(), fluid.Program()): - data = fluid.data(name="data", shape=[10], dtype="float32") - paddle.sum(data, dtype="int32") + data = fluid.data(name="data", shape=[10], dtype="float64") + paddle.sum(data, dtype="float32") self.assertRaises(ValueError, test_dtype1) def test_dtype2(): with fluid.program_guard(fluid.Program(), fluid.Program()): - data = fluid.data(name="data", shape=[10], dtype="float32") - paddle.sum(data, dtype="float32") + data = fluid.data(name="data", shape=[10], dtype="int64") + paddle.sum(data, dtype="int32") self.assertRaises(ValueError, test_dtype2) def test_dtype3(): with fluid.program_guard(fluid.Program(), fluid.Program()): - data = fluid.data(name="data", shape=[10], dtype="int32") - paddle.sum(data, dtype="bool") + data = fluid.data(name="data", shape=[10], dtype="float64") + paddle.sum(data, dtype="int32") self.assertRaises(ValueError, test_dtype3) - def test_dtype4(): + def test_type(): with fluid.program_guard(fluid.Program(), fluid.Program()): data = fluid.data(name="data", shape=[10], dtype="int32") - paddle.sum(data, dtype="int32") + paddle.sum(data, dtype="bool") - self.assertRaises(ValueError, test_dtype3) + self.assertRaises(TypeError, test_type) class API_TestSumOp(unittest.TestCase): - def test_static(self): - with fluid.program_guard(fluid.Program(), 
fluid.Program()): - data = fluid.data("data", shape=[10, 10], dtype="float32") - result_sum = paddle.sum(x=data, axis=1, dtype="float64") - place = fluid.CPUPlace() - exe = fluid.Executor(place) - input_data = np.random.rand(10, 10).astype(np.float32) - res, = exe.run(feed={"data": input_data}, fetch_list=[result_sum]) - self.assertEqual( - (res == np.sum(input_data.astype(np.float64), axis=1)).all(), True) + def run_static(self, + shape, + x_dtype, + attr_axis, + attr_dtype=None, + np_axis=None): + if np_axis is None: + np_axis = attr_axis with fluid.program_guard(fluid.Program(), fluid.Program()): - data = fluid.data("data", shape=[10, 10], dtype="int32") - result_sum = paddle.sum(x=data, axis=1, dtype="int64") - place = fluid.CPUPlace() - exe = fluid.Executor(place) - input_data = np.random.randint(10, size=(10, 10)).astype(np.int32) - res, = exe.run(feed={"data": input_data}, fetch_list=[result_sum]) - self.assertEqual( - (res == np.sum(input_data.astype(np.int64), axis=1)).all(), True) + data = fluid.data("data", shape=shape, dtype=x_dtype) + result_sum = paddle.sum(x=data, axis=attr_axis, dtype=attr_dtype) - with fluid.program_guard(fluid.Program(), fluid.Program()): - data = fluid.data("data", shape=[10, 10], dtype="int32") - result_sum = paddle.sum(x=data, axis=1) - place = fluid.CPUPlace() - exe = fluid.Executor(place) - input_data = np.random.randint(10, size=(10, 10)).astype(np.int32) + exe = fluid.Executor(fluid.CPUPlace()) + input_data = np.random.rand(*shape).astype(x_dtype) res, = exe.run(feed={"data": input_data}, fetch_list=[result_sum]) - self.assertEqual((res == np.sum(input_data, axis=1)).all(), True) - with fluid.program_guard(fluid.Program(), fluid.Program()): - data = fluid.data("data", shape=[10, 10], dtype="int32") - result_sum = paddle.sum(x=data, axis=1) - place = fluid.CPUPlace() - exe = fluid.Executor(place) - input_data = np.random.randint(10, size=(10, 10)).astype(np.int32) - res, = exe.run(feed={"data": input_data}, 
fetch_list=[result_sum]) - self.assertEqual((res == np.sum(input_data, axis=1)).all(), True) + self.assertTrue( + np.allclose( + res, np.sum(input_data.astype(attr_dtype), axis=np_axis))) - with fluid.program_guard(fluid.Program(), fluid.Program()): - input_data = np.random.randint(10, size=(5, 5, 5)).astype(np.int32) - data = fluid.data("data", shape=[5, 5, 5], dtype="int32") - sum1 = paddle.sum(x=data, axis=[0, 1]) - sum2 = paddle.sum(x=data, axis=()) - - place = fluid.CPUPlace() - exe = fluid.Executor(place) - res1, res2 = exe.run(feed={"data": input_data}, - fetch_list=[sum1, sum2]) - - self.assertEqual((res1 == np.sum(input_data, axis=(0, 1))).all(), True) - self.assertEqual( - (res2 == np.sum(input_data, axis=(0, 1, 2))).all(), True) + def test_static(self): + shape = [10, 10] + axis = 1 + + self.run_static(shape, "int32", axis, attr_dtype=None) + self.run_static(shape, "int32", axis, attr_dtype="int32") + self.run_static(shape, "int32", axis, attr_dtype="int64") + + self.run_static(shape, "float32", axis, attr_dtype=None) + self.run_static(shape, "float32", axis, attr_dtype="float32") + self.run_static(shape, "float32", axis, attr_dtype="float64") + + shape = [5, 5, 5] + self.run_static(shape, "int32", (0, 1), attr_dtype="int32") + self.run_static( + shape, "int32", (), attr_dtype="int32", np_axis=(0, 1, 2)) def test_dygraph(self): np_x = np.random.random([2, 3, 4]).astype('int32') diff --git a/python/paddle/fluid/tests/unittests/test_reducescatter.py b/python/paddle/fluid/tests/unittests/test_reducescatter.py index 58bcc11cd89c0573bc572008eb174e7070937cad..7c355d46285c59197759689d9a457aec96b89135 100644 --- a/python/paddle/fluid/tests/unittests/test_reducescatter.py +++ b/python/paddle/fluid/tests/unittests/test_reducescatter.py @@ -15,9 +15,12 @@ from __future__ import print_function import unittest import numpy as np +import paddle from test_collective_base import TestDistBase +paddle.enable_static() + class TestReduceScatterOp(TestDistBase): def 
_setup_config(self): diff --git a/python/paddle/fluid/tests/unittests/test_reducescatter_api.py b/python/paddle/fluid/tests/unittests/test_reducescatter_api.py index 5fa75cc3effe37197195da7555a1a3266e30754b..5a494b5529efbef420c6e65532352fd58cc1db11 100644 --- a/python/paddle/fluid/tests/unittests/test_reducescatter_api.py +++ b/python/paddle/fluid/tests/unittests/test_reducescatter_api.py @@ -16,9 +16,12 @@ from __future__ import print_function import unittest import numpy as np import paddle.fluid as fluid +import paddle from test_collective_base import TestDistBase +paddle.enable_static() + class TestReduceScatterAPI(TestDistBase): def _setup_config(self): diff --git a/python/paddle/fluid/tests/unittests/test_regularizer.py b/python/paddle/fluid/tests/unittests/test_regularizer.py index 44087c5421a5ee66273ef35b935926d42dcc37ae..167a8a017c24a01a6475a03835222d33c601396e 100644 --- a/python/paddle/fluid/tests/unittests/test_regularizer.py +++ b/python/paddle/fluid/tests/unittests/test_regularizer.py @@ -106,9 +106,9 @@ def bow_net(data, label, dict_dim, is_sparse=False, - emb_dim=128, - hid_dim=128, - hid_dim2=96, + emb_dim=8, + hid_dim=8, + hid_dim2=6, class_dim=2): """ BOW net @@ -132,8 +132,8 @@ class TestRegularizer(unittest.TestCase): def setUp(self): self.word_dict = paddle.dataset.imdb.word_dict() reader = paddle.batch( - paddle.dataset.imdb.train(self.word_dict), batch_size=8)() - self.train_data = [next(reader) for _ in range(5)] + paddle.dataset.imdb.train(self.word_dict), batch_size=1)() + self.train_data = [next(reader) for _ in range(1)] def get_places(self): places = [core.CPUPlace()] @@ -245,14 +245,14 @@ class TestRegularizer(unittest.TestCase): sgd.minimize(loss) with fluid.dygraph.guard(): input = fluid.dygraph.to_variable( - np.random.randn(3, 5).astype('float32')) + np.random.randn(3, 2).astype('float32')) paddle.manual_seed(1) paddle.framework.random._manual_program_seed(1) linear1 = fluid.dygraph.Linear( - 5, 2, param_attr=fc_param_attr, 
bias_attr=fc_param_attr) + 2, 2, param_attr=fc_param_attr, bias_attr=fc_param_attr) linear2 = fluid.dygraph.Linear( - 5, 2, param_attr=fc_param_attr, bias_attr=fc_param_attr) + 2, 2, param_attr=fc_param_attr, bias_attr=fc_param_attr) loss1 = linear1(input) loss1.backward() diff --git a/python/paddle/fluid/tests/unittests/test_regularizer_api.py b/python/paddle/fluid/tests/unittests/test_regularizer_api.py new file mode 100644 index 0000000000000000000000000000000000000000..76186d2e39feafe772fce6cc7f9099e97d833232 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_regularizer_api.py @@ -0,0 +1,204 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +from functools import partial +import contextlib +import numpy as np +import paddle +import paddle.fluid.core as core +import paddle.fluid as fluid +import paddle.fluid.framework as framework +import paddle.fluid.optimizer as optimizer +import paddle.regularizer as regularizer +from paddle.fluid.backward import append_backward + + +def bow_net(data, + label, + dict_dim, + is_sparse=False, + emb_dim=8, + hid_dim=8, + hid_dim2=6, + class_dim=2): + """ + BOW net + This model is from https://github.com/PaddlePaddle/models: + fluid/PaddleNLP/text_classification/nets.py + """ + emb = fluid.layers.embedding( + input=data, is_sparse=is_sparse, size=[dict_dim, emb_dim]) + bow = fluid.layers.sequence_pool(input=emb, pool_type='sum') + bow_tanh = fluid.layers.tanh(bow) + fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh") + fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh") + prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax") + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + + return avg_cost + + +class TestRegularizer(unittest.TestCase): + def setUp(self): + self.word_dict = paddle.dataset.imdb.word_dict() + reader = paddle.batch( + paddle.dataset.imdb.train(self.word_dict), batch_size=1)() + self.train_data = [next(reader) for _ in range(1)] + + def get_places(self): + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + return places + + @contextlib.contextmanager + def scope_prog_guard(self, main_prog, startup_prog): + scope = fluid.core.Scope() + with fluid.unique_name.guard(): + with fluid.scope_guard(scope): + with fluid.program_guard(main_prog, startup_prog): + yield + + def run_program(self, place, feed_list): + exe = fluid.Executor(place) + feeder = fluid.DataFeeder(feed_list=feed_list, place=place) + exe.run(fluid.default_startup_program()) + + main_prog = 
fluid.default_main_program() + param_list = [var.name for var in main_prog.block(0).all_parameters()] + + param_sum = [] + for data in self.train_data: + out = exe.run(main_prog, + feed=feeder.feed(data), + fetch_list=param_list) + p_sum = 0 + for v in out: + p_sum += np.sum(np.abs(v)) + param_sum.append(p_sum) + return param_sum + + def check_l2decay_regularizer(self, place, model): + paddle.manual_seed(1) + paddle.framework.random._manual_program_seed(1) + main_prog = fluid.framework.Program() + startup_prog = fluid.framework.Program() + with self.scope_prog_guard( + main_prog=main_prog, startup_prog=startup_prog): + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + + avg_cost = model(data, label, len(self.word_dict)) + + optimizer = fluid.optimizer.Adagrad( + learning_rate=0.1, + regularization=paddle.regularizer.L2Decay(1.0)) + optimizer.minimize(avg_cost) + param_sum = self.run_program(place, [data, label]) + return param_sum + + def check_l2decay(self, place, model): + paddle.manual_seed(1) + paddle.framework.random._manual_program_seed(1) + main_prog = fluid.framework.Program() + startup_prog = fluid.framework.Program() + + with self.scope_prog_guard( + main_prog=main_prog, startup_prog=startup_prog): + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + + avg_cost_l2 = model(data, label, len(self.word_dict)) + + param_list = fluid.default_main_program().block(0).all_parameters() + para_sum = [] + for para in param_list: + para_mul = fluid.layers.square(x=para) + para_sum.append(fluid.layers.reduce_sum(input=para_mul)) + avg_cost_l2 += fluid.layers.sums(para_sum) * .5 + + optimizer = fluid.optimizer.Adagrad(learning_rate=0.1) + optimizer.minimize(avg_cost_l2) + param_sum = self.run_program(place, [data, label]) + return param_sum + + def test_l2(self): + for 
place in self.get_places(): + dense_sparse_p_sum = [] + for sparse in [True, False]: + model = partial(bow_net, is_sparse=sparse) + framework_l2 = self.check_l2decay_regularizer(place, model) + l2 = self.check_l2decay(place, model) + assert len(l2) == len(framework_l2) + for i in range(len(l2)): + assert np.isclose(a=framework_l2[i], b=l2[i], rtol=5e-5) + dense_sparse_p_sum.append(framework_l2) + + assert len(dense_sparse_p_sum[0]) == len(dense_sparse_p_sum[1]) + for i in range(len(dense_sparse_p_sum[0])): + assert np.isclose( + a=dense_sparse_p_sum[0][i], + b=dense_sparse_p_sum[1][i], + rtol=5e-5) + + def test_repeated_regularization(self): + l1 = paddle.regularizer.L1Decay(0.1) + l2 = paddle.regularizer.L2Decay(0.01) + fc_param_attr = fluid.ParamAttr(regularizer=l1) + with fluid.program_guard(fluid.Program(), fluid.Program()): + x = fluid.layers.uniform_random([2, 2, 3]) + out = fluid.layers.fc(x, 5, param_attr=fc_param_attr) + loss = fluid.layers.reduce_sum(out) + sgd = fluid.optimizer.SGD(learning_rate=0.1, regularization=l2) + sgd.minimize(loss) + with fluid.dygraph.guard(): + input = fluid.dygraph.to_variable( + np.random.randn(3, 2).astype('float32')) + paddle.manual_seed(1) + paddle.framework.random._manual_program_seed(1) + + linear1 = fluid.dygraph.Linear( + 2, 2, param_attr=fc_param_attr, bias_attr=fc_param_attr) + linear2 = fluid.dygraph.Linear( + 2, 2, param_attr=fc_param_attr, bias_attr=fc_param_attr) + + loss1 = linear1(input) + loss1.backward() + # set l2 regularizer in optimizer, but l1 in fluid.ParamAttr + + fluid.optimizer.SGD(parameter_list=linear1.parameters(), + learning_rate=1e-2, + regularization=l2).minimize(loss1) + # only set l1 in fluid.ParamAttr + loss2 = linear2(input) + loss2.backward() + fluid.optimizer.SGD(parameter_list=linear2.parameters(), + learning_rate=1e-2).minimize(loss2) + # they should both be applied by l1, and keep the same + self.assertTrue( + np.allclose(linear1.weight.numpy(), linear2.weight.numpy()), + "weight should 
use the regularization in fluid.ParamAttr!") + self.assertTrue( + np.allclose(linear1.bias.numpy(), linear2.bias.numpy()), + "bias should use the regularization in fluid.ParamAttr!") + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py b/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py index 7e2ef36c1a7fda5c31049ec9c752c5226bfb89dc..6ca194b2694b6c7537ceb94e11eb1a1a0aeb8d8d 100644 --- a/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py +++ b/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py @@ -248,8 +248,7 @@ class PolicyGradient(object): func=reward_func, x=[action, length], out=reward) neg_log_prob = layers.cross_entropy(act_prob, action) cost = neg_log_prob * reward - cost = (layers.reduce_sum(cost) / - layers.cast(layers.reduce_sum(length), "float32") + cost = (layers.reduce_sum(cost) / layers.reduce_sum(length) ) if length is not None else layers.reduce_mean(cost) optimizer = fluid.optimizer.Adam(self.lr) optimizer.minimize(cost) diff --git a/python/paddle/fluid/tests/unittests/test_roi_align_op.py b/python/paddle/fluid/tests/unittests/test_roi_align_op.py index b01863880866e247f2aee4b94ae3121c9d891f92..fb8a090b80700d9b884a72f7f430723754523a13 100644 --- a/python/paddle/fluid/tests/unittests/test_roi_align_op.py +++ b/python/paddle/fluid/tests/unittests/test_roi_align_op.py @@ -181,16 +181,11 @@ class TestROIAlignInLodOp(TestROIAlignOp): self.calc_roi_align() seq_len = self.rois_lod[0] - cur_len = 0 - lod = [cur_len] - for l in seq_len: - cur_len += l - lod.append(cur_len) self.inputs = { 'X': self.x, 'ROIs': (self.rois[:, 1:5], self.rois_lod), - 'RoisLod': np.asarray(lod).astype('int64') + 'RoisNum': np.asarray(seq_len).astype('int32') } self.attrs = { diff --git a/python/paddle/fluid/tests/unittests/test_roi_pool_op.py b/python/paddle/fluid/tests/unittests/test_roi_pool_op.py index 1200b0e3470f650dce4365ee46458c8184281292..c6622cf8d9ce8ae655a6b2e5c130ed9990fd2a5b 
100644 --- a/python/paddle/fluid/tests/unittests/test_roi_pool_op.py +++ b/python/paddle/fluid/tests/unittests/test_roi_pool_op.py @@ -174,16 +174,11 @@ class TestROIPoolInLodOp(TestROIPoolOp): self.calc_roi_pool() seq_len = self.rois_lod[0] - cur_len = 0 - lod = [cur_len] - for l in seq_len: - cur_len += l - lod.append(cur_len) self.inputs = { 'X': self.x, 'ROIs': (self.rois[:, 1:5], self.rois_lod), - 'RoisLod': np.asarray(lod).astype('int64') + 'RoisNum': np.asarray(seq_len).astype('int32') } self.attrs = { diff --git a/python/paddle/fluid/tests/unittests/test_save_model_without_var.py b/python/paddle/fluid/tests/unittests/test_save_model_without_var.py index b74a6e10917f71b669a2d2e6906d6c7310c59c7e..4c63dced83b199acaffc3e703785b3eb8ec8913b 100644 --- a/python/paddle/fluid/tests/unittests/test_save_model_without_var.py +++ b/python/paddle/fluid/tests/unittests/test_save_model_without_var.py @@ -50,7 +50,7 @@ class TestSaveModelWithoutVar(unittest.TestCase): params_filename='params') expected_warn = "no variable in your model, please ensure there are any variables in your model to save" self.assertTrue(len(w) > 0) - self.assertTrue(expected_warn == str(w[0].message)) + self.assertTrue(expected_warn == str(w[-1].message)) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_segment_ops.py b/python/paddle/fluid/tests/unittests/test_segment_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..b58d66676b05524766366d9587d395aadc32a7b4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_segment_ops.py @@ -0,0 +1,202 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import sys +from op_test import OpTest + + +def compute_segment_sum(x, segment_ids): + length = segment_ids[-1] + 1 + target_shape = list(x.shape) + target_shape[0] = length + results = np.zeros(target_shape, dtype=x.dtype) + for index, ids in enumerate(segment_ids): + results[ids, :] += x[index, :] + return results + + +def compute_segment_mean(x, segment_ids): + length = segment_ids[-1] + 1 + target_shape = list(x.shape) + target_shape[0] = length + results = np.zeros(target_shape, dtype=x.dtype) + count = np.zeros(length, dtype=x.dtype) + 1e-8 + for index, ids in enumerate(segment_ids): + results[ids, :] += x[index, :] + count[ids] += 1 + results = results / count.reshape([-1, 1]) + return results + + +def compute_segment_min_max(x, segment_ids, pooltype="MAX"): + length = segment_ids[-1] + 1 + target_shape = list(x.shape) + target_shape[0] = length + gradient = np.zeros_like(x) + results = np.zeros(target_shape, dtype=x.dtype) + last_idx = 0 + current_id = segment_ids[0] + for idx in range(1, len(segment_ids) + 1): + if idx < len(segment_ids): + if segment_ids[idx] == current_id: + continue + sub_x = x[last_idx:idx, :] + if pooltype == "MAX": + results[current_id] = np.amax(sub_x, axis=0) + elif pooltype == "MIN": + results[current_id] = np.amin(sub_x, axis=0) + else: + raise ValueError("Invalid pooltype, only MAX, MIN supported!") + gradient[last_idx:idx, :][sub_x == results[current_id]] = 1 + last_idx = idx + if idx < len(segment_ids): + current_id = segment_ids[idx] + + 
return results, gradient / results.size + + +class TestSegmentOps(OpTest): + def set_data(self): + x = np.random.uniform(-1, 1, self.shape).astype(self.dtype) + segment_ids = self.set_segment(len(x), len(x) // 5 + 1) + return x, segment_ids + + def set_segment(self, origin_len, reduce_len): + segment = np.zeros(reduce_len, dtype='int64') + segment = np.random.randint(0, reduce_len, size=[origin_len]) + segment = np.sort(segment) + return segment.astype('int64') + + def compute(self, x, segment_ids): + return compute_segment_sum(x, segment_ids) + + def prepare(self): + self.op_type = "segment_pool" + self.dtype = np.float64 + self.shape = [30, 15] + self.attrs = {"pooltype": "SUM"} + + def setUp(self): + self.prepare() + x, segment_ids = self.set_data() + result = self.compute(x, segment_ids) + self.inputs = { + 'X': x.astype(self.dtype), + 'SegmentIds': segment_ids.astype(np.int64) + } + self.outputs = {'Out': result.astype(self.dtype)} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestSegmentSum2(TestSegmentOps): + def prepare(self): + super(TestSegmentSum2, self).prepare() + self.shape = [40, 20] + self.dtype = np.float32 + + def setUp(self): + self.prepare() + x, segment_ids = self.set_data() + result = self.compute(x, segment_ids) + self.inputs = { + 'X': x.astype(self.dtype), + 'SegmentIds': segment_ids.astype(np.int32) + } + self.outputs = {'Out': result.astype(self.dtype)} + + +class TestSegmentMax(TestSegmentOps): + def compute(self, x, segment_ids): + return compute_segment_min_max(x, segment_ids, pooltype="MAX") + + def prepare(self): + super(TestSegmentMax, self).prepare() + self.shape = [40, 20] + self.attrs = {'pooltype': "MAX"} + + def setUp(self): + self.prepare() + x, segment_ids = self.set_data() + result, self.gradient = self.compute(x, segment_ids) + self.inputs = { + 'X': x.astype(self.dtype), + 'SegmentIds': segment_ids.astype(np.int32) + } + self.outputs = 
{'Out': result.astype(self.dtype)} + + def test_check_grad(self): + self.check_grad(["X"], "Out", user_defined_grads=[self.gradient]) + + +class TestSegmentMax2(TestSegmentMax): + def prepare(self): + super(TestSegmentMax2, self).prepare() + self.dtype = np.float32 + + +class TestSegmentMin(TestSegmentMax): + def compute(self, x, segment_ids): + return compute_segment_min_max(x, segment_ids, pooltype="MIN") + + def prepare(self): + super(TestSegmentMin, self).prepare() + self.attrs = {'pooltype': "MIN"} + + +class TestSegmentMin2(TestSegmentMin): + def prepare(self): + super(TestSegmentMin2, self).prepare() + self.dtype = np.float32 + + +class TestSegmentMean(TestSegmentOps): + def compute(self, x, segment_ids): + return compute_segment_mean(x, segment_ids) + + def prepare(self): + super(TestSegmentMean, self).prepare() + self.shape = [40, 20] + self.attrs = {'pooltype': "MEAN"} + + def setUp(self): + self.prepare() + x, segment_ids = self.set_data() + result = self.compute(x, segment_ids) + self.inputs = {'X': x, 'SegmentIds': segment_ids} + self.outputs = { + 'Out': result, + 'SummedIds': compute_segment_sum( + np.ones([len(x), 1]).astype(self.dtype), segment_ids) + } + + +class TestSegmentMean2(TestSegmentMean): + def prepare(self): + super(TestSegmentMean2, self).prepare() + self.dtype = np.float32 + self.shape = [30, 20] + self.attrs = {'pooltype': "MEAN"} + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_stack_op.py b/python/paddle/fluid/tests/unittests/test_stack_op.py index fd5c02c55db4c22d9edd604b7998a5405961d596..8dd71c5a558094ce6f259105eeb1aafb834ad6dc 100644 --- a/python/paddle/fluid/tests/unittests/test_stack_op.py +++ b/python/paddle/fluid/tests/unittests/test_stack_op.py @@ -182,6 +182,11 @@ class API_test(unittest.TestCase): expected_result = np.stack([input1, input2, input3], axis=0) self.assertTrue(np.allclose(expected_result, result)) + def test_single_tensor_error(self): + with 
fluid.program_guard(fluid.Program(), fluid.Program()): + x = paddle.rand([2, 3]) + self.assertRaises(TypeError, paddle.stack, x) + class API_DygraphTest(unittest.TestCase): def test_out(self): @@ -192,18 +197,23 @@ class API_DygraphTest(unittest.TestCase): x1 = fluid.dygraph.to_variable(data1) x2 = fluid.dygraph.to_variable(data2) x3 = fluid.dygraph.to_variable(data3) - result = paddle.stack([x1, x2, x3], axis=0) + result = paddle.stack([x1, x2, x3]) result_np = result.numpy() - expected_result = np.stack([data1, data2, data3], axis=0) + expected_result = np.stack([data1, data2, data3]) self.assertTrue(np.allclose(expected_result, result_np)) with fluid.dygraph.guard(): y1 = fluid.dygraph.to_variable(data1) - result = paddle.stack(y1, axis=0) + result = paddle.stack([y1], axis=0) result_np_2 = result.numpy() - expected_result_2 = np.stack(data1, axis=0) + expected_result_2 = np.stack([data1], axis=0) self.assertTrue(np.allclose(expected_result_2, result_np_2)) + def test_single_tensor_error(self): + with fluid.dygraph.guard(): + x = paddle.to_tensor([1, 2, 3]) + self.assertRaises(Exception, paddle.stack, x) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py index 09cd40d9cc59914c82cc343bb78b72fbc2b29e59..1c11e831b0ad31a3c450c70e7f7c258455409d05 100644 --- a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py @@ -227,14 +227,15 @@ class TestConvertSyncBatchNorm(unittest.TestCase): return with program_guard(Program(), Program()): + compare_model = paddle.nn.Sequential( + paddle.nn.Conv2d(3, 5, 3), paddle.nn.BatchNorm2d(5)) model = paddle.nn.Sequential( paddle.nn.Conv2d(3, 5, 3), paddle.nn.BatchNorm2d(5)) - sync_model = paddle.nn.SyncBatchNorm.convert_sync_batchnorm(model) - for idx, sublayer in enumerate(model.sublayers()): + model = 
paddle.nn.SyncBatchNorm.convert_sync_batchnorm(model) + for idx, sublayer in enumerate(compare_model.sublayers()): if isinstance(sublayer, paddle.nn.BatchNorm2d): self.assertEqual( - isinstance(sync_model[idx], paddle.nn.SyncBatchNorm), - True) + isinstance(model[idx], paddle.nn.SyncBatchNorm), True) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_top_k_v2_op.py b/python/paddle/fluid/tests/unittests/test_top_k_v2_op.py index 54e7765c0fb76844a6123fceea6c1ef79dc0c2bf..b9d96f329b5bb48f7167d005f11f64136fdf5d01 100644 --- a/python/paddle/fluid/tests/unittests/test_top_k_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_top_k_v2_op.py @@ -63,28 +63,28 @@ class TestTopkOp(OpTest): self.check_grad(set(['X']), 'Out') -class TestTopOp1(TestTopkOp): +class TestTopkOp1(TestTopkOp): def init_args(self): self.k = 3 self.axis = 0 self.largest = True -class TestTopOp2(TestTopkOp): +class TestTopkOp2(TestTopkOp): def init_args(self): self.k = 3 self.axis = 0 self.largest = False -class TestTopOp3(TestTopkOp): +class TestTopkOp3(TestTopkOp): def init_args(self): self.k = 4 self.axis = 0 self.largest = False -class TestTopOp4(TestTopkOp): +class TestTopkOp4(TestTopkOp): def init_args(self): self.k = 4 self.axis = 0 @@ -189,6 +189,8 @@ class TestTopKAPI(unittest.TestCase): result1 = paddle.topk(input_tensor, k=2) result2 = paddle.topk(input_tensor, k=2, axis=-1) result3 = paddle.topk(input_tensor, k=k_tensor, axis=1) + self.assertEqual(result3[0].shape, (6, -1, 8)) + self.assertEqual(result3[1].shape, (6, -1, 8)) result4 = paddle.topk(input_tensor, k=2, axis=1, largest=False) result5 = paddle.topk(input_tensor, k=2, axis=-1, largest=False) result6 = paddle.topk(large_input_tensor, k=1, axis=-1) @@ -239,6 +241,15 @@ class TestTopKAPI(unittest.TestCase): self.run_dygraph(place) self.run_static(place) + def test_errors(self): + paddle.disable_static() + x = paddle.to_tensor([1, 2, 3]) + with self.assertRaises(BaseException): + paddle.topk(x, k=-1) + 
+ with self.assertRaises(BaseException): + paddle.topk(x, k=0) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_transformer_api.py b/python/paddle/fluid/tests/unittests/test_transformer_api.py index 5fea9f69a18c83be0f6af05784735ea53d0993d2..bd76edc9d8cadf14c6cf224b7708ff4acd6efef4 100644 --- a/python/paddle/fluid/tests/unittests/test_transformer_api.py +++ b/python/paddle/fluid/tests/unittests/test_transformer_api.py @@ -474,6 +474,141 @@ class TestTransformer(unittest.TestCase): trans_output = transformer(src, tgt, src_mask, tgt_mask, memory_mask) + def test_transformer_attr_1(self): + batch_size, d_model, n_head, dim_feedforward, dropout, _, _, source_length, target_length = generate_basic_params( + mode="decoder_layer") + + # batch_size, source_length, target_length, d_model, n_head = 4, 8, 8, 64, 8 + with fluid.dygraph.guard(fluid.CPUPlace()): + transformer = Transformer( + d_model, + n_head, + dim_feedforward=dim_feedforward, + dropout=dropout, + weight_attr=[None], + bias_attr=[False]) + src = paddle.to_variable( + np.random.rand(batch_size, source_length, d_model).astype( + "float32")) + tgt = paddle.to_variable( + np.random.rand(batch_size, target_length, d_model).astype( + "float32")) + src_mask = np.zeros((batch_size, n_head, source_length, + source_length)).astype("float32") + src_mask[0][0][0][0] = -np.inf + src_mask = paddle.to_variable(src_mask) + tgt_mask = np.zeros((batch_size, n_head, target_length, + target_length)).astype("float32") + tgt_mask[0][0][0][0] = -1e9 + memory_mask = np.zeros((batch_size, n_head, target_length, + source_length)).astype("float32") + memory_mask[0][0][0][0] = -1e9 + tgt_mask, memory_mask = paddle.to_variable( + tgt_mask), paddle.to_variable(memory_mask) + trans_output = transformer(src, tgt, src_mask, tgt_mask, + memory_mask) + + def test_transformer_attr_2(self): + batch_size, d_model, n_head, dim_feedforward, dropout, _, _, source_length, target_length = 
generate_basic_params( + mode="decoder_layer") + + # batch_size, source_length, target_length, d_model, n_head = 4, 8, 8, 64, 8 + with fluid.dygraph.guard(fluid.CPUPlace()): + transformer = Transformer( + d_model, + n_head, + dim_feedforward=dim_feedforward, + dropout=dropout, + weight_attr=[None, None], + bias_attr=[False, False]) + src = paddle.to_variable( + np.random.rand(batch_size, source_length, d_model).astype( + "float32")) + tgt = paddle.to_variable( + np.random.rand(batch_size, target_length, d_model).astype( + "float32")) + src_mask = np.zeros((batch_size, n_head, source_length, + source_length)).astype("float32") + src_mask[0][0][0][0] = -np.inf + src_mask = paddle.to_variable(src_mask) + tgt_mask = np.zeros((batch_size, n_head, target_length, + target_length)).astype("float32") + tgt_mask[0][0][0][0] = -1e9 + memory_mask = np.zeros((batch_size, n_head, target_length, + source_length)).astype("float32") + memory_mask[0][0][0][0] = -1e9 + tgt_mask, memory_mask = paddle.to_variable( + tgt_mask), paddle.to_variable(memory_mask) + trans_output = transformer(src, tgt, src_mask, tgt_mask, + memory_mask) + + def test_transformer_attr_3(self): + batch_size, d_model, n_head, dim_feedforward, dropout, _, _, source_length, target_length = generate_basic_params( + mode="decoder_layer") + + # batch_size, source_length, target_length, d_model, n_head = 4, 8, 8, 64, 8 + with fluid.dygraph.guard(fluid.CPUPlace()): + transformer = Transformer( + d_model, + n_head, + dim_feedforward=dim_feedforward, + dropout=dropout, + weight_attr=[None, None, None], + bias_attr=[False, False, True]) + src = paddle.to_variable( + np.random.rand(batch_size, source_length, d_model).astype( + "float32")) + tgt = paddle.to_variable( + np.random.rand(batch_size, target_length, d_model).astype( + "float32")) + src_mask = np.zeros((batch_size, n_head, source_length, + source_length)).astype("float32") + src_mask[0][0][0][0] = -np.inf + src_mask = paddle.to_variable(src_mask) + tgt_mask = 
np.zeros((batch_size, n_head, target_length, + target_length)).astype("float32") + tgt_mask[0][0][0][0] = -1e9 + memory_mask = np.zeros((batch_size, n_head, target_length, + source_length)).astype("float32") + memory_mask[0][0][0][0] = -1e9 + tgt_mask, memory_mask = paddle.to_variable( + tgt_mask), paddle.to_variable(memory_mask) + trans_output = transformer(src, tgt, src_mask, tgt_mask, + memory_mask) + + def test_transformer_attr_boolean(self): + batch_size, d_model, n_head, dim_feedforward, dropout, _, _, source_length, target_length = generate_basic_params( + mode="decoder_layer") + + # batch_size, source_length, target_length, d_model, n_head = 4, 8, 8, 64, 8 + with fluid.dygraph.guard(fluid.CPUPlace()): + transformer = Transformer( + d_model, + n_head, + dim_feedforward=dim_feedforward, + dropout=dropout, + bias_attr=False) + src = paddle.to_variable( + np.random.rand(batch_size, source_length, d_model).astype( + "float32")) + tgt = paddle.to_variable( + np.random.rand(batch_size, target_length, d_model).astype( + "float32")) + src_mask = np.zeros((batch_size, n_head, source_length, + source_length)).astype("float32") + src_mask[0][0][0][0] = -np.inf + src_mask = paddle.to_variable(src_mask) + tgt_mask = np.zeros((batch_size, n_head, target_length, + target_length)).astype("float32") + tgt_mask[0][0][0][0] = -1e9 + memory_mask = np.zeros((batch_size, n_head, target_length, + source_length)).astype("float32") + memory_mask[0][0][0][0] = -1e9 + tgt_mask, memory_mask = paddle.to_variable( + tgt_mask), paddle.to_variable(memory_mask) + trans_output = transformer(src, tgt, src_mask, tgt_mask, + memory_mask) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_translated_layer.py b/python/paddle/fluid/tests/unittests/test_translated_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..e5dc279750d3d9605aeba1d27dbb84a35cf31921 --- /dev/null +++ 
b/python/paddle/fluid/tests/unittests/test_translated_layer.py @@ -0,0 +1,188 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.nn as nn +import paddle.optimizer as opt + +BATCH_SIZE = 16 +BATCH_NUM = 4 +EPOCH_NUM = 4 +SEED = 10 + +IMAGE_SIZE = 784 +CLASS_NUM = 10 + + +# define a random dataset +class RandomDataset(paddle.io.Dataset): + def __init__(self, num_samples): + self.num_samples = num_samples + + def __getitem__(self, idx): + np.random.seed(SEED) + image = np.random.random([IMAGE_SIZE]).astype('float32') + label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64') + return image, label + + def __len__(self): + return self.num_samples + + +class LinearNet(nn.Layer): + def __init__(self): + super(LinearNet, self).__init__() + self._linear = nn.Linear(IMAGE_SIZE, CLASS_NUM) + + @paddle.jit.to_static(input_spec=[ + paddle.static.InputSpec( + shape=[None, IMAGE_SIZE], dtype='float32', name='x') + ]) + def forward(self, x): + return self._linear(x) + + +def train(layer, loader, loss_fn, opt): + for epoch_id in range(EPOCH_NUM): + for batch_id, (image, label) in enumerate(loader()): + out = layer(image) + loss = loss_fn(out, label) + loss.backward() + opt.step() + opt.clear_grad() + print("Epoch {} batch {}: loss = {}".format(epoch_id, batch_id, + np.mean(loss.numpy()))) + return loss + 
+ +class TestTranslatedLayer(unittest.TestCase): + def setUp(self): + # enable dygraph mode + place = paddle.CPUPlace() + paddle.disable_static(place) + + # config seed + paddle.manual_seed(SEED) + paddle.framework.random._manual_program_seed(SEED) + + # create network + self.layer = LinearNet() + self.loss_fn = nn.CrossEntropyLoss() + self.sgd = opt.SGD(learning_rate=0.001, + parameters=self.layer.parameters()) + + # create data loader + dataset = RandomDataset(BATCH_NUM * BATCH_SIZE) + self.loader = paddle.io.DataLoader( + dataset, + places=place, + batch_size=BATCH_SIZE, + shuffle=True, + drop_last=True, + num_workers=2) + + # train + train(self.layer, self.loader, self.loss_fn, self.sgd) + + # save + self.model_path = "linear.example.model" + paddle.jit.save(self.layer, self.model_path) + + def test_inference_and_fine_tuning(self): + self.load_and_inference() + self.load_and_fine_tuning() + + def load_and_inference(self): + # load + translated_layer = paddle.jit.load(self.model_path) + + # inference + x = paddle.randn([1, IMAGE_SIZE], 'float32') + + self.layer.eval() + orig_pred = self.layer(x) + + translated_layer.eval() + pred = translated_layer(x) + + self.assertTrue(np.array_equal(orig_pred.numpy(), pred.numpy())) + + def load_and_fine_tuning(self): + # load + translated_layer = paddle.jit.load(self.model_path) + + # train original layer continue + self.layer.train() + orig_loss = train(self.layer, self.loader, self.loss_fn, self.sgd) + + # fine-tuning + translated_layer.train() + sgd = opt.SGD(learning_rate=0.001, + parameters=translated_layer.parameters()) + loss = train(translated_layer, self.loader, self.loss_fn, sgd) + + self.assertTrue( + np.array_equal(orig_loss.numpy(), loss.numpy()), + msg="original loss:\n{}\nnew loss:\n{}\n".format(orig_loss.numpy(), + loss.numpy())) + + def test_get_program(self): + # load + translated_layer = paddle.jit.load(self.model_path) + + program = translated_layer.program() + self.assertTrue(isinstance(program, 
paddle.static.Program)) + + def test_get_program_method_not_exists(self): + # load + translated_layer = paddle.jit.load(self.model_path) + + with self.assertRaises(ValueError): + program = translated_layer.program('not_exists') + + def test_get_input_spec(self): + # load + translated_layer = paddle.jit.load(self.model_path) + + expect_spec = [ + paddle.static.InputSpec( + shape=[None, IMAGE_SIZE], dtype='float32', name='x') + ] + actual_spec = translated_layer._input_spec() + + for spec_x, spec_y in zip(expect_spec, actual_spec): + self.assertEqual(spec_x, spec_y) + + def test_get_output_spec(self): + # load + translated_layer = paddle.jit.load(self.model_path) + + expect_spec = [ + paddle.static.InputSpec( + shape=[None, CLASS_NUM], + dtype='float32', + name='translated_layer/scale_0.tmp_1') + ] + actual_spec = translated_layer._output_spec() + + for spec_x, spec_y in zip(expect_spec, actual_spec): + self.assertEqual(spec_x, spec_y) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_transpose_op.py b/python/paddle/fluid/tests/unittests/test_transpose_op.py index d5d1fdc5b20b9d786f1861d63bb4a646117b80ed..56333211469db5705d04cc5ca253bf01679190a5 100644 --- a/python/paddle/fluid/tests/unittests/test_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/test_transpose_op.py @@ -99,6 +99,18 @@ class TestCase7(TestTransposeOp): self.axis = (0, 1, 3, 2) +class TestCase8(TestTransposeOp): + def initTestCase(self): + self.shape = (2, 3, 2, 3, 2, 4, 3, 3) + self.axis = (0, 1, 3, 2, 4, 5, 6, 7) + + +class TestCase9(TestTransposeOp): + def initTestCase(self): + self.shape = (2, 3, 2, 3, 2, 4, 3, 3) + self.axis = (6, 1, 3, 5, 0, 2, 4, 7) + + class TestTransposeOpError(unittest.TestCase): def test_errors(self): with program_guard(Program(), Program()): diff --git a/python/paddle/fluid/tests/unittests/test_tril_triu_op.py b/python/paddle/fluid/tests/unittests/test_tril_triu_op.py index 
aed265b21b5781d88da0380b04872061e893d736..2cd2599f2ea2f4fb26b2d2730ca45384a3b664a7 100644 --- a/python/paddle/fluid/tests/unittests/test_tril_triu_op.py +++ b/python/paddle/fluid/tests/unittests/test_tril_triu_op.py @@ -142,6 +142,18 @@ class TestTrilTriuOpAPI(unittest.TestCase): self.assertTrue(np.allclose(tril_out, np.tril(data))) self.assertTrue(np.allclose(triu_out, np.triu(data))) + def test_fluid_api(self): + data = np.random.random([1, 9, 9, 4]).astype('float32') + x = fluid.data(shape=[1, 9, -1, 4], dtype='float32', name='x') + triu_out = fluid.layers.triu(x) + + place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + exe = fluid.Executor(place) + triu_out = exe.run(fluid.default_main_program(), + feed={"x": data}, + fetch_list=[triu_out]) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_trilinear_interp_v2_op.py b/python/paddle/fluid/tests/unittests/test_trilinear_interp_v2_op.py index 49924b44441aa9ae323f0d7921d71bf58b8c2cf2..245c2623b869af30acfb5d0379c7597813645031 100755 --- a/python/paddle/fluid/tests/unittests/test_trilinear_interp_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_trilinear_interp_v2_op.py @@ -26,6 +26,9 @@ def trilinear_interp_np(input, out_d, out_h, out_w, + scale_d=0, + scale_h=0, + scale_w=0, out_size=None, actual_shape=None, align_corners=True, @@ -49,17 +52,26 @@ def trilinear_interp_np(input, if (align_corners): ratio_d = (in_d - 1.0) / (out_d - 1.0) else: - ratio_d = 1.0 * in_d / out_d + if scale_d > 0: + ratio_d = 1.0 / scale_d + else: + ratio_d = 1.0 * in_d / out_d if out_h > 1: if (align_corners): ratio_h = (in_h - 1.0) / (out_h - 1.0) else: - ratio_h = 1.0 * in_h / out_h + if scale_h > 0: + ratio_h = 1.0 / scale_h + else: + ratio_h = 1.0 * in_h / out_h if out_w > 1: if (align_corners): ratio_w = (in_w - 1.0) / (out_w - 1.0) else: - ratio_w = 1.0 * in_w / out_w + if scale_w > 0: + ratio_w = 1.0 / scale_w + else: + ratio_w = 1.0 * 
in_w / out_w out = np.zeros((batch_size, channel, out_d, out_h, out_w)) @@ -133,6 +145,9 @@ class TestTrilinearInterpOp(OpTest): self.op_type = "trilinear_interp_v2" input_np = np.random.random(self.input_shape).astype("float32") + scale_w = 0 + scale_h = 0 + scale_d = 0 if self.data_layout == "NCDHW": in_d = self.input_shape[2] in_h = self.input_shape[3] @@ -159,9 +174,10 @@ class TestTrilinearInterpOp(OpTest): out_h = self.out_h out_w = self.out_w - output_np = trilinear_interp_np( - input_np, out_d, out_h, out_w, self.out_size, self.actual_shape, - self.align_corners, self.align_mode, self.data_layout) + output_np = trilinear_interp_np(input_np, out_d, out_h, out_w, scale_d, + scale_h, scale_w, self.out_size, + self.actual_shape, self.align_corners, + self.align_mode, self.data_layout) self.inputs = {'X': input_np} if self.out_size is not None: self.inputs['OutSize'] = self.out_size @@ -359,7 +375,7 @@ class TestTrilinearInterpOpUint8(OpTest): out_h = self.out_h out_w = self.out_w - output_np = trilinear_interp_np(input_np, out_d, out_h, out_w, + output_np = trilinear_interp_np(input_np, out_d, out_h, out_w, 0, 0, 0, self.out_size, self.actual_shape, self.align_corners, self.align_mode) self.inputs = {'X': input_np} @@ -482,7 +498,7 @@ class TestTrilinearInterpZero(TestTrilinearInterpOp): self.out_d = 60 self.out_h = 40 self.out_w = 25 - self.scale = 0.2 + self.scale = 0.0 self.align_corners = False self.align_mode = 0 @@ -541,7 +557,7 @@ class TestTrilinearInterpOp_attr_tensor(OpTest): if isinstance(self.scale, list) and len(self.scale) == 1: self.scale = [self.scale[0], self.scale[0], self.scale[0]] self.attrs['scale'] = self.scale - output_np = trilinear_interp_np(input_np, out_d, out_h, out_w, + output_np = trilinear_interp_np(input_np, out_d, out_h, out_w, 0, 0, 0, self.out_size, self.actual_shape, self.align_corners, self.align_mode) self.outputs = {'Out': output_np} diff --git a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py 
b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py index 56dc27a9a5b136829ce410b50998e23b77510665..5ecf25c53b794f07e298b986eff5700698b8bff7 100644 --- a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py +++ b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py @@ -239,12 +239,12 @@ class TestUniformRandomOpSelectedRows(unittest.TestCase): op = Operator( "uniform_random", Out="X", - shape=[100, 784], + shape=[1000, 784], min=-5.0, max=10.0, seed=10) op.run(scope, place) - self.assertEqual(out.get_tensor().shape(), [100, 784]) + self.assertEqual(out.get_tensor().shape(), [1000, 784]) hist, prob = output_hist(np.array(out.get_tensor())) self.assertTrue( np.allclose( @@ -260,15 +260,15 @@ class TestUniformRandomOpSelectedRowsWithDiagInit( op = Operator( "uniform_random", Out="X", - shape=[100, 784], + shape=[500, 784], min=-5.0, max=10.0, seed=10, - diag_num=100, + diag_num=500, diag_step=784, diag_val=1.0) op.run(scope, place) - self.assertEqual(out.get_tensor().shape(), [100, 784]) + self.assertEqual(out.get_tensor().shape(), [500, 784]) hist, prob = output_hist_diag(np.array(out.get_tensor())) self.assertTrue( np.allclose( @@ -391,7 +391,7 @@ class TestUniformRandomOpSelectedRowsShapeTensor(unittest.TestCase): scope = core.Scope() out = scope.var("X").get_selected_rows() shape_tensor = scope.var("Shape").get_tensor() - shape_tensor.set(np.array([100, 784]).astype("int64"), place) + shape_tensor.set(np.array([1000, 784]).astype("int64"), place) paddle.manual_seed(10) op = Operator( "uniform_random", @@ -401,7 +401,7 @@ class TestUniformRandomOpSelectedRowsShapeTensor(unittest.TestCase): max=10.0, seed=10) op.run(scope, place) - self.assertEqual(out.get_tensor().shape(), [100, 784]) + self.assertEqual(out.get_tensor().shape(), [1000, 784]) hist, prob = output_hist(np.array(out.get_tensor())) self.assertTrue( np.allclose( @@ -423,7 +423,7 @@ class TestUniformRandomOpSelectedRowsShapeTensorList(unittest.TestCase): scope = core.Scope() 
out = scope.var("X").get_selected_rows() shape_1 = scope.var("shape1").get_tensor() - shape_1.set(np.array([100]).astype("int64"), place) + shape_1.set(np.array([1000]).astype("int64"), place) shape_2 = scope.var("shape2").get_tensor() shape_2.set(np.array([784]).astype("int64"), place) paddle.manual_seed(10) @@ -435,7 +435,7 @@ class TestUniformRandomOpSelectedRowsShapeTensorList(unittest.TestCase): max=10.0, seed=10) op.run(scope, place) - self.assertEqual(out.get_tensor().shape(), [100, 784]) + self.assertEqual(out.get_tensor().shape(), [1000, 784]) hist, prob = output_hist(np.array(out.get_tensor())) self.assertTrue( np.allclose( @@ -540,24 +540,24 @@ class TestUniformDtype(unittest.TestCase): def test_default_dtype(self): paddle.disable_static() - def test_default_fp_16(): + def test_default_fp16(): paddle.framework.set_default_dtype('float16') paddle.tensor.random.uniform([2, 3]) - self.assertRaises(TypeError, test_default_fp_16) + self.assertRaises(TypeError, test_default_fp16) - def test_default_fp_32(): + def test_default_fp32(): paddle.framework.set_default_dtype('float32') out = paddle.tensor.random.uniform([2, 3]) self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP32) - def test_default_fp_64(): + def test_default_fp64(): paddle.framework.set_default_dtype('float64') out = paddle.tensor.random.uniform([2, 3]) self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP64) - test_default_fp_64() - test_default_fp_32() + test_default_fp64() + test_default_fp32() paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py index 9382d53e7fec6ba9e1217f99ba5006b3dfe5c150..1975e4306026ee459aa585c47afa74fce6a6aeed 100644 --- a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py +++ b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py @@ -134,29 +134,61 @@ class API_TestUnsqueeze3(unittest.TestCase): result1, = exe.run(feed={"data1": input, "data2": input2}, 
fetch_list=[result_squeeze]) - self.assertTrue(np.allclose(input1, result1)) + self.assertTrue(np.array_equal(input1, result1)) + self.assertEqual(input1.shape, result1.shape) class API_TestDyUnsqueeze(unittest.TestCase): def test_out(self): with fluid.dygraph.guard(): input_1 = np.random.random([5, 1, 10]).astype("int32") - input1 = np.squeeze(input_1, axis=1) + input1 = np.expand_dims(input_1, axis=1) input = fluid.dygraph.to_variable(input_1) output = paddle.unsqueeze(input, axis=[1]) out_np = output.numpy() - self.assertTrue(np.allclose(input1, out_np)) + self.assertTrue(np.array_equal(input1, out_np)) + self.assertEqual(input1.shape, out_np.shape) class API_TestDyUnsqueeze2(unittest.TestCase): def test_out(self): with fluid.dygraph.guard(): - input_1 = np.random.random([5, 1, 10]).astype("int32") - input1 = np.squeeze(input_1, axis=1) - input = fluid.dygraph.to_variable(input_1) + input1 = np.random.random([5, 10]).astype("int32") + out1 = np.expand_dims(input1, axis=1) + input = fluid.dygraph.to_variable(input1) output = paddle.unsqueeze(input, axis=1) out_np = output.numpy() - self.assertTrue(np.allclose(input1, out_np)) + self.assertTrue(np.array_equal(out1, out_np)) + self.assertEqual(out1.shape, out_np.shape) + + +class API_TestDyUnsqueezeAxisTensor(unittest.TestCase): + def test_out(self): + with fluid.dygraph.guard(): + input1 = np.random.random([5, 10]).astype("int32") + out1 = np.expand_dims(input1, axis=1) + out1 = np.expand_dims(out1, axis=2) + input = fluid.dygraph.to_variable(input1) + output = paddle.unsqueeze(input, axis=paddle.to_tensor([1, 2])) + out_np = output.numpy() + self.assertTrue(np.array_equal(out1, out_np)) + self.assertEqual(out1.shape, out_np.shape) + + +class API_TestDyUnsqueezeAxisTensorList(unittest.TestCase): + def test_out(self): + with fluid.dygraph.guard(): + input1 = np.random.random([5, 10]).astype("int32") + # Actually, expand_dims supports tuple since version 1.18.0 + out1 = np.expand_dims(input1, axis=1) + out1 = 
np.expand_dims(out1, axis=2) + input = fluid.dygraph.to_variable(input1) + output = paddle.unsqueeze( + fluid.dygraph.to_variable(input1), + axis=[paddle.to_tensor([1]), paddle.to_tensor([2])]) + out_np = output.numpy() + self.assertTrue(np.array_equal(out1, out_np)) + self.assertEqual(out1.shape, out_np.shape) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_update_loss_scaling_op.py b/python/paddle/fluid/tests/unittests/test_update_loss_scaling_op.py new file mode 100644 index 0000000000000000000000000000000000000000..fb93334415c3046362090a143f6c15069793709a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_update_loss_scaling_op.py @@ -0,0 +1,250 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import numpy as np +from op_test import OpTest +import paddle.fluid as fluid +import paddle.fluid.contrib.mixed_precision.amp_nn as amp_nn + + +class TestUpdateLossScalingOp(OpTest): + def setUp(self): + self.op_type = "update_loss_scaling" + self.init() + found_inf = np.array([False], dtype=np.bool) + x = np.random.random((1024, 1024)).astype(self.dtype) + + self.inputs = { + 'X': [('x0', x)], + 'FoundInfinite': found_inf, + 'PrevLossScaling': self.prev_loss_scaling, + 'InGoodSteps': self.num_good_steps, + 'InBadSteps': self.num_bad_steps + } + + self.outputs = { + 'Out': [('out0', np.zeros_like(x))], + 'LossScaling': self.prev_loss_scaling * self.incr_ratio, + 'OutGoodSteps': self.zero_steps, + 'OutBadSteps': self.zero_steps + } + + def init(self): + self.incr_ratio = 2.0 + self.decr_ratio = 0.8 + self.dtype = np.float32 + self.prev_loss_scaling = np.array([2048]).astype(self.dtype) + self.num_good_steps = np.array([999], dtype=np.int32) + self.num_bad_steps = np.array([1], dtype=np.int32) + self.zero_steps = np.array([0], dtype=np.int32) + self.attrs = { + 'incr_every_n_steps': 1000, + 'decr_every_n_nan_or_inf': 2, + 'incr_ratio': self.incr_ratio, + 'decr_ratio': self.decr_ratio, + } + + def test_check_output(self): + self.check_output(no_check_set=['Out']) + + +class TestUpdateLossScalingOpBad(TestUpdateLossScalingOp): + def setUp(self): + self.op_type = "update_loss_scaling" + self.init() + found_inf = np.array([True], dtype=np.bool) + x = np.random.random((1024, 1024)).astype(self.dtype) + i = np.random.randint(0, 1024, 1) + j = np.random.randint(0, 1024, 1) + x[i[0]][j[0]] = np.inf + + self.inputs = { + 'X': [('x0', x)], + 'FoundInfinite': found_inf, + 'PrevLossScaling': self.prev_loss_scaling, + 'InGoodSteps': self.num_good_steps, + 'InBadSteps': self.num_bad_steps + } + + self.outputs = { + 'Out': [('out0', np.zeros_like(x))], + 'LossScaling': self.prev_loss_scaling * self.decr_ratio, + 'OutGoodSteps': self.zero_steps, + 'OutBadSteps': 
self.zero_steps + } + + def test_check_output(self): + self.check_output() + + +class TestUpdateLossScalingLayer(unittest.TestCase): + def loss_scaling_check(self, use_cuda=True, scope=fluid.Scope()): + a = fluid.data(name="a", shape=[1024, 1024], dtype='float32') + b = fluid.data(name="b", shape=[512, 128], dtype='float32') + x = [a, b] + found_inf = fluid.data(name="found_inf", shape=[1], dtype='bool') + prev_loss_scaling = fluid.data( + name="prev_loss_scaling", shape=[1], dtype='float32') + num_good_steps = fluid.data( + name="num_good_steps", shape=[1], dtype='int32') + num_bad_steps = fluid.data( + name="num_bad_steps", shape=[1], dtype='int32') + + a_v = np.random.random([1024, 1024]).astype('float32') + b_v = np.random.random([512, 128]).astype('float32') + found_inf_v = np.array([False]).astype('bool') + prev_loss_scaling_v = np.array([2048]).astype('float32') + num_good_steps_v = np.array([999], dtype=np.int32) + num_bad_steps_v = np.array([1], dtype=np.int32) + + incr_every_n_steps = 1000 + decr_every_n_nan_or_inf = 2 + incr_ratio = 2 + decr_ratio = 0.8 + + result = amp_nn.update_loss_scaling( + x, + found_inf, + prev_loss_scaling, + num_good_steps, + num_bad_steps, + incr_every_n_steps, + decr_every_n_nan_or_inf, + incr_ratio, + decr_ratio, + name="update_loss_scaling") + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + with fluid.scope_guard(scope): + exe.run(fluid.default_startup_program()) + result_v = exe.run(feed={ + 'a': a_v, + 'b': b_v, + 'found_inf': found_inf_v, + 'prev_loss_scaling': prev_loss_scaling_v, + 'num_good_steps': num_good_steps_v, + 'num_bad_steps': num_bad_steps_v + }, + fetch_list=[ + result, x, found_inf, prev_loss_scaling, + num_good_steps, num_bad_steps + ]) + assert np.array_equal(result_v[0], a_v) + assert np.array_equal(result_v[1], b_v) + assert np.array_equal(result_v[0], result_v[2]) + assert np.array_equal(result_v[1], result_v[3]) + assert np.array_equal(result_v[4], 
found_inf_v) + assert np.array_equal(result_v[5], prev_loss_scaling_v * incr_ratio) + assert np.array_equal(result_v[6], np.zeros_like(num_good_steps_v)) + assert np.array_equal(result_v[7], np.zeros_like(num_bad_steps_v)) + + def loss_scaling_check_inf(self, use_cuda=True, scope=fluid.Scope()): + a = fluid.data(name="a", shape=[1024, 1024], dtype='float32') + b = fluid.data(name="b", shape=[512, 128], dtype='float32') + x = [a, b] + found_inf = fluid.data(name="found_inf", shape=[1], dtype='bool') + prev_loss_scaling = fluid.data( + name="prev_loss_scaling", shape=[1], dtype='float32') + num_good_steps = fluid.data( + name="num_good_steps", shape=[1], dtype='int32') + num_bad_steps = fluid.data( + name="num_bad_steps", shape=[1], dtype='int32') + + a_v = np.random.random([1024, 1024]).astype('float32') + b_v = np.random.random([512, 128]).astype('float32') + i = np.random.randint(0, 1024, 1) + j = np.random.randint(0, 1024, 1) + a_v[i[0]][j[0]] = np.inf + found_inf_v = np.array([True]).astype('bool') + prev_loss_scaling_v = np.array([2048]).astype('float32') + num_good_steps_v = np.array([999], dtype=np.int32) + num_bad_steps_v = np.array([1], dtype=np.int32) + + incr_every_n_steps = 1000 + decr_every_n_nan_or_inf = 2 + incr_ratio = 2 + decr_ratio = 0.8 + + result = amp_nn.update_loss_scaling( + x, + found_inf, + prev_loss_scaling, + num_good_steps, + num_bad_steps, + incr_every_n_steps, + decr_every_n_nan_or_inf, + incr_ratio, + decr_ratio, + name="update_loss_scaling") + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + with fluid.scope_guard(scope): + exe.run(fluid.default_startup_program()) + result_v = exe.run(feed={ + 'a': a_v, + 'b': b_v, + 'found_inf': found_inf_v, + 'prev_loss_scaling': prev_loss_scaling_v, + 'num_good_steps': num_good_steps_v, + 'num_bad_steps': num_bad_steps_v + }, + fetch_list=[ + result, x, found_inf, prev_loss_scaling, + num_good_steps, num_bad_steps + ]) + assert 
np.array_equal(result_v[0], np.zeros_like(a_v)) + assert np.array_equal(result_v[1], np.zeros_like(b_v)) + assert np.array_equal(result_v[2], np.zeros_like(a_v)) + assert np.array_equal(result_v[3], np.zeros_like(b_v)) + assert np.array_equal(result_v[4], found_inf_v) + assert np.array_equal(result_v[5], prev_loss_scaling_v * decr_ratio) + assert np.array_equal(result_v[6], np.zeros_like(num_good_steps_v)) + assert np.array_equal(result_v[7], np.zeros_like(num_bad_steps_v)) + + def test_loss_scaling_cpu(self): + main = fluid.Program() + startup = fluid.Program() + with fluid.unique_name.guard(): + with fluid.program_guard(main, startup): + self.loss_scaling_check(use_cuda=False) + + def test_loss_scaling_cpu_inf(self): + main = fluid.Program() + startup = fluid.Program() + with fluid.unique_name.guard(): + with fluid.program_guard(main, startup): + self.loss_scaling_check_inf(use_cuda=False) + + def test_loss_scaling_gpu(self): + if fluid.core.is_compiled_with_cuda(): + main = fluid.Program() + startup = fluid.Program() + with fluid.unique_name.guard(): + with fluid.program_guard(main, startup): + self.loss_scaling_check(use_cuda=True) + + def test_loss_scaling_gpu_inf(self): + if fluid.core.is_compiled_with_cuda(): + main = fluid.Program() + startup = fluid.Program() + with fluid.unique_name.guard(): + with fluid.program_guard(main, startup): + self.loss_scaling_check_inf(use_cuda=True) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_var_base.py b/python/paddle/fluid/tests/unittests/test_var_base.py index c8383bb950d3ed7b2cdfafa185b0ad156bf7c7bf..deb49a3ffc2b5febf97680bc652e9695fb253373 100644 --- a/python/paddle/fluid/tests/unittests/test_var_base.py +++ b/python/paddle/fluid/tests/unittests/test_var_base.py @@ -33,16 +33,28 @@ class TestVarBase(unittest.TestCase): def _test_place(place): with fluid.dygraph.guard(): paddle.set_default_dtype('float32') + # set_default_dtype should not take effect on int x = 
paddle.to_tensor(1, place=place, stop_gradient=False) self.assertTrue(np.array_equal(x.numpy(), [1])) self.assertNotEqual(x.dtype, core.VarDesc.VarType.FP32) + # set_default_dtype should not take effect on numpy + x = paddle.to_tensor( + np.array([1.2]).astype('float16'), + place=place, + stop_gradient=False) + self.assertTrue( + np.array_equal(x.numpy(), np.array([1.2], 'float16'))) + self.assertEqual(x.dtype, core.VarDesc.VarType.FP16) + + # set_default_dtype take effect on float x = paddle.to_tensor(1.2, place=place, stop_gradient=False) self.assertTrue( np.array_equal(x.numpy(), np.array([1.2]).astype( 'float32'))) self.assertEqual(x.dtype, core.VarDesc.VarType.FP32) + # set_default_dtype take effect on complex x = paddle.to_tensor(1 + 2j, place=place, stop_gradient=False) self.assertTrue(np.array_equal(x.numpy(), [1 + 2j])) self.assertEqual(x.dtype, 'complex64') diff --git a/python/paddle/fluid/tests/unittests/white_list/check_shape_white_list.py b/python/paddle/fluid/tests/unittests/white_list/check_shape_white_list.py index 227e6cc28fb4a6d05d73cbf2c3c92bda623b7d58..e19641e710dda6cd2614a75a3ca4b2f7ec1c0b58 100644 --- a/python/paddle/fluid/tests/unittests/white_list/check_shape_white_list.py +++ b/python/paddle/fluid/tests/unittests/white_list/check_shape_white_list.py @@ -26,4 +26,5 @@ NEED_TO_FIX_OP_LIST = [ 'squared_l2_distance', 'tree_conv', 'cvm', + 'cudnn_lstm', ] diff --git a/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py b/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py index 0de0eeb464ad700abb2144e49a822582b8653589..afd3414943e9c94799aba5e5e747182623b0a095 100644 --- a/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py +++ b/python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py @@ -25,6 +25,7 @@ no_check_set_white_list = [ 'unsqueeze2', 'cross_entropy2', 'seed', - 'amp_check_finite_and_scale', + 'check_finite_and_unscale', + 'update_loss_scaling', 
'cudnn_lstm', ] diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py index af788874191335ad31d1540bcc0db90cc12383c6..f33e4e0fca8727574bcd1970e26c6eaee2139a05 100644 --- a/python/paddle/framework/__init__.py +++ b/python/paddle/framework/__init__.py @@ -20,8 +20,8 @@ __all__ = [ ] __all__ += [ - 'grad', 'LayerList', 'load', 'save', 'to_variable', 'no_grad', - 'DataParallel' + 'grad', 'LayerList', 'load', 'save', 'SaveLoadConfig', 'to_variable', + 'no_grad', 'DataParallel' ] __all__ += [ @@ -50,6 +50,7 @@ from ..fluid.dygraph.base import to_variable #DEFINE_ALIAS from ..fluid.dygraph.base import grad #DEFINE_ALIAS from ..fluid.dygraph.checkpoint import load_dygraph as load #DEFINE_ALIAS from ..fluid.dygraph.checkpoint import save_dygraph as save #DEFINE_ALIAS +from ..fluid.dygraph.jit import SaveLoadConfig #DEFINE_ALIAS from ..fluid.dygraph.parallel import DataParallel #DEFINE_ALIAS from ..fluid.dygraph.learning_rate_scheduler import NoamDecay #DEFINE_ALIAS diff --git a/python/paddle/framework/random.py b/python/paddle/framework/random.py index 2555d24464112ed8446d863dc8e65cfa37680b36..ba2cf603d4a69f118320e40f1f953cb4c5fcfb39 100644 --- a/python/paddle/framework/random.py +++ b/python/paddle/framework/random.py @@ -16,7 +16,7 @@ import paddle.fluid as fluid from paddle.fluid import core -__all__ = ['manual_seed'] +__all__ = ['manual_seed', 'get_cuda_rng_state', 'set_cuda_rng_state'] def manual_seed(seed): @@ -42,10 +42,69 @@ def manual_seed(seed): seed = int(seed) + if core.is_compiled_with_cuda(): + for i in range(core.get_cuda_device_count()): + core.default_cuda_generator(i)._is_init_py = True + core.default_cuda_generator(i).manual_seed(seed) + core.default_cpu_generator()._is_init_py = True return core.default_cpu_generator().manual_seed(seed) +def get_cuda_rng_state(): + """ + + Get random state of cuda generators. + + Args: + None + + Returns: + GeneratorState: object. + + Examples: + .. 
code-block:: python + + import paddle + sts = paddle.get_cuda_rng_state() + + """ + state_list = [] + if core.is_compiled_with_cuda(): + for i in range(core.get_cuda_device_count()): + state_list.append(core.default_cuda_generator(i).get_state()) + + return state_list + + +def set_cuda_rng_state(state_list): + """ + + Sets generator state for all cuda generators + + Args: + state_list(list): The cuda states to set back to cuda generators. state_list is obtained from get_cuda_rng_state(). + + Returns: + None + + Examples: + .. code-block:: python + + import paddle + sts = paddle.get_cuda_rng_state() + paddle.set_cuda_rng_state(sts) + + """ + if core.is_compiled_with_cuda(): + if not len(state_list) == core.get_cuda_device_count(): + raise ValueError( + "Length of cuda state list shoule be equal to the cuda device count" + ) + for i in range(core.get_cuda_device_count()): + core.default_cuda_generator(i).set_state(state_list[i]) + + def _manual_program_seed(seed): """ Sets global seed for generating random numbers. diff --git a/python/paddle/hapi/__init__.py b/python/paddle/hapi/__init__.py index 87f5a82525cdfa36e48d40c6d12488d359fe99db..67965de5d97621e188acfa1e0384325b9ec5b7aa 100644 --- a/python/paddle/hapi/__init__.py +++ b/python/paddle/hapi/__init__.py @@ -14,14 +14,12 @@ from . import logger from . import callbacks +from . import model_summary from . import model from .model import * - -from .dygraph_layer_patch import monkey_patch_layer +from .model_summary import summary logger.setup_logger() -__all__ = ['callbacks'] + model.__all__ - -monkey_patch_layer() +__all__ = ['callbacks'] + model.__all__ + ['summary'] diff --git a/python/paddle/hapi/dygraph_layer_patch.py b/python/paddle/hapi/dygraph_layer_patch.py deleted file mode 100644 index e3a2948b69305fcb08c14c850f5738ac46aea2be..0000000000000000000000000000000000000000 --- a/python/paddle/hapi/dygraph_layer_patch.py +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import warnings - -import paddle.fluid as fluid -from paddle.fluid.framework import in_dygraph_mode -from paddle.fluid.framework import _current_expected_place as _get_device - - -def monkey_patch_layer(): - def load_dict(self, - stat_dict, - include_sublayers=True, - use_structured_name=True): - ''' - Set parameters from stat_dict. All the parameters will be reset by the - tensor in the stat_dict - - This api will be Deprecated. Please use set_dict - - Parameters: - state_dict(dict) : Dict contains all the parameters - include_sublayers(bool, optional) : If true, also include the - parameters from sublayers. Default: True - use_structured_name(bool, optional) : If true, use structured name - as key, otherwise, use parameter name as key. Default: True - Returns: - None - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - with fluid.dygraph.guard(): - emb = fluid.dygraph.Embedding([10, 10]) - - state_dict = emb.state_dict() - fluid.save_dygraph( state_dict, "paddle_dy") - - para_state_dict, _ = fluid.load_dygraph( "paddle_dy") - emb.load_dict( para_state_dict ) - - ''' - - def _check_match(key, param): - state = stat_dict.get(key, None) - if state is None: - raise ValueError( - "{} is not found in the providing file.".format(key)) - if list(state.shape) != list(param.shape): - raise ValueError( - "{} receives a shape {}, but the expected shape is {}.". 
- format(key, list(state.shape), list(param.shape))) - return param, state - - matched_param_state = [] - for key, param in self.state_dict().items(): - key_name = key if use_structured_name else param.name - try: - match_res = _check_match(key_name, param) - matched_param_state.append(match_res) - except ValueError as err: - warnings.warn(("Skip loading for {}. ".format(key) + str(err))) - - if in_dygraph_mode(): - for param, state in matched_param_state: - param.set_value(state) - else: - - def _set_var(var, ndarray): - t = fluid.global_scope().find_var(var.name).get_tensor() - p = t._place() - if p.is_cpu_place(): - place = fluid.CPUPlace() - elif p.is_cuda_pinned_place(): - place = fluid.CUDAPinnedPlace() - else: - p = fluid.core.Place() - p.set_place(t._place()) - place = fluid.CUDAPlace(p.gpu_device_id()) - t.set(ndarray, place) - - executor = fluid.Executor(_get_device())._default_executor - # restore parameter states - fluid.core._create_loaded_parameter( - [param for param, state in matched_param_state], - fluid.global_scope(), executor) - for param, state in matched_param_state: - _set_var(param, state) - - setattr(fluid.dygraph.Layer, 'load_dict', load_dict) diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py index 5aa689ca324c099f239a29e2ee21b8283e378341..d41852c9d7f4f5812a852d6a5c644e75d137f530 100644 --- a/python/paddle/hapi/model.py +++ b/python/paddle/hapi/model.py @@ -47,10 +47,10 @@ from paddle.io import DataLoader, Dataset, DistributedBatchSampler from paddle.fluid.executor import scope_guard, Executor from paddle.fluid.dygraph.layers import Layer from paddle.metric import Metric - from paddle.static import InputSpec as Input from .callbacks import config_callbacks +from .model_summary import summary __all__ = ['Model', ] @@ -731,8 +731,8 @@ class DynamicGraphAdapter(object): if not self.model._optimizer or not optim_state: return - # If optimizer performs set_dict when state vars haven't been created, - # which would happen 
when set_dict before minimize, the state would be + # If optimizer performs set_state_dict when state vars haven't been created, + # which would happen when set_state_dict before minimize, the state would be # stored in optimizer._accumulators_holder and loaded lazily. # To contrive this when loading from static-graph saved states, extend # state dict to include keys named accoring to dygraph naming rules. @@ -776,7 +776,13 @@ class DynamicGraphAdapter(object): accum_name + "_0") converted_state[dy_state_name] = state_var - self.model._optimizer.set_dict(converted_state) + if not hasattr(self.model._optimizer, 'set_state_dict'): + warnings.warn( + "paddle.fluid.optimizer is deprecated in API 2.0, please use paddle.optimizer instead" + ) + self.model._optimizer.set_dict(converted_state) + else: + self.model._optimizer.set_state_dict(converted_state) class Model(object): @@ -786,15 +792,14 @@ class Model(object): switched by `paddle.disable_static()`. The usage is as follows. But note, the switching between dynamic and static should be before instantiating a Model. The input description, i.e, paddle.static.InputSpec, - must be required for static graph. + must be required. Args: network (paddle.nn.Layer): The network is an instance of paddle.nn.Layer. inputs (InputSpec|list|dict|None): `inputs`, entry points of network, could be a InputSpec instance, or lits of InputSpec instances, - or dict ({name: InputSpec}), or None. For static graph, - inputs must be set. For dynamic graph, it could be None. + or dict ({name: InputSpec}), and it couldn't be None. labels (InputSpec|list|None): `labels`, entry points of network, could be a InputSpec instnace or lits of InputSpec instances, or None. 
For static graph, if labels is required in loss, @@ -843,10 +848,9 @@ class Model(object): self._optimizer = None self._test_dataloader = None - if not in_dygraph_mode(): - if not isinstance(inputs, (list, dict, Input)): - raise TypeError( - "'inputs' must be list or dict in static graph mode") + if not isinstance(inputs, (list, dict, Input)): + raise TypeError( + "'inputs' must be list or dict, and couldn't be None.") self._inputs = self._verify_spec(inputs, True) self._labels = self._verify_spec(labels) @@ -998,11 +1002,7 @@ class Model(object): have no variable need to save (like SGD), the fill will not generated). This function will silently overwrite existing file at the target location. - If `training` is set to False, only inference model will be saved. It - should be noted that before using `save`, you should run the model, and - the shape of input you saved is as same as the input of its running. - `@paddle.jit.to_static` must be added on `forward` function of your layer - in dynamic mode now and these will be optimized later. + If `training` is set to False, only inference model will be saved. Args: path (str): The file prefix to save model. The format is @@ -1031,8 +1031,6 @@ class Model(object): nn.Linear(200, 10), nn.Softmax()) - # If save for inference in dygraph, need this - @paddle.jit.to_static def forward(self, x): return self.net(x) @@ -1040,7 +1038,7 @@ class Model(object): device = paddle.set_device('cpu') # if use static graph, do not set paddle.disable_static(device) if dynamic else None - # inputs and labels are not required for dynamic graph. + input = InputSpec([None, 784], 'float32', 'x') label = InputSpec([None, 1], 'int64', 'label') model = paddle.Model(Mnist(), input, label) @@ -1643,10 +1641,6 @@ class Model(object): model_only=False): """ Save inference model can be in static or dynamic mode. 
- It should be noted that before using `save_inference_model`, you should - run the model, and the shape you saved is as same as the input of its - running. `@paddle.jit.to_static` must be added on `forward` function of - your layer in dynamic mode now and these will be optimized later. Args: save_dir (str): The directory path to save the inference model. @@ -1672,20 +1666,17 @@ class Model(object): return result_list - # TODO: - # 1. Make it Unnecessary to run model before calling `save_inference_model` for users in dygraph. - # 2. Save correct shape of input, now the interface stores the shape that the user sent to - # the inputs of the model in running. - # 3. Make it Unnecessary to add `@paddle.jit.to_static` for users in dynamic mode. if fluid.in_dygraph_mode(): with fluid.framework._dygraph_guard(None): layer = self.network + layer.forward = paddle.jit.to_static( + layer.forward, input_spec=self._inputs) # 1. input check prog_translator = ProgramTranslator() - if not prog_translator.enable_declarative: + if not prog_translator.enable_to_static: raise RuntimeError( - "save_inference_model doesn't work when setting ProgramTranslator.enable=False." + "save_inference_model doesn't work when setting ProgramTranslator.enable to False." ) if not isinstance(layer, Layer): raise TypeError( @@ -1822,21 +1813,58 @@ class Model(object): return logs, outputs return logs + def summary(self, input_size=None, batch_size=None, dtype=None): + """Prints a string summary of the network. + + Args: + input_size (tuple|InputSpec|list[tuple|InputSpec], optional): size of input tensor. + if not set, input_size will get from ``self._inputs`` if network only have + one input, input_size can be tuple or InputSpec. if model have multiple + input, input_size must be a list which contain every input's shape. + Default: None. + batch_size (int, optional): batch size of input tensor, Default: None. + dtypes (str, optional): if dtypes is None, 'float32' will be used, Default: None. 
+ + Returns: + Dict: a summary of the network including total params and total trainable params. + + Examples: + .. code-block:: python + + import paddle + from paddle.static import InputSpec + + dynamic = True + device = paddle.set_device('cpu') + paddle.disable_static(device) if dynamic else None + + input = InputSpec([None, 1, 28, 28], 'float32', 'image') + label = InputSpec([None, 1], 'int64', 'label') + + model = paddle.Model(paddle.vision.LeNet(classifier_activation=None), + input, label) + optim = paddle.optimizer.Adam( + learning_rate=0.001, parameters=model.parameters()) + model.prepare( + optim, + paddle.nn.CrossEntropyLoss()) + + params_info = model.summary() + print(params_info) + + """ + assert (input_size is not None or self._inputs is not None + ), "'input_size' or 'self._input' must be set" + if input_size is not None: + _input_size = input_size + else: + _input_size = self._inputs + return summary(self.network, _input_size, batch_size, dtype) + def _verify_spec(self, specs, is_input=False): out_specs = [] - if specs is None: - # Note(Aurelius84): If not specific specs of `Input`, using argument names of `forward` function - # to generate `Input`. But how can we know the actual shape of each input tensor? - if is_input: - out_specs = [ - Input( - name=n, shape=[None]) - for n in extract_args(self.network.forward) if n != 'self' - ] - else: - out_specs = to_list(specs) - elif isinstance(specs, dict): + if isinstance(specs, dict): assert is_input == False out_specs = [specs[n] \ for n in extract_args(self.network.forward) if n != 'self'] @@ -1848,8 +1876,8 @@ class Model(object): assert isinstance(spec, Input) if spec.name is None: raise ValueError( - "Requires Input[{}].name != None, but receive `None` with {}.". - format(i, spec)) + "Requires Input[{}].name != None, but receive `None` with {}." 
+ .format(i, spec)) return out_specs diff --git a/python/paddle/hapi/model_summary.py b/python/paddle/hapi/model_summary.py new file mode 100644 index 0000000000000000000000000000000000000000..d388ba62f2a244f84497810739e5fd6b50f669d2 --- /dev/null +++ b/python/paddle/hapi/model_summary.py @@ -0,0 +1,268 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings +import numpy as np +import numbers + +import paddle +import paddle.nn as nn +from paddle.static import InputSpec + +from collections import OrderedDict + +__all__ = ['summary'] + + +def summary(net, input_size, batch_size=None, dtypes=None): + """Prints a string summary of the network. + + Args: + net (Layer): the network which must be a subinstance of Layer. + input_size (tuple|InputSpec|list[tuple|InputSpec]): size of input tensor. if model only + have one input, input_size can be tuple or InputSpec. if model + have multiple input, input_size must be a list which contain + every input's shape. + batch_size (int, optional): batch size of input tensor, Default: None. + dtypes (str, optional): if dtypes is None, 'float32' will be used, Default: None. + + Returns: + Dict: a summary of the network including total params and total trainable params. + + Examples: + .. 
code-block:: python + + import paddle + import paddle.nn as nn + + class LeNet(nn.Layer): + def __init__(self, num_classes=10): + super(LeNet, self).__init__() + self.num_classes = num_classes + self.features = nn.Sequential( + nn.Conv2d( + 1, 6, 3, stride=1, padding=1), + nn.ReLU(), + nn.MaxPool2d(2, 2), + nn.Conv2d( + 6, 16, 5, stride=1, padding=0), + nn.ReLU(), + nn.MaxPool2d(2, 2)) + + if num_classes > 0: + self.fc = nn.Sequential( + nn.Linear(400, 120), + nn.Linear(120, 84), + nn.Linear( + 84, 10)) + + def forward(self, inputs): + x = self.features(inputs) + + if self.num_classes > 0: + x = paddle.flatten(x, 1) + x = self.fc(x) + return x + + lenet = LeNet() + + params_info = paddle.summary(lenet, (1, 28, 28)) + print(params_info) + + """ + if isinstance(input_size, InputSpec): + _input_size = tuple(input_size.shape[1:]) + if batch_size is None: + batch_size = input_size.shape[0] + elif isinstance(input_size, list): + _input_size = [] + for item in input_size: + if isinstance(item, int): + item = (item, ) + assert isinstance(item, + (tuple, InputSpec)), 'When input_size is list, \ + expect item in input_size is a tuple or InputSpec, but got {}'.format( + type(item)) + + if isinstance(item, InputSpec): + _input_size.append(tuple(item.shape[1:])) + if batch_size is None: + batch_size = item.shape[0] + else: + _input_size.append(item) + elif isinstance(input_size, int): + _input_size = (input_size, ) + else: + _input_size = input_size + + if batch_size is None: + batch_size = -1 + + if not paddle.in_dynamic_mode(): + warnings.warn( + "Your model was created in static mode, this may not get correct summary information!" 
+ ) + + result, params_info = summary_string(net, _input_size, batch_size, dtypes) + print(result) + + return params_info + + +def summary_string(model, input_size, batch_size=-1, dtypes=None): + if dtypes == None: + dtypes = ['float32'] * len(input_size) + + summary_str = '' + + depth = len(list(model.sublayers())) + + def register_hook(layer): + def hook(layer, input, output): + class_name = str(layer.__class__).split(".")[-1].split("'")[0] + + try: + layer_idx = int(layer._full_name.split('_')[-1]) + except: + layer_idx = len(summary) + + m_key = "%s-%i" % (class_name, layer_idx + 1) + summary[m_key] = OrderedDict() + summary[m_key]["input_shape"] = list(input[0].shape) + summary[m_key]["input_shape"][0] = batch_size + if isinstance(output, (list, tuple)): + summary[m_key]["output_shape"] = [[-1] + list(o.shape)[1:] + for o in output] + else: + summary[m_key]["output_shape"] = list(output.shape) + summary[m_key]["output_shape"][0] = batch_size + + params = 0 + + if paddle.in_dynamic_mode(): + layer_state_dict = layer._parameters + else: + layer_state_dict = layer.state_dict() + + for k, v in layer_state_dict.items(): + params += np.prod(v.shape) + + try: + if (getattr(getattr(layer, k), 'trainable')) and ( + not getattr(getattr(layer, k), 'stop_gradient')): + summary[m_key]["trainable"] = True + else: + summary[m_key]["trainable"] = False + except: + summary[m_key]["trainable"] = True + + summary[m_key]["nb_params"] = params + + if (not isinstance(layer, nn.Sequential) and + not isinstance(layer, nn.LayerList) and + (not (layer == model) or depth < 1)): + + hooks.append(layer.register_forward_post_hook(hook)) + + def _check_input_size(input_sizes): + for input_size in input_sizes: + for item in input_size: + if not isinstance(item, numbers.Number): + raise TypeError( + "Expected item in input size be a number, but got {}". + format(type(item))) + + if item <= 0: + raise ValueError( + "Expected item in input size greater than zero, but got {}". 
+ format(item)) + + if isinstance(input_size, tuple): + input_size = [input_size] + + _check_input_size(input_size) + + x = [ + paddle.rand( + [2] + list(in_size), dtype=dtype) + for in_size, dtype in zip(input_size, dtypes) + ] + + # create properties + summary = OrderedDict() + hooks = [] + + # register hook + model.apply(register_hook) + + # make a forward pass + model(*x) + + # remove these hooks + for h in hooks: + h.remove() + + table_width = 80 + summary_str += "-" * table_width + "\n" + line_new = "{:>15} {:>20} {:>20} {:>15}".format( + "Layer (type)", "Input Shape", "Output Shape", "Param #") + summary_str += line_new + "\n" + summary_str += "=" * table_width + "\n" + total_params = 0 + total_output = 0 + trainable_params = 0 + for layer in summary: + # input_shape, output_shape, trainable, nb_params + line_new = "{:>15} {:>20} {:>20} {:>15}".format( + layer, + str(summary[layer]["input_shape"]), + str(summary[layer]["output_shape"]), + "{0:,}".format(summary[layer]["nb_params"]), ) + total_params += summary[layer]["nb_params"] + + try: + total_output += np.prod(summary[layer]["output_shape"]) + except: + for output_shape in summary[layer]["output_shape"]: + total_output += np.prod(output_shape) + + if "trainable" in summary[layer]: + if summary[layer]["trainable"] == True: + trainable_params += summary[layer]["nb_params"] + summary_str += line_new + "\n" + + # assume 4 bytes/number (float on cuda). + total_input_size = abs( + np.prod(sum(input_size, ())) * batch_size * 4. / (1024**2.)) + total_output_size = abs(2. * total_output * 4. / + (1024**2.)) # x2 for gradients + total_params_size = abs(total_params * 4. 
/ (1024**2.)) + total_size = total_params_size + total_output_size + total_input_size + + summary_str += "=" * table_width + "\n" + summary_str += "Total params: {0:,}".format(total_params) + "\n" + summary_str += "Trainable params: {0:,}".format(trainable_params) + "\n" + summary_str += "Non-trainable params: {0:,}".format(total_params - + trainable_params) + "\n" + summary_str += "-" * table_width + "\n" + summary_str += "Input size (MB): %0.2f" % total_input_size + "\n" + summary_str += "Forward/backward pass size (MB): %0.2f" % total_output_size + "\n" + summary_str += "Params size (MB): %0.2f" % total_params_size + "\n" + summary_str += "Estimated Total Size (MB): %0.2f" % total_size + "\n" + summary_str += "-" * table_width + "\n" + # return summary + return summary_str, { + 'total_params': total_params, + 'trainable_params': trainable_params + } diff --git a/python/paddle/inference/__init__.py b/python/paddle/inference/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c388301ec3408e436eacb2567e8e529d0bbc03bb --- /dev/null +++ b/python/paddle/inference/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from ..fluid.inference import Config, DataType, PlaceType, PrecisionType, Tensor, \ + Predictor, create_predictor, get_version, get_num_bytes_of_data_type, PredictorPool diff --git a/python/paddle/io/__init__.py b/python/paddle/io/__init__.py index b67779cb2a2ae699c8206dc717670bf6eb23b25e..6f0b0f3c9c135e00a01c69869742a40ff615a96b 100644 --- a/python/paddle/io/__init__.py +++ b/python/paddle/io/__init__.py @@ -31,15 +31,6 @@ __all__ = [ 'set_program_state', 'load_inference_model', 'save_inference_model', - 'batch', - 'shuffle', - 'buffered', - 'cache', - 'chain', - 'firstn', - 'compose', - 'map_readers', - 'xmap_readers' ] from ..fluid.io import DataLoader @@ -47,4 +38,3 @@ from ..fluid.dataloader import Dataset, IterableDataset, BatchSampler, get_worke TensorDataset, Sampler, SequenceSampler, RandomSampler, DistributedBatchSampler from ..fluid.io import load, save, load_program_state, set_program_state, \ load_inference_model, save_inference_model, batch -from ..reader import shuffle, buffered, cache, chain, firstn, compose, map_readers, xmap_readers diff --git a/python/paddle/jit/__init__.py b/python/paddle/jit/__init__.py index 03299a3bb9823d31c40ae4faab601ed89570c71e..d04a65ad6ea99ee2e2e67e47fd9d656f1572a02d 100644 --- a/python/paddle/jit/__init__.py +++ b/python/paddle/jit/__init__.py @@ -14,7 +14,6 @@ from ..fluid.dygraph.jit import save #DEFINE_ALIAS from ..fluid.dygraph.jit import load #DEFINE_ALIAS -from ..fluid.dygraph.jit import SaveLoadConfig #DEFINE_ALIAS from ..fluid.dygraph.jit import TracedLayer #DEFINE_ALIAS from ..fluid.dygraph.jit import set_code_level #DEFINE_ALIAS from ..fluid.dygraph.jit import set_verbosity #DEFINE_ALIAS @@ -23,6 +22,6 @@ from ..fluid.dygraph import ProgramTranslator #DEFINE_ALIAS from ..fluid.dygraph.io import TranslatedLayer #DEFINE_ALIAS __all__ = [ - 'save', 'load', 'SaveLoadConfig', 'TracedLayer', 'to_static', - 'ProgramTranslator', 'TranslatedLayer', 'set_code_level', 'set_verbosity' + 'save', 'load', 'TracedLayer', 
'to_static', 'ProgramTranslator', + 'TranslatedLayer', 'set_code_level', 'set_verbosity' ] diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index 66caba540f2fed8c035d0f1af14f9e40a329bca5..79583f344f0c1f642586c4a8ecc08f2aa4e24008 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -89,7 +89,7 @@ from .layer.common import CosineSimilarity #DEFINE_ALIAS from .layer.common import Embedding #DEFINE_ALIAS from .layer.common import Linear #DEFINE_ALIAS from .layer.common import Flatten #DEFINE_ALIAS -from .layer.common import UpSample #DEFINE_ALIAS +from .layer.common import Upsample #DEFINE_ALIAS from .layer.common import UpsamplingNearest2d #DEFINE_ALIAS from .layer.common import UpsamplingBilinear2d #DEFINE_ALIAS from .layer.common import Bilinear #DEFINE_ALIAS diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index 325eaa64d5ca4bd3d65bf266ff0a42226a3199e6..163c249ab37457d7d4566553c71e3231f384a8b1 100644 --- a/python/paddle/nn/functional/__init__.py +++ b/python/paddle/nn/functional/__init__.py @@ -39,7 +39,7 @@ from .activation import hard_sigmoid #DEFINE_ALIAS from .activation import hard_swish #DEFINE_ALIAS from .activation import hsigmoid #DEFINE_ALIAS from .activation import leaky_relu #DEFINE_ALIAS -from .activation import logsigmoid #DEFINE_ALIAS +from .activation import log_sigmoid #DEFINE_ALIAS from .activation import maxout #DEFINE_ALIAS from .activation import prelu #DEFINE_ALIAS from .activation import relu #DEFINE_ALIAS @@ -72,6 +72,7 @@ from .common import unfold #DEFINE_ALIAS # from .common import bilinear_tensor_product #DEFINE_ALIAS from .common import assign #DEFINE_ALIAS from .common import interpolate #DEFINE_ALIAS +from .common import upsample #DEFINE_ALIAS from .common import bilinear #DEFINE_ALIAS from .conv import conv1d #DEFINE_ALIAS from .conv import conv_transpose1d #DEFINE_ALIAS diff --git a/python/paddle/nn/functional/activation.py 
b/python/paddle/nn/functional/activation.py index ffedb027330bda94db86dc0943a5c4a7281f254f..f7bbe0c94e03dc48ebfb21a62aeded9f446afc63 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -35,7 +35,7 @@ __all__ = [ 'hard_swish', 'hsigmoid', 'leaky_relu', - 'logsigmoid', + 'log_sigmoid', 'maxout', 'prelu', 'relu', @@ -552,13 +552,13 @@ def relu(x, name=None): return out -def logsigmoid(x, name=None): +def log_sigmoid(x, name=None): """ - logsigmoid activation. + log_sigmoid activation. .. math:: - logsigmoid(x) = log \\frac{1}{1 + e^{-x}} + log\\_sigmoid(x) = log \\frac{1}{1 + e^{-x}} Parameters: x (Tensor): The input Tensor with data type float32, float64. @@ -573,20 +573,19 @@ def logsigmoid(x, name=None): import paddle import paddle.nn.functional as F - import numpy as np paddle.disable_static() - x = paddle.to_tensor(np.array([1.0, 2.0, 3.0, 4.0])) - out = F.logsigmoid(x) # [-0.313262 -0.126928 -0.0485874 -0.0181499] + x = paddle.to_tensor([1.0, 2.0, 3.0, 4.0]) + out = F.log_sigmoid(x) # [-0.313262 -0.126928 -0.0485874 -0.0181499] """ if in_dygraph_mode(): return core.ops.logsigmoid(x) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], - 'logsigmoid') - helper = LayerHelper("logsigmoid", **locals()) + 'log_sigmoid') + helper = LayerHelper("log_sigmoid", **locals()) out = helper.create_variable_for_type_inference(x.dtype) helper.append_op(type='logsigmoid', inputs={'X': x}, outputs={'Out': out}) return out diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py index 623af3277fba0e29fb77b02c711e258602f1f75a..9f7fb0185133f580deba64634b62d82955670641 100644 --- a/python/paddle/nn/functional/common.py +++ b/python/paddle/nn/functional/common.py @@ -80,6 +80,8 @@ def interpolate(x, The input must be a 3-D Tensor of the shape (num_batches, channels, in_w) or 4-D (num_batches, channels, in_h, in_w), or a 5-D Tensor of the shape (num_batches, channels, in_d, in_h, 
in_w) or (num_batches, in_d, in_h, in_w, channels), + Where in_w is width of the input tensor, in_h is the height of the input tensor, + in_d is the depth of the intput tensor. and the resizing only applies on the three dimensions(depth, height and width). Supporting resample methods: @@ -88,6 +90,7 @@ def interpolate(x, 'trilinear' : Trilinear interpolation 'nearest' : Nearest neighbor interpolation 'bicubic' : Bicubic interpolation + 'area': Area interpolation Linear interpolation is the method of using a line connecting two known quantities to determine the value of an unknown quantity between the two known quantities. @@ -114,6 +117,12 @@ def interpolate(x, smoother than corresponding surfaces obtained by bilinear interpolation or nearest-neighbor interpolation. + Area interpolation is to perform area interpolation + in both the 3rd dimension(in height direction) , the 4th dimension(in width + direction) and the 5th dimension(in depth direction) on input tensor. Set to + area will directly call `paddle.nn.functional.adaptive_avg_pool1d` or + `paddle.nn.functional.adaptive_avg_pool2d` or `paddle.nn.functional.adaptive_avg_pool3d`. + Example: .. code-block:: text @@ -207,11 +216,11 @@ def interpolate(x, when input is a 4-D Tensor and is (out_d, out_h, out_w) when input is a 5-D Tensor. Default: None. If a list, each element can be an integer or a Tensor Variable of shape: [1]. If a Tensor Variable, its dimensions size should be a 1. - scale_factor (float|Tensor|list|None): The multiplier for the input height or width. At - least one of :attr:`out_shape` or :attr:`scale_factor` must be set. - And :attr:`out_shape` has a higher priority than :attr:`scale_factor`.Has to match input size if it is a list. + scale_factor (float|Tensor|list|tuple|None): The multiplier for the input height or width. At + least one of :attr:`size` or :attr:`scale_factor` must be set. 
+ And :attr:`size` has a higher priority than :attr:`scale_factor`.Has to match input size if it is either a list or a tuple or a Tensor. Default: None. - mode (str): The resample method. It supports 'linear', 'nearest', 'bilinear', + mode (str): The resample method. It supports 'linear', 'area', 'nearest', 'bilinear', 'bicubic' and 'trilinear' currently. Default: 'nearest' align_corners(bool) : An optional bool, If True, the centers of the 4 corner pixels of the input and output tensors are aligned, preserving the values at the @@ -235,7 +244,7 @@ def interpolate(x, Raises: TypeError: size should be a list or tuple or Tensor. ValueError: The 'mode' of image_resize can only be 'linear', 'bilinear', - 'trilinear', 'bicubic', or 'nearest' currently. + 'trilinear', 'bicubic', 'area' or 'nearest' currently. ValueError: 'linear' only support 3-D tensor. ValueError: 'bilinear', 'bicubic' and 'nearest' only support 4-D tensor. ValueError: 'trilinear' only support 5-D tensor. @@ -283,10 +292,11 @@ def interpolate(x, 'TRILINEAR', 'NEAREST', 'BICUBIC', + 'AREA', ] if resample not in resample_methods: raise ValueError( - "The 'resample' of image_resize can only be 'linaer', 'bilinear', 'trilinear', " + "The 'resample' of image_resize can only be 'area', 'linear', 'bilinear', 'trilinear', " " 'bicubic' or 'nearest' currently.") if resample in ['LINEAR'] and len(x.shape) != 3: @@ -310,8 +320,17 @@ def interpolate(x, raise ValueError( "align_corners option can only be set with the interpolating modes: linear | bilinear | bicubic | trilinear" ) + + if resample == 'AREA' and len(x.shape) == 3: + return paddle.nn.functional.adaptive_avg_pool1d(x, size) + + if resample == 'AREA' and len(x.shape) == 4: + return paddle.nn.functional.adaptive_avg_pool2d(x, size) + if resample == 'AREA' and len(x.shape) == 5: + return paddle.nn.functional.adaptive_avg_pool3d(x, size) + helper = LayerHelper('{}_interp_v2'.format(resample_type), **locals()) - dtype = helper.input_dtype() + dtype = 
helper.input_dtype(input_param_name='x') if len(x.shape) == 3 and data_format not in ['NCW', 'NWC']: raise ValueError( "Got wrong value for param `data_format`: " + data_format + @@ -349,14 +368,15 @@ def interpolate(x, out_shape = size scale = scale_factor + if out_shape is not None and scale is not None: + raise ValueError("Only one of size or scale_factor should be defined.") if out_shape is not None: if isinstance(out_shape, Variable): out_shape.stop_gradient = True inputs['OutSize'] = out_shape else: if not (_is_list_or_turple_(out_shape)): - raise TypeError( - "out_shape should be a list or tuple or Variable.") + raise TypeError("size should be a list or tuple or Variable.") # Validate the shape contain_var = False for dim_idx, dim_size in enumerate(out_shape): @@ -388,7 +408,7 @@ def interpolate(x, if len(x.shape) == 3: if len(out_shape) != 1: raise ValueError( - "out_shape length should be 2 for input 3-D tensor") + "size length should be 2 for input 3-D tensor") if contain_var: attrs['out_w'] = size_list[0] else: @@ -396,7 +416,7 @@ def interpolate(x, attrs['out_w'] = out_shape[0] if len(x.shape) == 4: if len(out_shape) != 2: - raise ValueError("out_shape length should be 2 for " + raise ValueError("size length should be 2 for " "input 4-D tensor.") if contain_var: attrs['out_h'] = size_list[0] @@ -407,7 +427,7 @@ def interpolate(x, attrs['out_w'] = out_shape[1] if len(x.shape) == 5: if len(out_shape) != 3: - raise ValueError("out_shape length should be 3 for " + raise ValueError("size length should be 3 for " "input 5-D tensor.") if contain_var: attrs['out_d'] = size_list[0] @@ -430,7 +450,7 @@ def interpolate(x, for i in range(len(x.shape) - 2): scale_list.append(scale) attrs['scale'] = list(map(float, scale_list)) - elif isinstance(scale, list): + elif isinstance(scale, list) or isinstance(scale, tuple): if len(scale) != len(x.shape) - 2: raise ValueError("scale_shape length should be {} for " "input {}-D tensor.".format( @@ -441,7 +461,8 @@ def 
interpolate(x, attrs['scale'] = list(map(float, scale)) else: raise TypeError( - "Attr(scale)'s type should be float, int, list or Tensor.") + "Attr(scale)'s type should be float, int, list, tuple, or Tensor." + ) if in_dygraph_mode(): attr_list = [] @@ -480,9 +501,12 @@ def upsample(x, name=None): """ This op resizes a batch of images. + The input must be a 3-D Tensor of the shape (num_batches, channels, in_w) or 4-D (num_batches, channels, in_h, in_w), or a 5-D Tensor of the shape (num_batches, channels, in_d, in_h, in_w) or (num_batches, in_d, in_h, in_w, channels), + Where in_w is width of the input tensor, in_h is the height of the input tensor, + in_d is the depth of the intput tensor. and the resizing only applies on the three dimensions(depth, height and width). Supporting resample methods: @@ -507,12 +531,21 @@ def upsample(x, data points on a two-dimensional regular grid. The interpolated surface is smoother than corresponding surfaces obtained by bilinear interpolation or nearest-neighbor interpolation. + Trilinear interpolation is an extension of linear interpolation for interpolating functions of three variables (e.g. D-direction, H-direction and W-direction in this op) on a rectilinear 3D grid. + The linear interpolation is performed on three directions. align_corners and align_mode are optional parameters,the calculation method of interpolation can be selected by them. + + Area interpolation is to perform area interpolation + in both the 3rd dimension(in height direction) , the 4th dimension(in width + direction) and the 5th dimension(in depth direction) on input tensor. Set to + area will directly call `paddle.nn.functional.adaptive_avg_pool1d` or + `paddle.nn.functional.adaptive_avg_pool2d` or `paddle.nn.functional.adaptive_avg_pool3d`. + Example: .. code-block:: text For scale_factor: @@ -605,9 +638,10 @@ def upsample(x, when input is a 4-D Tensor and is (out_d, out_h, out_w) when input is a 5-D Tensor. Default: None. 
If a list, each element can be an integer or a Tensor Variable of shape: [1]. If a Tensor Variable, its dimensions size should be a 1. - scale_factor (float|Tensor|list|None): The multiplier for the input height or width. At - least one of :attr:`out_shape` or :attr:`scale_factor` must be set. - And :attr:`out_shape` has a higher priority than :attr:`scale_factor`. + scale_factor (float|Tensor|list|tuple|None): The multiplier for the input height or width. At + least one of :attr:`size` or :attr:`scale_factor` must be set. + And :attr:`size` has a higher priority than :attr:`scale_factor`.Has to match input size if + it is either a list or a tuple or a Tensor. Default: None. mode (str): The resample method. It supports 'linear', 'nearest', 'bilinear', 'bicubic' and 'trilinear' currently. Default: 'nearest' @@ -910,12 +944,12 @@ def dropout(x, #get mask shape input_shape = x.shape drop_axes = [axis] if isinstance(axis, int) else axis - if max(drop_axes) > len(input_shape) - 1: - raise ValueError("axis value should less than dimensions of x:{}, but get drop_axes value:{} " \ + if min(drop_axes) < 0 or max(drop_axes) > len(input_shape) - 1: + raise ValueError("axis value should be greater than or equal to 0 and less than dimensions of x:{}, but get axis value:{} " \ .format(len(input_shape), max(drop_axes))) if len(drop_axes) > len(input_shape): raise ValueError( - "length of axis should not greater than dimensions of x:{}, but get length of drop axes: {}". + "length of axis should not be greater than dimensions of x:{}, but get length of axis: {}". format(len(input_shape), len(drop_axes))) mask_shape = [1] * len(input_shape) for i in drop_axes: @@ -1091,6 +1125,8 @@ def alpha_dropout(x, p=0.5, training=True, name=None): 'alpha_dropout') if training: + if p == 1: + return layers.scale(x, scale=0.) 
#get transformation params alpha = 1.6732632423543772848170429916717 scale = 1.0507009873554804934193349852946 diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py index 42d7d98aefcbbf51f562b98c4c494aeccfe20cf2..5cf4953933242292c6a732513dbee2164811dd35 100644 --- a/python/paddle/nn/functional/conv.py +++ b/python/paddle/nn/functional/conv.py @@ -232,7 +232,7 @@ def conv1d(x, raise ValueError("Attr(data_format) should be 'NCL' or 'NLC'. " "Received Attr(data_format): {}.".format(data_format)) - channel_last = (data_format == "NHWC") + channel_last = (data_format == "NLC") channel_dim = -1 if channel_last else 1 conv2d_data_format = "NHWC" if channel_last else "NCHW" num_channels = x.shape[channel_dim] @@ -267,8 +267,8 @@ def conv1d(x, dilation = utils.convert_to_list(dilation, 1, 'dilation') + [1] l_type = "conv2d" - if (num_channels == groups and num_filters % num_channels == 0 and - not use_cudnn): + if (num_channels == groups and num_channels != 1 and + num_filters % num_channels == 0 and not use_cudnn): l_type = 'depthwise_conv2d' use_cudnn = False @@ -399,7 +399,7 @@ def conv2d(x, `[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, and when `data_format` is `"NCHW"`, `padding` can be in the form `[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. - when `data_format` is `"NHWC"`, `pool_padding` can be in the form + when `data_format` is `"NHWC"`, `padding` can be in the form `[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. Default: padding = 0. dilation (int|tuple): The dilation size. 
It means the spacing between the kernel @@ -491,7 +491,8 @@ def conv2d(x, dilation = utils.convert_to_list(dilation, 2, 'dilation') l_type = "conv2d" - if (num_channels == groups and num_filters % num_channels == 0): + if (num_channels == groups and num_channels != 1 and + num_filters % num_channels == 0): l_type = 'depthwise_conv2d' use_cudnn = False @@ -733,24 +734,36 @@ def conv_transpose1d(x, stride = utils.convert_to_list(stride, 1, 'stride') + [1] dilation = utils.convert_to_list(dilation, 1, 'dilation') + [1] - output_padding = utils.convert_to_list(output_padding, 1, - 'output_padding') + [0] - if output_padding[0] > stride[0]: - raise ValueError( - "The size of output_padding should not be greater than stride." - "But got output_padding={} and stride={}".format(output_padding[0], - stride[0])) if output_size is None: output_size = [] - elif isinstance(output_size, (list, tuple, int)): - output_size = utils.convert_to_list(output_size, 1, 'output_size') + [1] else: - raise ValueError("output_size should be int, or list, tuple of ints") + if output_padding != 0: + raise ValueError('output_padding option is mutually exclusive with ' + 'output_size') + if isinstance(output_size, (list, tuple, int)): + output_size = utils.convert_to_list(output_size, 1, + 'output_size') + [1] + else: + raise ValueError( + "output_size should be int, or list, tuple of ints") + + if output_padding == 0: + output_padding = [] + else: + output_padding = utils.convert_to_list(output_padding, 1, + 'output_padding') + [0] + + if len(output_padding) > 0 and output_padding[0] > stride[0]: + raise ValueError( + "The size of output_padding should not be greater than stride." 
+ "But got output_padding={} and stride={}".format(output_padding[0], + stride[0])) op_type = 'conv2d_transpose' num_filters = weight.shape[1] - if (num_channels == groups and num_filters == 1 and not use_cudnn): + if (num_channels == groups and num_channels != 1 and num_filters == 1 and + not use_cudnn): op_type = 'depthwise_conv2d_transpose' use_cudnn = False @@ -761,16 +774,17 @@ def conv_transpose1d(x, weight = nn.unsqueeze(input=weight, axes=[-1]) if in_dygraph_mode(): - attrs = ('output_size', output_size, 'strides', stride, 'paddings', - padding, 'padding_algorithm', padding_algorithm, 'dilations', - dilation, 'groups', groups, 'use_cudnn', use_cudnn, - 'data_format', conv2d_data_format) + attrs = ('output_padding', output_padding, 'output_size', output_size, + 'strides', stride, 'paddings', padding, 'padding_algorithm', + padding_algorithm, 'dilations', dilation, 'groups', groups, + 'use_cudnn', use_cudnn, 'data_format', conv2d_data_format) out = getattr(core.ops, op_type)(x, weight, *attrs) if bias is not None: out = nn.elementwise_add(out, bias, axis=channel_dim) else: inputs = {'Input': [x], 'Filter': [weight]} attrs = { + 'output_padding': output_padding, 'output_size': output_size, 'strides': stride, 'paddings': padding, @@ -791,12 +805,6 @@ def conv_transpose1d(x, if bias is not None: out = nn.elementwise_add(out, bias, axis=channel_dim) - if output_size is None: - out = pad2d( - out, - padding=[0, output_padding, 0, 0], - data_format=conv2d_data_format, - name=name) out = nn.squeeze(input=out, axes=[squeeze_axis]) return out @@ -888,9 +896,9 @@ def conv_transpose2d(x, 'SAME' which is the padding algorithm. 
If padding size is a tuple or list, it could be in three forms: `[pad_height, pad_width]` or `[pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, - and when `data_format` is `"NCHW"`, `pool_padding` can be in the form + and when `data_format` is `"NCHW"`, `padding` can be in the form `[[0,0], [0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. - when `data_format` is `"NHWC"`, `pool_padding` can be in the form + when `data_format` is `"NHWC"`, `padding` can be in the form `[[0,0], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. Default: padding = 0. output_padding(int|list|tuple, optional): Additional size added to one side @@ -1004,7 +1012,7 @@ def conv_transpose2d(x, op_type = 'conv2d_transpose' num_filters = weight.shape[1] - if (num_channels == groups and num_filters == 1): + if (num_channels == groups and num_channels != 1 and num_filters == 1): op_type = 'depthwise_conv2d_transpose' use_cudnn = False @@ -1116,9 +1124,9 @@ def conv3d(x, 'SAME' which is the padding algorithm. If padding size is a tuple or list, it could be in three forms: `[pad_depth, pad_height, pad_width]` or `[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, - and when `data_format` is `"NCDHW"`, `pool_padding` can be in the form + and when `data_format` is `"NCDHW"`, `padding` can be in the form `[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. - when `data_format` is `"NDHWC"`, `pool_padding` can be in the form + when `data_format` is `"NDHWC"`, `padding` can be in the form `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. Default: padding = 0. dilation (int|tuple): The dilation size. It means the spacing between the kernel points. 
@@ -1340,9 +1348,9 @@ def conv_transpose3d(x, 'SAME' which is the padding algorithm. If padding size is a tuple or list, it could be in three forms: `[pad_depth, pad_height, pad_width]` or `[pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right]`, - and when `data_format` is `"NCDHW"`, `pool_padding` can be in the form + and when `data_format` is `"NCDHW"`, `padding` can be in the form `[[0,0], [0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right]]`. - when `data_format` is `"NDHWC"`, `pool_padding` can be in the form + when `data_format` is `"NDHWC"`, `padding` can be in the form `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. Default: padding = 0. output_padding(int|list|tuple, optional): Additional size added to one side diff --git a/python/paddle/nn/functional/input.py b/python/paddle/nn/functional/input.py index bc48cc21c29e6683602f37fb3eab6c9485fe4977..0794b95c801011da6845eaf82c32a5428e0d5f41 100644 --- a/python/paddle/nn/functional/input.py +++ b/python/paddle/nn/functional/input.py @@ -113,17 +113,18 @@ def one_hot(x, num_classes, name=None): def embedding(x, weight, padding_idx=None, sparse=False, name=None): """ - The operator is used to lookup embeddings vector of ids provided by :attr:`input` . + The operator is used to lookup embeddings vector of ids provided by :attr:`x` . The shape of output Tensor is generated by appending the last dimension of the input Tensor shape with embedding size. - **Note:** The id in :attr:`input` must satisfy :math:`0 =< id < weight.shape[0]` , + + **Note:** The id in :attr:`x` must satisfy :math:`0 =< id < weight.shape[0]` , otherwise the program will throw an exception and exit. .. code-block:: text Case 1: - input is a Tensor. + x is a Tensor. 
padding_idx = -1 x.data = [[1, 3], [2, 4], [4, 127]] x.shape = [3, 2] @@ -138,7 +139,7 @@ def embedding(x, weight, padding_idx=None, sparse=False, name=None): [0.0, 0.0, ..., 0.0 ]]] # padding data The input padding_idx is less than 0, it is automatically converted to padding_idx = -1 + 128 = 127 - It will pad all-zero data when ids is 127. + It will pad all-zero data when id is 127. Args: x(Tensor): A Tensor with type int32/int64, which contains the id information. The value of the input id should @@ -151,10 +152,10 @@ def embedding(x, weight, padding_idx=None, sparse=False, name=None): such as :ref:`api_optimizer_AdadeltaOptimizer` , :ref:`api_optimizer_AdamaxOptimizer` , :ref:`api_optimizer_DecayedAdagradOptimizer` , :ref:`api_optimizer_FtrlOptimizer` , :ref:`api_optimizer_LambOptimizer` and :ref:`api_optimizer_LarsMomentumOptimizer` . - In these cases, is_sparse must be False. Default: False. - padding_idx(int|long|None): padding_idx needs to be in the interval [-vocab_size, vocab_size). + In these cases, sparse must be False. Default: False. + padding_idx(int|long|None): padding_idx needs to be in the interval [-weight.shape[0], weight.shape[0]). If :math:`padding\_idx < 0`, the :math:`padding\_idx` will automatically be converted - to :math:`vocab\_size + padding\_idx` . It will output all-zero padding data whenever lookup + to :math:`weight.shape[0] + padding\_idx` . It will output all-zero padding data whenever lookup encounters :math:`padding\_idx` in id. And the padding data will not be updated while training. If set None, it makes no effect to output. Default: None. name(str|None): For detailed information, please refer @@ -162,7 +163,7 @@ def embedding(x, weight, padding_idx=None, sparse=False, name=None): None by default. Returns: - Tensor: Embedding Tensor mapped by input. The data type is the same as :attr:`weight`. + Tensor: Embedding Tensor mapped by x. The data type is the same as :attr:`weight`. 
Examples: @@ -209,6 +210,10 @@ def embedding(x, weight, padding_idx=None, sparse=False, name=None): padding_idx = -1 if padding_idx is None else padding_idx if padding_idx >= 0 else ( weight.shape[0] + padding_idx) + if padding_idx >= weight.shape[0] or padding_idx < -weight.shape[0]: + raise ValueError("padding_idx must be within [-{}, {})".format( + weight.shape[0], weight.shape[0])) + helper.append_op( type='lookup_table_v2', inputs={'Ids': x, diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index d2ddee654f4d04de152d15130ba53c424af3e5b2..4395520eec70e8483cb61097a166576f4040cb4d 100644 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -138,13 +138,10 @@ def binary_cross_entropy(input, label, weight=None, reduction='mean', .. code-block:: python import paddle - import numpy as np - input_data = np.array([0.5, 0.6, 0.7]).astype("float32") - label_data = np.array([1.0, 0.0, 1.0]).astype("float32") paddle.disable_static() - input = paddle.to_tensor(input_data) - label = paddle.to_tensor(label_data) + input = paddle.to_tensor([0.5, 0.6, 0.7], 'float32') + label = paddle.to_tensor([1.0, 0.0, 1.0], 'float32') output = paddle.nn.functional.binary_cross_entropy(input, label) print(output.numpy()) # [0.65537095] @@ -277,8 +274,8 @@ def binary_cross_entropy_with_logits(logit, import paddle paddle.disable_static() - logit = paddle.to_tensor([5.0, 1.0, 3.0], dtype="float32") - label = paddle.to_tensor([1.0, 0.0, 1.0], dtype="float32") + logit = paddle.to_tensor([5.0, 1.0, 3.0]) + label = paddle.to_tensor([1.0, 0.0, 1.0]) output = paddle.nn.functional.binary_cross_entropy_with_logits(logit, label) print(output.numpy()) # [0.45618808] @@ -569,13 +566,10 @@ def l1_loss(input, label, reduction='mean', name=None): Examples: .. 
code-block:: python import paddle - import numpy as np paddle.disable_static() - input_data = np.array([[1.5, 0.8], [0.2, 1.3]]).astype("float32") - label_data = np.array([[1.7, 1], [0.4, 0.5]]).astype("float32") - input = paddle.to_tensor(input_data) - label = paddle.to_tensor(label_data) + input = paddle.to_tensor([[1.5, 0.8], [0.2, 1.3]]) + label = paddle.to_tensor([[1.7, 1], [0.4, 0.5]]) l1_loss = paddle.nn.functional.l1_loss(input, label) print(l1_loss.numpy()) @@ -786,10 +780,10 @@ def kl_div(input, label, reduction='mean', name=None): input = np.random.uniform(-10, 10, shape).astype('float32') target = np.random.uniform(-10, 10, shape).astype('float32') - # 'batchmean' reduction, loss shape will be [N] + # 'batchmean' reduction, loss shape will be [1] pred_loss = F.kl_div(paddle.to_tensor(input), paddle.to_tensor(target), reduction='batchmean') - # shape=[5] + # shape=[1] # 'mean' reduction, loss shape will be [1] pred_loss = F.kl_div(paddle.to_tensor(input), @@ -868,7 +862,7 @@ def mse_loss(input, label, reduction='mean', name=None): Examples: .. 
code-block:: python - import numpy as np + import paddle @@ -878,8 +872,6 @@ def mse_loss(input, label, reduction='mean', name=None): input = paddle.data(name="input", shape=[1]) label = paddle.data(name="label", shape=[1]) place = paddle.CPUPlace() - input_data = np.array([1.5]).astype("float32") - label_data = np.array([1.7]).astype("float32") output = mse_loss(input,label) exe = paddle.static.Executor(place) @@ -894,8 +886,8 @@ def mse_loss(input, label, reduction='mean', name=None): # dynamic graph mode paddle.disable_static() - input = paddle.to_variable(input_data) - label = paddle.to_variable(label_data) + input = paddle.to_tensor(1.5) + label = paddle.to_tensor(1.7) output = mse_loss(input, label) print(output.numpy()) # [0.04000002] @@ -1017,8 +1009,7 @@ def ctc_loss(log_probs, loss_out = fluid.layers.squeeze(loss_out, [-1]) assert reduction in ['mean', 'sum', 'none'] if reduction == 'mean': - loss_out = paddle.mean(loss_out / paddle.cast(label_lengths, - loss_out.dtype)) + loss_out = paddle.mean(loss_out / label_lengths) elif reduction == 'sum': loss_out = paddle.sum(loss_out) return loss_out @@ -1102,7 +1093,7 @@ def cross_entropy(input, " 'none', but received %s, which is not allowed." % reduction) #step 1. 
log_softmax - log_softmax_out = paddle.nn.functional.log_softmax(input) + log_softmax_out = paddle.nn.functional.log_softmax(input, axis=1) if weight is not None and not isinstance(weight, Variable): raise ValueError( "The weight' is not a Variable, please convert to Variable.") diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py index e9c1a21ecffb1b64cb5ae9e6b802600625cb4685..9e8f365f6d23a95275b9a696f6088bb287108ec0 100644 --- a/python/paddle/nn/functional/norm.py +++ b/python/paddle/nn/functional/norm.py @@ -165,7 +165,7 @@ def batch_norm(x, w = paddle.to_tensor(weight_data) b = paddle.to_tensor(bias_data) batch_norm_out = paddle.nn.functional.batch_norm(x, rm, rv, w, b) - print batch_norm_out + print(batch_norm_out.numpy()) """ assert len(x.shape) >= 2, "input dim must be larger than 1" @@ -176,6 +176,14 @@ def batch_norm(x, mean_out = running_mean variance_out = running_var + true_data_format = ['NC', 'NCL', 'NCHW', 'NCDHW'] + if data_format not in true_data_format: + raise ValueError( + "data_format must be one of 'NC', 'NCL', 'NCHW', 'NCDHW', but receive {}". 
+ format(data_format)) + + data_format = 'NCHW' + if in_dygraph_mode(): # for dygraph need tuple attrs = ("momentum", momentum, "epsilon", epsilon, "data_layout", @@ -270,7 +278,7 @@ def layer_norm(x, layer_norm = paddle.nn.functional.layer_norm(x, x.shape[1:]) layer_norm_out = layer_norm(x) - print(layer_norm_out.numpy) + print(layer_norm_out.numpy()) """ input_shape = list(x.shape) input_ndim = len(input_shape) @@ -302,10 +310,10 @@ def layer_norm(x, # create output helper = LayerHelper('layer_norm', **locals()) mean_out = helper.create_variable_for_type_inference( - dtype=x.type, stop_gradient=True) + dtype=x.dtype, stop_gradient=True) variance_out = helper.create_variable_for_type_inference( - dtype=x.type, stop_gradient=True) - layer_norm_out = helper.create_variable_for_type_inference(x.type) + dtype=x.dtype, stop_gradient=True) + layer_norm_out = helper.create_variable_for_type_inference(x.dtype) helper.append_op( type="layer_norm", @@ -362,7 +370,7 @@ def instance_norm(x, x = paddle.to_tensor(x_data) instance_norm_out = paddle.nn.functional.instancenorm(x) - print(instance_norm_out.numpy) + print(instance_norm_out.numpy()) """ diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py index b4a713a1964f5d99503e0b5a221668656fa657d1..1eb9167d0352f36bfcb87db79ba23dce14bac507 100755 --- a/python/paddle/nn/functional/pooling.py +++ b/python/paddle/nn/functional/pooling.py @@ -168,7 +168,7 @@ def avg_pool1d(x, count_include_pad=True, ceil_mode=False, name=None): - """ + """ This API implements average pooling 1d operation, See more details in :ref:`api_nn_pooling_AvgPool1d` . @@ -280,7 +280,7 @@ def avg_pool2d(x, """ This API implements average pooling 2d operation. See more details in :ref:`api_nn_pooling_AvgPool2d` . - + Args: x (Tensor): The input tensor of pooling operator which is a 4-D tensor with shape [N, C, H, W]. 
The format of input tensor is `"NCHW"` or @@ -389,7 +389,7 @@ def avg_pool3d(x, stride=None, padding=0, ceil_mode=False, - count_include_pad=False, + count_include_pad=True, divisor_override=None, data_format="NCDHW", name=None): @@ -640,7 +640,7 @@ def max_pool2d(x, 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). The default value is 0. ceil_mode (bool): when True, will use `ceil` instead of `floor` to compute the output shape - return_indices (bool): Whether to return the max indices along with the outputs. + return_indices (bool): Whether to return the max indices along with the outputs. Default False, only support `"NCHW"` data format data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NHWC"`. The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: `[batch_size, input_channels, input_height, input_width]`. 
@@ -690,15 +690,30 @@ def max_pool2d(x, padding, padding_algorithm = _update_padding_nd( padding, num_dims=2, channel_last=channel_last, ceil_mode=ceil_mode) + if data_format == "NHWC" and return_indices: + raise ValueError( + "When setting return_indices to true, data_format must be set to NCHW in API:max_pool2d" + ) + if in_dygraph_mode(): - output = core.ops.max_pool2d_with_index( - x, 'ksize', kernel_size, 'global_pooling', False, 'strides', stride, - 'paddings', padding, 'padding_algorithm', padding_algorithm, - 'use_cudnn', True, 'ceil_mode', ceil_mode, 'use_mkldnn', False, - 'exclusive', True, 'data_format', data_format) - return output if return_indices else output[0] + if data_format == "NCHW": + output = core.ops.max_pool2d_with_index( + x, 'ksize', kernel_size, 'global_pooling', False, 'strides', + stride, 'paddings', padding, 'padding_algorithm', + padding_algorithm, 'use_cudnn', True, 'ceil_mode', ceil_mode, + 'use_mkldnn', False, 'exclusive', True, 'data_format', + data_format) + return output if return_indices else output[0] + elif data_format == "NHWC" and not return_indices: + output = core.ops.pool2d( + x, 'pooling_type', 'max', 'ksize', kernel_size, + 'global_pooling', False, 'padding_algorithm', padding_algorithm, + 'strides', stride, 'paddings', padding, 'use_cudnn', True, + 'ceil_mode', ceil_mode, 'use_mkldnn', False, 'exclusive', True, + 'data_format', data_format) + return output - op_type = 'max_pool2d_with_index' + op_type = 'max_pool2d_with_index' if data_format == "NCHW" else "pool2d" helper = LayerHelper(op_type, **locals()) dtype = helper.input_dtype() pool_out = helper.create_variable_for_type_inference(dtype) @@ -739,7 +754,7 @@ def max_pool3d(x, See more details in :ref:`api_nn_pooling_MaxPool3d` . Args: x (Tensor): The input tensor of pooling operator, which is a 5-D tensor with - shape [N, C, D, H, W]. 
The format of input tensor is `"NCDHW"` or `"NDHWC"`, where N represents batch size, C represents the number of channels, D, H and W represent the depth, height and width of the feature respectively. + shape [N, C, D, H, W]. The format of input tensor is `"NCDHW"` or `"NDHWC"`, where N represents batch size, C represents the number of channels, D, H and W represent the depth, height and width of the feature respectively. kernel_size (int|list|tuple): The pool kernel size. If the kernel size is a tuple or list, it must contain three integers, (kernel_size_Depth, kernel_size_Height, kernel_size_Width). @@ -755,7 +770,7 @@ def max_pool3d(x, 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). The default value is 0. ceil_mode (bool): ${ceil_mode_comment} - return_indices (bool): Whether to return the max indices along with the outputs. + return_indices (bool): Whether to return the max indices along with the outputs. Default False. Only support "NDCHW" data_format. data_format (string): The data format of the input and output data. An optional string from: `"NCDHW"`, `"NDHWC"`. The default is `"NCDHW"`. When it is `"NCDHW"`, the data is stored in the order of: `[batch_size, input_channels, input_depth, input_height, input_width]`. 
@@ -801,15 +816,30 @@ def max_pool3d(x, padding, padding_algorithm = _update_padding_nd( padding, 3, channel_last=channel_last, ceil_mode=ceil_mode) + if data_format == "NDHWC" and return_indices: + raise ValueError( + "When setting return_indices to true, data_format must be set to NCDHW in API:max_pool3d" + ) + if in_dygraph_mode(): - output = core.ops.max_pool3d_with_index( - x, 'pooling_type', 'max', 'ksize', kernel_size, 'strides', stride, - 'paddings', padding, 'global_pooling', False, 'padding_algorithm', - padding_algorithm, 'use_cudnn', True, 'ceil_mode', ceil_mode, - 'use_mkldnn', False, 'exclusive', True, 'data_format', data_format) - return output if return_indices else output[0] + if data_format == "NCDHW": + output = core.ops.max_pool3d_with_index( + x, 'pooling_type', 'max', 'ksize', kernel_size, 'strides', + stride, 'paddings', padding, 'global_pooling', False, + 'padding_algorithm', padding_algorithm, 'use_cudnn', True, + 'ceil_mode', ceil_mode, 'use_mkldnn', False, 'exclusive', True, + 'data_format', data_format) + return output if return_indices else output[0] + elif data_format == "NDHWC" and not return_indices: + output = core.ops.pool3d( + x, 'pooling_type', 'max', 'ksize', kernel_size, + 'global_pooling', False, 'padding_algorithm', padding_algorithm, + 'strides', stride, 'paddings', padding, 'use_cudnn', True, + 'ceil_mode', ceil_mode, 'use_mkldnn', False, 'exclusive', True, + 'data_format', data_format) + return output - op_type = "max_pool3d_with_index" + op_type = "max_pool3d_with_index" if data_format == "NCDHW" else "pool3d" helper = LayerHelper(op_type, **locals()) dtype = helper.input_dtype() pool_out = helper.create_variable_for_type_inference(dtype) @@ -841,7 +871,7 @@ def adaptive_avg_pool1d(x, output_size, name=None): """ This API implements adaptive average pooling 1d operation. See more details in :ref:`api_nn_pooling_AdaptiveAvgPool1d` . 
- + Args: x (Tensor): The input tensor of pooling operator, which is a 3-D tensor with shape [N, C, L]. The format of input tensor is NCL, @@ -976,6 +1006,7 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None): if isinstance(output_size, int): output_size = utils.convert_to_list(output_size, 2, 'output_size') else: + output_size = list(output_size) if output_size[0] == None: output_size[0] = in_h if output_size[1] == None: @@ -1079,6 +1110,7 @@ def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None): if isinstance(output_size, int): output_size = utils.convert_to_list(output_size, 3, 'output_size') else: + output_size = list(output_size) if output_size[0] == None: output_size[0] = in_l if output_size[1] == None: @@ -1123,8 +1155,7 @@ def adaptive_max_pool1d(x, output_size, return_indices=False, name=None): with shape [N, C, L]. The format of input tensor is NCL, where N is batch size, C is the number of channels, L is the length of the feature. The data type is float32 or float64. - output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, - it must contain one int. + output_size (int): The pool kernel size. The value should be an integer. return_indices (bool): If true, the index of max pooling point will be returned along with outputs. It cannot be set in average pooling type. Default False. name(str, optional): For detailed information, please refer @@ -1134,9 +1165,10 @@ def adaptive_max_pool1d(x, output_size, return_indices=False, name=None): Tensor: The output tensor of adaptive pooling result. The data type is same as input tensor. Raises: - ValueError: 'output_size' should be a integer or list or tuple with length as 1. + ValueError: 'output_size' should be an integer. Examples: .. 
code-block:: python + # max adaptive pool1d # suppose input data in shape of [N, C, L], `output_size` is m or [m], # output shape is [N, C, m], adaptive pool divide L dimension @@ -1162,7 +1194,7 @@ def adaptive_max_pool1d(x, output_size, return_indices=False, name=None): check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'adaptive_max_pool1d') _check_input(x, 3) - check_type(output_size, 'pool_size', (int), 'adaptive_max_pool1d') + check_type(output_size, 'pool_size', int, 'adaptive_max_pool1d') check_type(return_indices, 'return_indices', bool, 'adaptive_max_pool1d') pool_size = [1] + utils.convert_to_list(output_size, 1, 'pool_size') @@ -1201,15 +1233,19 @@ def adaptive_max_pool2d(x, output_size, return_indices=False, name=None): """ This operation applies a 2D adaptive max pooling on input tensor. See more details in :ref:`api_nn_pooling_AdaptiveMaxPool2d` . + Args: x (Tensor): The input tensor of adaptive max pool2d operator, which is a 4-D tensor. The data type can be float16, float32, float64, int32 or int64. output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain two elements, (H, W). H and W can be either a int, or None which means the size will be the same as that of the input. return_indices (bool): If true, the index of max pooling point will be returned along with outputs. Default False. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. + Returns: Tensor: The output tensor of adaptive max pool2d result. The data type is same as input tensor. + Examples: .. 
code-block:: python + # max adaptive pool2d # suppose input data in the shape of [N, C, H, W], `output_size` is [m, n] # output shape is [N, C, m, n], adaptive pool divide H and W dimensions @@ -1247,6 +1283,7 @@ def adaptive_max_pool2d(x, output_size, return_indices=False, name=None): if isinstance(output_size, int): output_size = utils.convert_to_list(output_size, 2, 'output_size') else: + output_size = list(output_size) if output_size[0] == None: output_size[0] = in_h if output_size[1] == None: @@ -1283,15 +1320,19 @@ def adaptive_max_pool3d(x, output_size, return_indices=False, name=None): """ This operation applies a 3D adaptive max pooling on input tensor. See more details in :ref:`api_nn_pooling_AdaptiveMaxPool3d` . + Args: x (Tensor): The input tensor of adaptive max pool3d operator, which is a 5-D tensor. The data type can be float32, float64. output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain three elements, (D, H, W). D, H and W can be either a int, or None which means the size will be the same as that of the input. return_indices (bool): If true, the index of max pooling point will be returned along with outputs. Default False. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. + Returns: Tensor: The output tensor of adaptive max pool3d result. The data type is same as input tensor. + Examples: .. 
code-block:: python + # adaptive max pool3d # suppose input data in the shape of [N, C, D, H, W], `output_size` is [l, m, n] # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimensions @@ -1333,6 +1374,7 @@ def adaptive_max_pool3d(x, output_size, return_indices=False, name=None): if isinstance(output_size, int): output_size = utils.convert_to_list(output_size, 3, 'output_size') else: + output_size = list(output_size) if output_size[0] == None: output_size[0] = in_l if output_size[1] == None: diff --git a/python/paddle/nn/functional/vision.py b/python/paddle/nn/functional/vision.py index 1dfdac26e990851ac5f192742acd47fb92633d0d..a74a98d5ed45b9f613b0f2f6d5f04544ffae3d2a 100644 --- a/python/paddle/nn/functional/vision.py +++ b/python/paddle/nn/functional/vision.py @@ -249,7 +249,7 @@ def grid_sample(x, mode(str, optional): The interpolation method which can be 'bilinear' or 'nearest'. Default: 'bilinear'. padding_mode(str, optional) The padding method used when source index - is out of input images. It can be 'zeros', 'reflect' and 'border'. + is out of input images. It can be 'zeros', 'reflection' and 'border'. Default: zeros. align_corners(bool, optional): If `align_corners` is true, it will projects -1 and 1 to the centers of the corner pixels. Otherwise, it will @@ -312,7 +312,7 @@ def grid_sample(x, if not isinstance(grid, Variable): raise ValueError("The grid should be a Variable") _modes = ['bilinear', 'nearest'] - _padding_modes = ['zeros', 'reflect', 'border'] + _padding_modes = ['zeros', 'reflection', 'border'] if mode not in _modes: raise ValueError( "The mode of grid sample function should be in {}, but got: {}". 
diff --git a/python/paddle/nn/layer/__init__.py b/python/paddle/nn/layer/__init__.py index 7d7a392ebe80c3af8c991dbff746d0f8f216b18b..760af09f1f2f5af066058572f681ec21f9a93180 100644 --- a/python/paddle/nn/layer/__init__.py +++ b/python/paddle/nn/layer/__init__.py @@ -59,7 +59,7 @@ from .common import CosineSimilarity #DEFINE_ALIAS from .common import Embedding #DEFINE_ALIAS from .common import Linear #DEFINE_ALIAS from .common import Flatten #DEFINE_ALIAS -from .common import UpSample #DEFINE_ALIAS +from .common import Upsample #DEFINE_ALIAS from .common import UpsamplingNearest2d #DEFINE_ALIAS from .common import UpsamplingBilinear2d #DEFINE_ALIAS from .common import Dropout #DEFINE_ALIAS diff --git a/python/paddle/nn/layer/activation.py b/python/paddle/nn/layer/activation.py index c38d6018a2500111280a482aa60d072e65e27742..585d369c607e5b6eb6a2a3bcb28bd8999a2e0dca 100644 --- a/python/paddle/nn/layer/activation.py +++ b/python/paddle/nn/layer/activation.py @@ -860,11 +860,10 @@ class LogSigmoid(layers.Layer): .. 
code-block:: python import paddle - import numpy as np paddle.disable_static() - x = paddle.to_tensor(np.array([1.0, 2.0, 3.0, 4.0])) + x = paddle.to_tensor([1.0, 2.0, 3.0, 4.0]) m = paddle.nn.LogSigmoid() out = m(x) # [-0.313262 -0.126928 -0.0485874 -0.0181499] """ @@ -874,7 +873,7 @@ class LogSigmoid(layers.Layer): self._name = name def forward(self, x): - return F.logsigmoid(x, self._name) + return F.log_sigmoid(x, self._name) class Softmax(layers.Layer): diff --git a/python/paddle/nn/layer/common.py b/python/paddle/nn/layer/common.py index d8e1d03b02840e76ff865986d8b90ca9d6cdd9f8..433443fee1765a3ecd4cf0bbe53a960bbeaefc71 100644 --- a/python/paddle/nn/layer/common.py +++ b/python/paddle/nn/layer/common.py @@ -26,7 +26,7 @@ __all__ = [ 'Pool2D', 'Embedding', 'Linear', - 'UpSample', + 'Upsample', 'Pad2D', 'UpsamplingNearest2d', 'UpsamplingBilinear2d', @@ -131,12 +131,15 @@ class Linear(layers.Layer): return out -class UpSample(layers.Layer): +class Upsample(layers.Layer): """ This op resizes a batch of images. + The input must be a 3-D Tensor of the shape (num_batches, channels, in_w) or 4-D (num_batches, channels, in_h, in_w), or a 5-D Tensor of the shape (num_batches, channels, in_d, in_h, in_w) or (num_batches, in_d, in_h, in_w, channels), + Where in_w is width of the input tensor, in_h is the height of the input tensor, + in_d is the depth of the intput tensor. and the resizing only applies on the three dimensions(depth, height and width). Supporting resample methods: @@ -171,6 +174,12 @@ class UpSample(layers.Layer): align_corners and align_mode are optional parameters,the calculation method of interpolation can be selected by them. + Area interpolation is to perform area interpolation + in both the 3rd dimension(in height direction) , the 4th dimension(in width + direction) and the 5th dimension(in depth direction) on input tensor. 
Set to + area will directly call `paddle.nn.functional.adaptive_avg_pool1d` or + `paddle.nn.functional.adaptive_avg_pool2d` or `paddle.nn.functional.adaptive_avg_pool3d`. + Example: .. code-block:: text @@ -273,9 +282,9 @@ class UpSample(layers.Layer): when input is a 4-D Tensor and is (out_d, out_h, out_w) when input is a 5-D Tensor. Default: None. If a list, each element can be an integer or a Tensor Variable of shape: [1]. If a Tensor Variable, its dimensions size should be a 1. - scale_factor (float|Tensor|list|None): The multiplier for the input height or width. At - least one of :attr:`out_shape` or :attr:`scale_factor` must be set. - And :attr:`out_shape` has a higher priority than :attr:`scale_factor`.Has to match input size if it is a list. + scale_factor (float|Tensor|list|tuple|None): The multiplier for the input height or width. At + least one of :attr:`size` or :attr:`scale_factor` must be set. + And :attr:`size` has a higher priority than :attr:`scale_factor`. Has to match input size if it is either a list or a tuple or a Tensor. Default: None. mode (str): The resample method. It supports 'linear', 'nearst', 'bilinear', 'bicubic' and 'trilinear' currently. Default: 'nearest' @@ -322,7 +331,7 @@ class UpSample(layers.Layer): paddle.disable_static() input_data = np.random.rand(2,3,6,10).astype("float32") - upsample_out = paddle.nn.UpSample(size=[12,12]) + upsample_out = paddle.nn.Upsample(size=[12,12]) input = paddle.to_tensor(input_data) output = upsample_out(x=input) @@ -339,7 +348,7 @@ class UpSample(layers.Layer): align_mode=0, data_format='NCHW', name=None): - super(UpSample, self).__init__() + super(Upsample, self).__init__() self.size = size self.scale_factor = scale_factor self.mode = mode.lower() @@ -366,7 +375,8 @@ class UpsamplingNearest2d(layers.Layer): """ This op upsamples a batch of images, using nearest neighbours' pixel values. 
The input must be a 4-D Tensor of the shape (num_batches, channels, in_h, in_w), - and the upsampling only applies on the two dimensions(height and width). + where in_w is width of the input tensor, in_h is the height of the input tensor. + And the upsampling only applies on the two dimensions(height and width). Nearest neighbor interpolation is to perform nearest neighbor interpolation in both the 3rd dimension(in height direction) and the 4th dimension(in width @@ -381,10 +391,11 @@ class UpsamplingNearest2d(layers.Layer): layer, the shape is (out_h, out_w) when input is a 4-D Tensor. Default: None. If a list, each element can be an integer or a Tensor Variable of shape: [1]. If a Tensor Variable, its dimensions size should be a 1. - scale_factor (float|int|list|Tensor|None): The multiplier for the input height or width. At - least one of :attr:`out_shape` or :attr:`scale_factor` must be set. - And :attr:`out_shape` has a higher priority than :attr:`scale_factor`. - Default: None. Has to match input size if it is a list. + scale_factor (float|int|list|tuple|Tensor|None): The multiplier for the input height or width. At + least one of :attr:`size` or :attr:`scale_factor` must be set. + And :attr:`size` has a higher priority than :attr:`scale_factor`. + Has to match input size if it is either a list or a tuple or a Tensor. + Default: None. data_format (str, optional): Specify the data format of the input, and the data format of the output will be consistent with that of the input. An optional string from:`NCW`, `NWC`, `"NCHW"`, `"NHWC"`, `"NCDHW"`, `"NDHWC"`. The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: @@ -449,7 +460,8 @@ class UpsamplingBilinear2d(layers.Layer): """ This op upsamples a batch of images, using bilinear' pixel values. The input must be a 4-D Tensor of the shape (num_batches, channels, in_h, in_w), - and the upsampling only applies on the two dimensions(height and width). 
+ where in_w is width of the input tensor, in_h is the height of the input tensor. + And the upsampling only applies on the two dimensions(height and width). Bilinear interpolation is an extension of linear interpolation for interpolating functions of two variables (e.g. H-direction and @@ -466,10 +478,11 @@ class UpsamplingBilinear2d(layers.Layer): layer, the shape is (out_h, out_w) when input is a 4-D Tensor. Default: None. If a list, each element can be an integer or a Tensor Variable of shape: [1]. If a Tensor Variable, its dimensions size should be a 1. - scale_factor (float|int|list|Tensor|None): The multiplier for the input height or width. At - least one of :attr:`out_shape` or :attr:`scale_factor` must be set. - And :attr:`out_shape` has a higher priority than :attr:`scale_factor`. - Default: None. Has to match input size if it is a list. + scale_factor (float|int|list|tuple|Tensor|None): The multiplier for the input height or width. At + least one of :attr:`size` or :attr:`scale_factor` must be set. + And :attr:`size` has a higher priority than :attr:`scale_factor`. + Has to match input size if it is either a list or a tuple or a Tensor. + Default: None. data_format (str, optional): Specify the data format of the input, and the data format of the output will be consistent with that of the input. An optional string from:`NCW`, `NWC`, `"NCHW"`, `"NHWC"`, `"NCDHW"`, `"NDHWC"`. The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: @@ -1551,22 +1564,18 @@ class CosineSimilarity(layers.Layer): class Embedding(layers.Layer): """ - :alias_main: paddle.nn.Embedding - :alias: paddle.nn.Embedding,paddle.nn.layer.Embedding,paddle.nn.layer.common.Embedding - :old_api: paddle.fluid.dygraph.Embedding - **Embedding Layer** This interface is used to construct a callable object of the ``Embedding`` class. For specific usage, refer to code examples. It implements the function of the Embedding Layer. 
- This layer is used to lookup embeddings vector of ids provided by :attr:`input` . + This layer is used to lookup embeddings vector of ids provided by :attr:`x` . It automatically constructs a 2D embedding matrix based on the - input :attr:`size` (vocab_size, emb_size) and :attr:`dtype` . + input :attr:`num_embeddings` and attr:`embedding_dim`. The shape of output Tensor is generated by appending an emb_size dimension to the last dimension of the input Tensor shape. - **Note:** The id in :attr:`input` must satisfy :math:`0 =< id < size[0]` , + **Note:** The id in :attr:`x` must satisfy :math:`0 =< id < num_embeddings` , otherwise the program will throw an exception and exit. .. code-block:: text @@ -1594,7 +1603,7 @@ class Embedding(layers.Layer): num_embeddings (int): Just one element which indicate the size of the dictionary of embeddings. embedding_dim: Just one element which indicate the size of each embedding vector respectively. - padding_idx(int|long|None): padding_idx needs to be in the interval [-vocab_size, vocab_size). + padding_idx(int|long|None): padding_idx needs to be in the interval [-num_embeddings, num_embeddings). If :math:`padding\_idx < 0`, the :math:`padding\_idx` will automatically be converted to :math:`vocab\_size + padding\_idx` . It will output all-zero padding data whenever lookup encounters :math:`padding\_idx` in id. And the padding data will not be updated while training. @@ -1605,13 +1614,13 @@ class Embedding(layers.Layer): such as :ref:`api_optimizer_AdadeltaOptimizer` , :ref:`api_optimizer_AdamaxOptimizer` , :ref:`api_optimizer_DecayedAdagradOptimizer` , :ref:`api_optimizer_FtrlOptimizer` , :ref:`api_optimizer_LambOptimizer` and :ref:`api_optimizer_LarsMomentumOptimizer` . - In these case, is_sparse must be False. Default: False. + In these case, sparse must be False. Default: False. weight_attr(ParamAttr): To specify the weight parameter property. Default: None, which means the - default weight parameter property is used. 
See usage for details in :ref:`api_fluid_ParamAttr` . In addition, + default weight parameter property is used. See usage for details in :ref:`api_ParamAttr` . In addition, user-defined or pre-trained word vectors can be loaded with the :attr:`param_attr` parameter. The local word vector needs to be transformed into numpy format, and the shape of local word - vector should be consistent with :attr:`size` . Then :ref:`api_fluid_initializer_NumpyArrayInitializer` - is used to load custom or pre-trained word vectors. See code example 2 for details. + vector should be consistent with :attr:`num_embeddings` . Then :ref:`api_initializer_NumpyArrayInitializer` + is used to load custom or pre-trained word vectors. See code example for details. name(str|None): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. @@ -1626,20 +1635,34 @@ class Embedding(layers.Layer): .. code-block:: python - import paddle - import paddle.nn as nn - import numpy as np - paddle.disable_static() + import paddle + import numpy as np + + x_data = np.arange(3, 6).reshape((3, 1)).astype(np.int64) + y_data = np.arange(6, 12).reshape((3, 2)).astype(np.float32) + paddle.disable_static(paddle.CPUPlace()) + x = paddle.to_tensor(x_data, stop_gradient=False) + y = paddle.to_tensor(y_data, stop_gradient=False) - # example 1 - inp_word = np.array([[2, 3, 5], [4, 2, 1]]).astype('int64') - inp_word.shape # [2, 3] - dict_size = 20 + embedding = paddle.nn.Embedding(10, 3, sparse=True) + + w0=np.full(shape=(10, 3), fill_value=2).astype(np.float32) + embedding.weight.set_value(w0) + + adam = paddle.optimizer.Adam(parameters=[embedding.weight], learning_rate=0.01) + adam.clear_grad() + + # weight.shape = [10, 3] + + # x.data = [[3],[4],[5]] + # x.shape = [3, 1] + + # out.data = [[2,2,2], [2,2,2], [2,2,2]] + # out.shape = [3, 1, 3] + out=embedding(x) + out.backward() + adam.step() - emb = nn.Embedding( - dict_size, - 32, - sparse=False) """ def 
__init__(self, @@ -1656,13 +1679,24 @@ class Embedding(layers.Layer): self._is_distributed = False self._padding_idx = -1 if padding_idx is None else padding_idx if padding_idx >= 0 else ( num_embeddings + padding_idx) + + if self._num_embeddings <= 0: + raise ValueError("num_embeddings must be gather than 0") + + if self._embedding_dim <= 0: + raise ValueError("embedding_dim must be gather than 0") + + if self._padding_idx >= num_embeddings or self._padding_idx < -num_embeddings: + raise ValueError("padding_idx must be within [-{}, {})".format( + num_embeddings, num_embeddings)) + self._dtype = self._helper.get_default_dtype() self._size = [self._num_embeddings, self._embedding_dim] self._weight_attr = weight_attr self._remote_prefetch = False self._name = name - self._weight = self.create_parameter( + self.weight = self.create_parameter( attr=self._weight_attr, shape=self._size, dtype=self._dtype, @@ -1671,7 +1705,7 @@ class Embedding(layers.Layer): def forward(self, x): return F.embedding( x, - weight=self._weight, + weight=self.weight, padding_idx=self._padding_idx, sparse=self._sparse, name=self._name) diff --git a/python/paddle/nn/layer/conv.py b/python/paddle/nn/layer/conv.py index 4e342c00528a2c0115940bb7f695e1ed5b582382..a610693a0a46b7e21d2c6d83716a7bc029677583 100644 --- a/python/paddle/nn/layer/conv.py +++ b/python/paddle/nn/layer/conv.py @@ -113,7 +113,7 @@ class _ConvNd(layers.Layer): attr=self._bias_attr, shape=[self._out_channels], is_bias=True) -class Conv1d(layers.Layer): +class Conv1d(_ConvNd): """ This interface is used to construct a callable object of the ``Conv1d`` class. For more details, refer to code examples. @@ -172,8 +172,7 @@ class Conv1d(layers.Layer): When in 'replicate' mode, uses input boundaries to pad the input tensor. When in 'circular' mode, uses circular input to pad the input tensor. Default is 'zeros'. - bias(bool, optional): Whether to use bias. Default: True. 
- param_attr (ParamAttr, optional): The parameter attribute for learnable weights(Parameter) + weight_attr (ParamAttr, optional): The parameter attribute for learnable weights(Parameter) of conv1d. If it is set to None or one attribute of ParamAttr, conv1d will create ParamAttr as param_attr. If the Initializer of the param_attr is not set, the parameter is initialized with :math:`Normal(0.0, std)`, @@ -227,205 +226,15 @@ class Conv1d(layers.Layer): dilation=1, groups=1, padding_mode='zeros', - bias=True, weight_attr=None, bias_attr=None, - data_format="NCL", - name=None): - super(Conv1d, self).__init__() - assert weight_attr is not False, "param_attr should not be False here." - self._in_channels = in_channels - self._out_channels = out_channels - self._groups = groups - if in_channels % groups != 0: - raise ValueError("in_channels must be divisible by groups.") - self._kernel_size = utils.convert_to_list(kernel_size, 1, 'kernel_size') - self._stride = utils.convert_to_list(stride, 1, 'stride') - self._dilation = utils.convert_to_list(dilation, 1, 'dilation') - self._padding = padding # leave it to F.conv1d - self._weight_attr = weight_attr - self._bias_attr = bias_attr - self._data_format = data_format - self._name = name - - self._padding_mode = padding_mode - - valid_padding_modes = {'zeros', 'reflect', 'replicate', 'circular'} - if padding_mode not in valid_padding_modes: - raise ValueError( - "padding_mode must be one of {}, but got padding_mode='{}'". 
- format(valid_padding_modes, padding_mode)) - - if padding_mode in {'reflect', 'replicate', 'circular' - } and not isinstance(padding, np.int): - raise ValueError( - "when padding_mode in ['reflect', 'replicate', 'circular'], type of padding must be int" - ) - if not isinstance(padding, str): - self._padding = utils.convert_to_list(padding, 1, 'padding') * 2 - - num_filter_channels = in_channels // groups - filter_shape = [self._out_channels, num_filter_channels - ] + self._kernel_size - - self.weight = self.create_parameter( - attr=self._weight_attr, - shape=filter_shape, - default_initializer=_get_default_param_initializer( - self._in_channels, filter_shape)) - self.bias = self.create_parameter( - attr=self._bias_attr, shape=[self._out_channels], - is_bias=True) if bias else None - - def forward(self, x): - padding = 0 - if self._padding_mode != "zeros": - x = F.pad(x, - self._padding, - mode=self._padding_mode, - data_format=self._data_format) - else: - padding = self._padding - - out = F.conv1d( - x, - self.weight, - bias=self.bias, - padding=padding, - stride=self._stride, - dilation=self._dilation, - groups=self._groups, - data_format=self._data_format, - name=self._name) - return out - - -class Conv2d(_ConvNd): - """ - This interface is used to construct a callable object of the ``Conv2d`` class. - For more details, refer to code examples. - The convolution2D layer calculates the output based on the input, filter - and strides, paddings, dilations, groups parameters. Input and - Output are in NCHW format, where N is batch size, C is the number of - the feature map, H is the height of the feature map, and W is the width of the feature map. - Filter's shape is [MCHW] , where M is the number of output feature map, - C is the number of input feature map, H is the height of the filter, - and W is the width of the filter. If the groups is greater than 1, - C will equal the number of input feature map divided by the groups. 
- Please refer to UFLDL's `convolution - `_ - for more details. - If bias attribution and activation type are provided, bias is added to the - output of the convolution, and the corresponding activation function is - applied to the final result. - For each input :math:`X`, the equation is: - - .. math:: - - Out = \sigma (W \\ast X + b) - - Where: - - * :math:`X`: Input value, a ``Tensor`` with NCHW format. - * :math:`W`: Filter value, a ``Tensor`` with shape [MCHW] . - * :math:`\\ast`: Convolution operation. - * :math:`b`: Bias value, a 2-D ``Tensor`` with shape [M, 1]. - * :math:`\\sigma`: Activation function. - * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. - - Parameters: - in_channels(int): The number of input channels in the input image. - out_channels(int): The number of output channels produced by the convolution. - kernel_size(int|list|tuple, optional): The size of the convolving kernel. - stride(int|list|tuple, optional): The stride size. If stride is a tuple, it must - contain three integers, (stride_H, stride_W). Otherwise, the - stride_H = stride_W = stride. The default value is 1. - padding(int|str|tuple|list, optional): The padding size. Padding coule be in one of the following forms. - 1. a string in ['valid', 'same']. - 2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding` - 3. a list[int] or tuple[int] whose length is the number of spartial dimensions, which contains the amount of padding on each side for each spartial dimension. It has the form [pad_d1, pad_d2, ...]. - 4. a list[int] or tuple[int] whose length is 2 * number of spartial dimensions. It has the form [pad_before, pad_after, pad_before, pad_after, ...] for all spartial dimensions. - 5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension are also included. 
Each pair of integers correspond to the amount of padding for a dimension of the input. Padding in batch dimension and channel dimension should be [0, 0] or (0, 0). - The default value is 0. - dilation(int|list|tuple, optional): The dilation size. If dilation is a tuple, it must - contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the - dilation_D = dilation_H = dilation_W = dilation. The default value is 1. - groups(int, optional): The groups number of the Conv3d Layer. According to grouped - convolution in Alex Krizhevsky's Deep CNN paper: when group=2, - the first half of the filters is only connected to the first half - of the input channels, while the second half of the filters is only - connected to the second half of the input channels. The default value is 1. - padding_mode(str, optional): ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``. Default: ``'zeros'``. - weight_attr(ParamAttr, optional): The parameter attribute for learnable parameters/weights - of conv2d. If it is set to None or one attribute of ParamAttr, conv2d - will create ParamAttr as param_attr. If it is set to None, the parameter - is initialized with :math:`Normal(0.0, std)`, and the :math:`std` is - :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. The default value is None. - bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of conv2d. - If it is set to False, no bias will be added to the output units. - If it is set to None or one attribute of ParamAttr, conv2d - will create ParamAttr as bias_attr. If the Initializer of the bias_attr - is not set, the bias is initialized zero. The default value is None. - data_format(str, optional): Data format that specifies the layout of input. - It can be "NCHW" or "NHWC". Default: "NCHW". - - Attribute: - - **weight** (Parameter): the learnable weights of filter of this layer. - - **bias** (Parameter or None): the learnable bias of this layer. 
- - Shape: - - - x: :math:`(N, C_{in}, H_{in}, W_{in})` - - - output: :math:`(N, C_{out}, H_{out}, W_{out})` - - Where - - .. math:: - - H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (kernel\_size[0] - 1) + 1))}{strides[0]} + 1 - - W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (kernel\_size[1] - 1) + 1))}{strides[1]} + 1 - - Examples: - - .. code-block:: python - - import numpy as np - import paddle - import paddle.nn as nn - x = np.random.uniform(-1, 1, (2, 4, 8, 8)).astype('float32') - - paddle.disable_static() - x_var = paddle.to_tensor(x) - conv = nn.Conv2d(4, 6, (3, 3)) - y_var = conv(x_var) - y_np = y_var.numpy() - print(y_np.shape) - - # (2, 6, 6, 6) - """ - - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - padding_mode='zeros', - weight_attr=None, - bias_attr=None, - data_format="NCHW"): - super(Conv2d, self).__init__( + data_format="NCL"): + super(Conv1d, self).__init__( in_channels, out_channels, kernel_size, False, - 2, + 1, stride=stride, padding=padding, padding_mode=padding_mode, @@ -436,25 +245,20 @@ class Conv2d(_ConvNd): data_format=data_format) def forward(self, x): - if self._padding_mode != 'zeros': + padding = 0 + if self._padding_mode != "zeros": x = F.pad(x, - self._reversed_padding_repeated_twice, + self._padding, mode=self._padding_mode, data_format=self._data_format) - return F.conv2d( - x, - self.weight, - bias=self.bias, - stride=self._stride, - dilation=self._dilation, - groups=self._groups, - data_format=self._data_format) + else: + padding = self._padding - out = F.conv2d( + out = F.conv1d( x, self.weight, bias=self.bias, - padding=self._padding, + padding=padding, stride=self._stride, dilation=self._dilation, groups=self._groups, @@ -462,7 +266,7 @@ class Conv2d(_ConvNd): return out -class ConvTranspose1d(layers.Layer): +class ConvTranspose1d(_ConvNd): """ This interface is used to construct a callable object of the ``ConvTranspose1d`` 
class. For more details, refer to code examples. @@ -603,34 +407,24 @@ class ConvTranspose1d(layers.Layer): padding=0, output_padding=0, groups=1, - bias=True, dilation=1, weight_attr=None, bias_attr=None, data_format="NCL"): - super(ConvTranspose1d, self).__init__() - assert weight_attr is not False, "param_attr should not be False in ConvTranspose1d." - self._param_attr = weight_attr - self._bias_attr = bias_attr - self._groups = groups - self._in_channels = in_channels - self._out_channels = out_channels - self._output_padding = output_padding - self._data_format = data_format - self._bias = bias - - self._stride = utils.convert_to_list(stride, 1, 'stride') - self._dilation = utils.convert_to_list(dilation, 1, 'dilation') - self._kernel_size = utils.convert_to_list(kernel_size, 1, 'kernel_size') - self._padding = padding - - filter_shape = [self._in_channels, out_channels // groups - ] + self._kernel_size - self.weight = self.create_parameter( - shape=filter_shape, attr=self._param_attr) - self.bias = self.create_parameter( - attr=self._bias_attr, shape=[self._out_channels], - is_bias=True) if self._bias else None + super(ConvTranspose1d, self).__init__( + in_channels, + out_channels, + kernel_size, + True, + 1, + stride=stride, + padding=padding, + dilation=dilation, + output_padding=output_padding, + groups=groups, + weight_attr=weight_attr, + bias_attr=bias_attr, + data_format=data_format) def forward(self, x, output_size=None): out = F.conv_transpose1d( @@ -638,7 +432,169 @@ class ConvTranspose1d(layers.Layer): self.weight, bias=self.bias, output_size=output_size, - output_padding=self._output_padding, + output_padding=self.output_padding, + padding=self._padding, + stride=self._stride, + dilation=self._dilation, + groups=self._groups, + data_format=self._data_format) + return out + + +class Conv2d(_ConvNd): + """ + This interface is used to construct a callable object of the ``Conv2d`` class. + For more details, refer to code examples. 
+ The convolution2D layer calculates the output based on the input, filter + and strides, paddings, dilations, groups parameters. Input and + Output are in NCHW format, where N is batch size, C is the number of + the feature map, H is the height of the feature map, and W is the width of the feature map. + Filter's shape is [MCHW] , where M is the number of output feature map, + C is the number of input feature map, H is the height of the filter, + and W is the width of the filter. If the groups is greater than 1, + C will equal the number of input feature map divided by the groups. + Please refer to UFLDL's `convolution + `_ + for more details. + If bias attribution and activation type are provided, bias is added to the + output of the convolution, and the corresponding activation function is + applied to the final result. + For each input :math:`X`, the equation is: + + .. math:: + + Out = \sigma (W \\ast X + b) + + Where: + + * :math:`X`: Input value, a ``Tensor`` with NCHW format. + * :math:`W`: Filter value, a ``Tensor`` with shape [MCHW] . + * :math:`\\ast`: Convolution operation. + * :math:`b`: Bias value, a 2-D ``Tensor`` with shape [M, 1]. + * :math:`\\sigma`: Activation function. + * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. + + Parameters: + in_channels(int): The number of input channels in the input image. + out_channels(int): The number of output channels produced by the convolution. + kernel_size(int|list|tuple, optional): The size of the convolving kernel. + stride(int|list|tuple, optional): The stride size. If stride is a tuple, it must + contain three integers, (stride_H, stride_W). Otherwise, the + stride_H = stride_W = stride. The default value is 1. + padding(int|str|tuple|list, optional): The padding size. Padding coule be in one of the following forms. + 1. a string in ['valid', 'same']. + 2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding` + 3. 
a list[int] or tuple[int] whose length is the number of spartial dimensions, which contains the amount of padding on each side for each spartial dimension. It has the form [pad_d1, pad_d2, ...]. + 4. a list[int] or tuple[int] whose length is 2 * number of spartial dimensions. It has the form [pad_before, pad_after, pad_before, pad_after, ...] for all spartial dimensions. + 5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension are also included. Each pair of integers correspond to the amount of padding for a dimension of the input. Padding in batch dimension and channel dimension should be [0, 0] or (0, 0). + The default value is 0. + dilation(int|list|tuple, optional): The dilation size. If dilation is a tuple, it must + contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the + dilation_D = dilation_H = dilation_W = dilation. The default value is 1. + groups(int, optional): The groups number of the Conv3d Layer. According to grouped + convolution in Alex Krizhevsky's Deep CNN paper: when group=2, + the first half of the filters is only connected to the first half + of the input channels, while the second half of the filters is only + connected to the second half of the input channels. The default value is 1. + padding_mode(str, optional): ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``. Default: ``'zeros'``. + weight_attr(ParamAttr, optional): The parameter attribute for learnable parameters/weights + of conv2d. If it is set to None or one attribute of ParamAttr, conv2d + will create ParamAttr as param_attr. If it is set to None, the parameter + is initialized with :math:`Normal(0.0, std)`, and the :math:`std` is + :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. The default value is None. + bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of conv2d. 
+ If it is set to False, no bias will be added to the output units. + If it is set to None or one attribute of ParamAttr, conv2d + will create ParamAttr as bias_attr. If the Initializer of the bias_attr + is not set, the bias is initialized zero. The default value is None. + data_format(str, optional): Data format that specifies the layout of input. + It can be "NCHW" or "NHWC". Default: "NCHW". + + Attribute: + + **weight** (Parameter): the learnable weights of filter of this layer. + + **bias** (Parameter or None): the learnable bias of this layer. + + Shape: + + - x: :math:`(N, C_{in}, H_{in}, W_{in})` + + - output: :math:`(N, C_{out}, H_{out}, W_{out})` + + Where + + .. math:: + + H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (kernel\_size[0] - 1) + 1))}{strides[0]} + 1 + + W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (kernel\_size[1] - 1) + 1))}{strides[1]} + 1 + + Examples: + + .. code-block:: python + + import numpy as np + import paddle + import paddle.nn as nn + x = np.random.uniform(-1, 1, (2, 4, 8, 8)).astype('float32') + + paddle.disable_static() + x_var = paddle.to_tensor(x) + conv = nn.Conv2d(4, 6, (3, 3)) + y_var = conv(x_var) + y_np = y_var.numpy() + print(y_np.shape) + + # (2, 6, 6, 6) + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + padding_mode='zeros', + weight_attr=None, + bias_attr=None, + data_format="NCHW"): + super(Conv2d, self).__init__( + in_channels, + out_channels, + kernel_size, + False, + 2, + stride=stride, + padding=padding, + padding_mode=padding_mode, + dilation=dilation, + groups=groups, + weight_attr=weight_attr, + bias_attr=bias_attr, + data_format=data_format) + + def forward(self, x): + if self._padding_mode != 'zeros': + x = F.pad(x, + self._reversed_padding_repeated_twice, + mode=self._padding_mode, + data_format=self._data_format) + return F.conv2d( + x, + self.weight, + bias=self.bias, + stride=self._stride, + 
dilation=self._dilation, + groups=self._groups, + data_format=self._data_format) + + out = F.conv2d( + x, + self.weight, + bias=self.bias, padding=self._padding, stride=self._stride, dilation=self._dilation, @@ -920,8 +876,8 @@ class Conv3d(_ConvNd): in_channels, out_channels, kernel_size, - padding=0, stride=1, + padding=0, dilation=1, groups=1, padding_mode='zeros', @@ -1128,7 +1084,7 @@ class ConvTranspose3d(_ConvNd): bias_attr=bias_attr, data_format=data_format) - def forward(self, x, output_size): + def forward(self, x, output_size=None): if output_size is None: output_padding = self.output_padding else: diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py index a60e615d5064bf4ef2229dd67193774030383888..271dc9b4e685ce06cdb12ccdcb6bb0704a5ef2a1 100644 --- a/python/paddle/nn/layer/loss.py +++ b/python/paddle/nn/layer/loss.py @@ -627,10 +627,13 @@ class KLDivLoss(fluid.dygraph.Layer): $$l(x, y) = y * (\log(y) - x)$$ Parameters: - reduction (str, optional): Indicate how to average the loss, - the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. - If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; - Default is ``'mean'``. + reduction (Tensor): Indicate how to average the loss, + the candicates are ``'none'`` | ``'batchmean'`` | ``'mean'`` | ``'sum'``. + If `reduction` is ``'mean'``, the reduced mean loss is returned; + If `reduction` is ``'batchmean'``, the sum loss divided by batch size is returned; + if `reduction` is ``'sum'``, the reduced sum loss is returned; + if `reduction` is ``'none'``, no reduction will be apllied. + Default is ``'mean'``. 
Shape: @@ -654,11 +657,11 @@ class KLDivLoss(fluid.dygraph.Layer): x = np.random.uniform(-10, 10, shape).astype('float32') target = np.random.uniform(-10, 10, shape).astype('float32') - # 'batchmean' reduction, loss shape will be [N] + # 'batchmean' reduction, loss shape will be [1] kldiv_criterion = nn.KLDivLoss(reduction='batchmean') pred_loss = kldiv_criterion(paddle.to_tensor(x), paddle.to_tensor(target)) - # shape=[5] + # shape=[1] # 'mean' reduction, loss shape will be [1] kldiv_criterion = nn.KLDivLoss(reduction='mean') @@ -684,7 +687,7 @@ class KLDivLoss(fluid.dygraph.Layer): self.reduction = reduction def forward(self, input, label): - out = paddle.nn.functional.kl_div(input, label, self.reduction) + out = F.kl_div(input, label, self.reduction) return out diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index 4d25418579d74ae896f8ca590400a0a334047e93..2000fbf388f88d1da7119402104706a433cebf06 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -78,7 +78,7 @@ class _InstanceNormBase(layers.Layer): super(_InstanceNormBase, self).__init__() if weight_attr == False or bias_attr == False: - assert weight_attr == param_attr, "weight_attr and bias_attr must be set to Fasle at the same time in InstanceNorm" + assert weight_attr == bias_attr, "weight_attr and bias_attr must be set to Fasle at the same time in InstanceNorm" self._epsilon = epsilon self._weight_attr = weight_attr self._bias_attr = bias_attr @@ -176,7 +176,7 @@ class InstanceNorm1d(_InstanceNormBase): instance_norm = paddle.nn.InstanceNorm1d(2) instance_norm_out = instance_norm(x) - print(instance_norm_out.numpy) + print(instance_norm_out.numpy()) """ @@ -253,7 +253,7 @@ class InstanceNorm2d(_InstanceNormBase): instance_norm = paddle.nn.InstanceNorm2d(2) instance_norm_out = instance_norm(x) - print(instance_norm_out.numpy) + print(instance_norm_out.numpy()) """ def _check_input_dim(self, input): @@ -329,7 +329,7 @@ class 
InstanceNorm3d(_InstanceNormBase): instance_norm = paddle.nn.InstanceNorm3d(2) instance_norm_out = instance_norm(x) - print(instance_norm_out.numpy) + print(instance_norm_out.numpy()) """ def _check_input_dim(self, input): @@ -346,8 +346,8 @@ class GroupNorm(layers.Layer): Refer to `Group Normalization `_ . Parameters: - num_channels(int): The number of channels of input. num_groups(int): The number of groups that divided from channels. + num_channels(int): The number of channels of input. epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-05. weight_attr(ParamAttr|bool, optional): The parameter attribute for the learnable @@ -375,19 +375,19 @@ class GroupNorm(layers.Layer): np.random.seed(123) x_data = np.random.random(size=(2, 6, 2, 2)).astype('float32') x = paddle.to_tensor(x_data) - group_norm = paddle.nn.GroupNorm(num_channels=3, num_groups=6) + group_norm = paddle.nn.GroupNorm(num_channels=6, num_groups=6) group_norm_out = group_norm(x) - print(group_norm_out.numpy) + print(group_norm_out.numpy()) """ def __init__(self, - num_channels, num_groups, + num_channels, epsilon=1e-05, weight_attr=None, bias_attr=None, - data_layout='NCHW', + data_format='NCHW', name=None): super(GroupNorm, self).__init__() self._weight_attr = weight_attr @@ -395,18 +395,33 @@ class GroupNorm(layers.Layer): self._epsilon = epsilon self._num_channels = num_channels self._num_groups = num_groups - if data_layout != 'NCHW': + if data_format != 'NCHW': raise ValueError("unsupported data layout:" + data_layout) param_shape = [self._num_channels] - self.weight = self.create_parameter( - attr=self._weight_attr or False, - shape=param_shape, - default_initializer=Constant(1.0)) + if weight_attr == False: + self.weight = self.create_parameter( + attr=None, shape=param_shape, default_initializer=Constant(1.0)) + self.weight.stop_gradient = True + else: + self.weight = self.create_parameter( + attr=self._weight_attr, + shape=param_shape, + 
default_initializer=Constant(1.0)) + self.weight.stop_gradient = self._weight_attr != None and self._weight_attr.learning_rate == 0. - self.bias = self.create_parameter( - attr=self._weight_attr or False, shape=param_shape, is_bias=True) + if bias_attr == False: + self.bias = self.create_parameter( + attr=None, + shape=param_shape, + default_initializer=Constant(0.0), + is_bias=True) + self.bias.stop_gradient = True + else: + self.bias = self.create_parameter( + attr=self._bias_attr, shape=param_shape, is_bias=True) + self.bias.stop_gradient = self._bias_attr != None and self._bias_attr.learning_rate == 0. def forward(self, input): inputs = {'X': input} @@ -500,7 +515,7 @@ class LayerNorm(layers.Layer): layer_norm = paddle.nn.LayerNorm(x_data.shape[1:]) layer_norm_out = layer_norm(x) - print(layer_norm_out.numpy) + print(layer_norm_out.numpy()) """ def __init__(self, @@ -603,8 +618,7 @@ class _BatchNormBase(layers.Layer): initializer=Constant(0.0), trainable=False, do_model_average=True), - shape=param_shape, - dtype=self._dtype) + shape=param_shape) self._mean.stop_gradient = True self._variance = self.create_parameter( @@ -613,8 +627,7 @@ class _BatchNormBase(layers.Layer): initializer=Constant(1.0), trainable=False, do_model_average=True), - shape=param_shape, - dtype=self._dtype) + shape=param_shape) self._variance.stop_gradient = True self._data_format = data_format @@ -628,8 +641,13 @@ class _BatchNormBase(layers.Layer): def _check_input_dim(self, input): raise NotImplementedError("BatchNorm Base error") + def _check_data_format(self, input): + raise NotImplementedError("BatchNorm Base data format error") + def forward(self, input): + self._check_data_format(self._data_format) + self._check_input_dim(input) if not self.training and not self._track_running_stats: @@ -730,9 +748,15 @@ class BatchNorm1d(_BatchNormBase): batch_norm = paddle.nn.BatchNorm1d(1) batch_norm_out = batch_norm(x) - print(batch_norm_out.numpy) + print(batch_norm_out.numpy()) """ + def 
_check_data_format(self, input): + if input == 'NCHW' or input == 'NC' or input == 'NCL': + self._data_format = 'NCHW' + else: + raise ValueError('expected NC , NCL or None for data_format input') + def _check_input_dim(self, input): if len(input.shape) != 2 and len(input.shape) != 3: raise ValueError('expected 2D or 3D input (got {}D input)'.format( @@ -787,7 +811,7 @@ class BatchNorm2d(_BatchNormBase): If it is set to None or one attribute of ParamAttr, batch_norm will create ParamAttr as bias_attr. If it is set to Fasle, the weight is not learnable. If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None. - data_format(str, optional): Specify the input data format, the data format can be "NCHW" or "NHWC". Default: NCHW. + data_format(str, optional): Specify the input data format, the data format can be "NCHW". Default: NCHW. track_running_stats(bool, optional): Whether to use global mean and variance. In train period, True will track global mean and variance used for inference. When inference, track_running_stats must be True. Default: True. 
@@ -816,9 +840,15 @@ class BatchNorm2d(_BatchNormBase): batch_norm = paddle.nn.BatchNorm2d(1) batch_norm_out = batch_norm(x) - print(batch_norm_out.numpy) + print(batch_norm_out.numpy()) """ + def _check_data_format(self, input): + if input == 'NCHW': + self._data_format = input + else: + raise ValueError('expected NCHW for data_format input') + def _check_input_dim(self, input): if len(input.shape) != 4: raise ValueError('expected 4D input (got {}D input)'.format( @@ -902,9 +932,15 @@ class BatchNorm3d(_BatchNormBase): batch_norm = paddle.nn.BatchNorm3d(1) batch_norm_out = batch_norm(x) - print(batch_norm_out.numpy) + print(batch_norm_out.numpy()) """ + def _check_data_format(self, input): + if input == 'NCHW' or input == 'NCDHW': + self._data_format = 'NCHW' + else: + raise ValueError('expected NCDHW or None for data_format input') + def _check_input_dim(self, input): if len(input.shape) != 5: raise ValueError('expected 5D input (got {}D input)'.format( @@ -1094,10 +1130,10 @@ class SyncBatchNorm(_BatchNormBase): """ layer_output = layer if isinstance(layer, _BatchNormBase): - layer_output = SyncBatchNorm(layer._num_features, layer._epsilon, - layer._momentum, layer._weight_attr, - layer._bias_attr, layer._data_format, - layer._name) + layer_output = SyncBatchNorm( + layer._num_features, layer._momentum, layer._epsilon, + layer._weight_attr, layer._bias_attr, layer._data_format, + layer._track_running_stats, layer._name) if layer._weight_attr != False and layer._bias_attr != False: with no_grad(): diff --git a/python/paddle/nn/layer/pooling.py b/python/paddle/nn/layer/pooling.py index 4cb661cf541222ec4f05df0fdc69b6483f04cf55..129dae93b38327308263550e73031b607b2eacc3 100755 --- a/python/paddle/nn/layer/pooling.py +++ b/python/paddle/nn/layer/pooling.py @@ -87,6 +87,7 @@ class AvgPool1d(layers.Layer): Examples: .. 
code-block:: python + import paddle import paddle.nn as nn paddle.disable_static() @@ -176,6 +177,7 @@ class AvgPool2d(layers.Layer): ShapeError: If the output's shape calculated is not greater than 0. Examples: .. code-block:: python + import paddle import paddle.nn as nn import numpy as np @@ -267,6 +269,7 @@ class AvgPool3d(layers.Layer): Examples: .. code-block:: python + import paddle import paddle.nn as nn import numpy as np @@ -457,6 +460,7 @@ class MaxPool2d(layers.Layer): Examples: .. code-block:: python + import paddle import paddle.nn as nn import numpy as np @@ -547,6 +551,7 @@ class MaxPool3d(layers.Layer): Examples: .. code-block:: python + import paddle import paddle.nn as nn import numpy as np @@ -849,7 +854,7 @@ class AdaptiveMaxPool1d(layers.Layer): lend &= ceil((i + 1) * L_{in} / L_{out}) - Output(i) &= max(Input[lstart:lend])} + Output(i) &= max(Input[lstart:lend]) Args: output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, @@ -915,8 +920,11 @@ class AdaptiveMaxPool2d(layers.Layer): """ This operation applies 2D adaptive max pooling on input tensor. The h and w dimensions of the output tensor are determined by the parameter output_size. The difference between adaptive pooling and pooling is adaptive one focus on the output size. + For adaptive max pool2d: + .. math:: + hstart &= floor(i * H_{in} / H_{out}) hend &= ceil((i + 1) * H_{in} / H_{out}) wstart &= floor(j * W_{in} / W_{out}) @@ -931,11 +939,12 @@ class AdaptiveMaxPool2d(layers.Layer): Shape: x (Tensor): The input tensor of adaptive max pool2d operator, which is a 4-D tensor. The data type can be float32, float64. output (Tensor): The output tensor of adaptive max pool2d operator, which is a 4-D tensor. The data type is same as input x. - + Returns: A callable object of AdaptiveMaxPool2d. Examples: .. 
code-block:: python + # adaptive max pool2d # suppose input data in shape of [N, C, H, W], `output_size` is [m, n], # output shape is [N, C, m, n], adaptive pool divide H and W dimensions @@ -976,10 +985,13 @@ class AdaptiveMaxPool2d(layers.Layer): class AdaptiveMaxPool3d(layers.Layer): """ - This operation applies 3D adaptive max pooling on input tensor. The h and w dimensions + This operation applies 3D adaptive max pooling on input tensor. The h and w dimensions of the output tensor are determined by the parameter output_size. The difference between adaptive pooling and pooling is adaptive one focus on the output size. + For adaptive max pool3d: + .. math:: + dstart &= floor(i * D_{in} / D_{out}) dend &= ceil((i + 1) * D_{in} / D_{out}) hstart &= floor(j * H_{in} / H_{out}) @@ -987,10 +999,9 @@ class AdaptiveMaxPool3d(layers.Layer): wstart &= floor(k * W_{in} / W_{out}) wend &= ceil((k + 1) * W_{in} / W_{out}) Output(i ,j, k) &= max(Input[dstart:dend, hstart:hend, wstart:wend]) + Parameters: - output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, - it must contain three elements, (D, H, W). D, H and W can be either a int, or None which means - the size will be the same as that of the input. + output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain three elements, (D, H, W). D, H and W can be either a int, or None which means the size will be the same as that of the input. return_indices (bool): If true, the index of max pooling point will be returned along with outputs. Default False. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and @@ -1002,6 +1013,7 @@ class AdaptiveMaxPool3d(layers.Layer): A callable object of AdaptiveMaxPool3d. Examples: .. 
code-block:: python + # adaptive max pool3d # suppose input data in shape of [N, C, D, H, W], `output_size` is [l, m, n], # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimensions @@ -1028,10 +1040,10 @@ class AdaptiveMaxPool3d(layers.Layer): pool = paddle.nn.AdaptiveMaxPool3d(output_size=4) out = pool(x) # out shape: [2, 3, 4, 4, 4] - pool, indices = paddle.nn.AdaptiveMaxPool3d(output_size=3, return_indices=True) - out = pool(x) + pool = paddle.nn.AdaptiveMaxPool3d(output_size=3, return_indices=True) + out, indices = pool(x) # out shape: [2, 3, 4, 4, 4], indices shape: [2, 3, 4, 4, 4] - + """ def __init__(self, output_size, return_indices=False, name=None): diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py index 63069e83952172df3136458ebfee4b446749934d..4b199d5816c808d4975c51bc154ad21d46f135eb 100644 --- a/python/paddle/nn/layer/transformer.py +++ b/python/paddle/nn/layer/transformer.py @@ -53,7 +53,22 @@ def _convert_param_attr_to_list(param_attr, n): if isinstance(param_attr, (list, tuple)): assert len(param_attr) == n, ( "length of param_attr should be %d when it is a list/tuple" % n) - param_attrs = [ParamAttr._to_attr(attr) for attr in param_attr] + param_attrs = [] + for attr in param_attr: + if isinstance(attr, bool): + if attr: + param_attrs.append(ParamAttr._to_attr(None)) + else: + param_attrs.append(False) + else: + param_attrs.append(ParamAttr._to_attr(attr)) + # param_attrs = [ParamAttr._to_attr(attr) for attr in param_attr] + elif isinstance(param_attr, bool): + param_attrs = [] + if param_attr: + param_attrs = [ParamAttr._to_attr(None) for i in range(n)] + else: + param_attrs = [False] * n else: param_attrs = [] attr = ParamAttr._to_attr(param_attr) @@ -417,7 +432,7 @@ class TransformerEncoderLayer(Layer): Otherwise, MHA and FFN both use it as `weight_attr` to create parameters. Default: None, which means the default weight parameter property is used. 
See usage for details in :code:`ParamAttr` . - bias_attr (ParamAttr|tuple, optional): To specify the bias parameter property. + bias_attr (ParamAttr|tuple|bool, optional): To specify the bias parameter property. If it is a tuple, `bias_attr[0]` would be used as `bias_attr` for MHA, and `bias_attr[1]` would be used as `bias_attr` for linear in FFN. Otherwise, MHA and FFN both use it as `bias_attr` to create parameters. @@ -986,22 +1001,31 @@ class Transformer(Layer): Otherwise, no pre-process and post-precess includes dropout, residual connection, layer normalization. Default False weight_attr(ParamAttr|tuple, optional): To specify the weight parameter property. - If it is a tuple, `weight_attr[0]` would be used as `weight_attr` for - self attention, `weight_attr[1]` would be used as `weight_attr` for - cross attention, and `weight_attr[2]` would be used as `weight_attr` - for linear in FFN. Otherwise, the three sub-layers all uses it as - `weight_attr` to create parameters. Default: None, which means the - default weight parameter property is used. See usage for details + If it is a tuple, the length of `weight_attr` could be 1, 2 or 3. If it is 3, + `weight_attr[0]` would be used as `weight_attr` for self attention, `weight_attr[1]` + would be used as `weight_attr` for cross attention of `TransformerDecoder`, + and `weight_attr[2]` would be used as `weight_attr` for linear in FFN. + If it is 2, `weight_attr[0]` would be used as `weight_attr` both for self attention + and cross attntion and `weight_attr[1]` would be used as `weight_attr` for + linear in FFN. If it is 1, `weight_attr[0]` would be used as `weight_attr` + for self attention, cross attention and linear in FFN. Otherwise, + the three sub-layers all uses it as `weight_attr` to create parameters. + Default: None, which means the default weight parameter property is used. + See usage for details in :code:`ParamAttr` . bias_attr (ParamAttr|tuple, optional): To specify the bias parameter property. 
- If it is a tuple, `bias_attr[0]` would be used as `bias_attr` for - self attention, `bias_attr[1]` would be used as `bias_attr` for - cross attention, and `bias_attr[2]` would be used as `bias_attr` - for linear in FFN. Otherwise, the three sub-layers all uses it as - `bias_attr` to create parameters. The `False` value means the - corresponding layer would not have trainable bias parameter. See - usage for details in :code:`ParamAttr` . Default: None,which means - the default bias parameter property is used. + If it is a tuple, the length of `bias_attr` could be 1, 2 or 3. If it is 3, + `bias_attr[0]` would be used as `bias_attr` for self attention, `bias_attr[1]` + would be used as `bias_attr` for cross attention of `TransformerDecoder`, + and `bias_attr[2]` would be used as `bias_attr` for linear in FFN. + If it is 2, `bias_attr[0]` would be used as `bias_attr` both for self attention + and cross attntion and `bias_attr[1]` would be used as `bias_attr` for + linear in FFN. If it is 1, `bias_attr[0]` would be used as `bias_attr` + for self attention, cross attention and linear in FFN. Otherwise, + the three sub-layers all uses it as `bias_attr` to create parameters. + The `False` value means the corresponding layer would not have trainable + bias parameter. See usage for details in :code:`ParamAttr` . + Default: None,which means the default bias parameter property is used. custom_encoder (Layer): If custom encoder is provided, use it as the encoder. Default None custom_decoder (Layer): If custom decoder is provided, use it as the decoder. 
@@ -1049,13 +1073,51 @@ class Transformer(Layer): custom_decoder=None): super(Transformer, self).__init__() + if isinstance(bias_attr, (list, tuple)): + if len(bias_attr) == 1: + encoder_bias_attr = [bias_attr[0]] * 2 + decoder_bias_attr = [bias_attr[0]] * 3 + elif len(bias_attr) == 2: + encoder_bias_attr = bias_attr + decoder_bias_attr = [bias_attr[0], bias_attr[0], bias_attr[-1]] + elif len(bias_attr) == 3: + encoder_bias_attr = [bias_attr[0], bias_attr[-1]] + decoder_bias_attr = bias_attr + else: + assert False, ( + "length of bias_attr should be 1 or 2 or 3 when it is a list/tuple" + ) + else: + encoder_bias_attr = bias_attr + decoder_bias_attr = bias_attr + + if isinstance(weight_attr, (list, tuple)): + if len(weight_attr) == 1: + encoder_weight_attr = [weight_attr[0]] * 2 + decoder_weight_attr = [weight_attr[0]] * 3 + elif len(weight_attr) == 2: + encoder_weight_attr = weight_attr + decoder_weight_attr = [ + weight_attr[0], weight_attr[0], weight_attr[-1] + ] + elif len(weight_attr) == 3: + encoder_weight_attr = [weight_attr[0], weight_attr[-1]] + decoder_weight_attr = weight_attr + else: + assert False, ( + "length of weight_attr should be 1 or 2 or 3 when it is a list/tuple" + ) + else: + encoder_weight_attr = weight_attr + decoder_weight_attr = weight_attr + if custom_encoder is not None: self.encoder = custom_encoder else: encoder_layer = TransformerEncoderLayer( d_model, nhead, dim_feedforward, dropout, activation, - attn_dropout, act_dropout, normalize_before, weight_attr, - bias_attr) + attn_dropout, act_dropout, normalize_before, + encoder_weight_attr, encoder_bias_attr) encoder_norm = LayerNorm(d_model) self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm) @@ -1065,8 +1127,8 @@ class Transformer(Layer): else: decoder_layer = TransformerDecoderLayer( d_model, nhead, dim_feedforward, dropout, activation, - attn_dropout, act_dropout, normalize_before, weight_attr, - bias_attr) + attn_dropout, act_dropout, normalize_before, + 
decoder_weight_attr, decoder_bias_attr) decoder_norm = LayerNorm(d_model) self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm) diff --git a/python/paddle/nn/utils/weight_norm_hook.py b/python/paddle/nn/utils/weight_norm_hook.py index ad53bf394660f3a7e0e48fdbd5eb530abd0852bb..7a21e7661d4e78d0004996ee67c80ddc35006bc3 100644 --- a/python/paddle/nn/utils/weight_norm_hook.py +++ b/python/paddle/nn/utils/weight_norm_hook.py @@ -112,6 +112,14 @@ class WeightNorm(object): if dim is None: dim = -1 + # support dim is negative numeber, (dim = -1) == (dim = None) + weight_dim = len(layer._parameters[name].shape) + assert ( + dim < weight_dim and dim >= -1 * weight_dim + ), "dim must set between [-R, R), R means the dimension of weight." + if dim != -1: + dim = (dim + weight_dim) % weight_dim + fn = WeightNorm(name, dim) w = getattr(layer, name) diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py index 3150b8c2d0363274dfb6fd3465110c89339cd4c9..24cebf8e6e6388a2d1e9711e3f862090918876a3 100644 --- a/python/paddle/optimizer/adam.py +++ b/python/paddle/optimizer/adam.py @@ -250,3 +250,46 @@ class Adam(Optimizer): stop_gradient=True) return adam_op + + @framework.dygraph_only + def step(self): + """ + Execute the optimizer and update parameters once. + + Returns: + None + + Examples: + .. code-block:: python + + import paddle + import numpy as np + paddle.disable_static() + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.to_tensor(value) + linear = paddle.nn.Linear(13, 5) + # This can be any optimizer supported by dygraph. 
+ adam = paddle.optimizer.Adam(learning_rate = 0.01, + parameters = linear.parameters()) + out = linear(a) + out.backward() + adam.step() + adam.clear_grad() + """ + parameter_list = self._parameter_list + self._dtype = None + params_grads = [] + for param in self._parameter_list: + if not param.trainable: + continue + if param._grad_ivar() is not None: + grad_var = param._grad_ivar() + if hasattr(grad_var, "_is_sparse") and grad_var._is_sparse( + ) and self.regularization is not None: + raise RuntimeError( + "Adam don't support weight_decay with sparse parameters, please set it to None." + ) + params_grads.append((param, grad_var)) + + optimize_ops = self._apply_optimize( + loss=None, startup_program=None, params_grads=params_grads) diff --git a/python/paddle/optimizer/lr_scheduler.py b/python/paddle/optimizer/lr_scheduler.py index 4ecaffb8fa509bdc54067bb25f8d1b5191b7ac1b..61391704061bda7dfbad7252cbc04c0b7d6492a4 100644 --- a/python/paddle/optimizer/lr_scheduler.py +++ b/python/paddle/optimizer/lr_scheduler.py @@ -109,7 +109,7 @@ class _LRScheduler(object): """ self.keys = ['last_epoch', 'last_lr'] - def set_dict(self, state_dict): + def set_state_dict(self, state_dict): """ Loads the schedulers state. """ @@ -126,8 +126,8 @@ class _LRScheduler(object): "There are some unused values in state_dict. 
Maybe the optimizer have different 'LearningRateDecay' when invoking state_dict and set_dict" ) - # alias for set_dict - set_state_dict = set_dict + # alias for set_state_dict + set_dict = set_state_dict def get_lr(self): # calculate by python float diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index 2f7bc94e646324b849b0308b219261f56eba1e28..1bd9a1f144ed4b5c69d76070eadc317e2063e25b 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -169,7 +169,7 @@ class Optimizer(object): import paddle paddle.disable_static() - emb = paddle.nn.Embedding([10, 10]) + emb = paddle.nn.Embedding(10, 10) adam = paddle.optimizer.Adam(0.001, parameters=emb.parameters()) state_dict = adam.state_dict() @@ -199,7 +199,7 @@ class Optimizer(object): import paddle paddle.disable_static() - emb = paddle.nn.Embedding([10, 10]) + emb = paddle.nn.Embedding(10, 10) state_dict = emb.state_dict() paddle.framework.save(state_dict, "paddle_dy") @@ -371,7 +371,7 @@ class Optimizer(object): import paddle # example1: _LRScheduler is not used, return value is all the same paddle.disable_static() - emb = paddle.nn.Embedding([10, 10]) + emb = paddle.nn.Embedding(10, 10) adam = paddle.optimizer.Adam(0.001, parameters = emb.parameters()) lr = adam.get_lr() print(lr) # 0.001 diff --git a/python/paddle/reader/__init__.py b/python/paddle/reader/__init__.py index 29337cf06682f5f5bf8e0e6d9b1bf8ec32512d45..881cfd813141653fed8e7d9107cdebe54c9df791 100644 --- a/python/paddle/reader/__init__.py +++ b/python/paddle/reader/__init__.py @@ -66,4 +66,4 @@ An example implementation for multiple item data reader creator: import paddle.reader.decorator from paddle.reader.decorator import * -__all__ = decorator.__all__ +__all__ = [] diff --git a/python/paddle/reader/decorator.py b/python/paddle/reader/decorator.py index ff09f4c562aeb8216dea6e1d40b9492c257aeab4..8ee4d73ea847ea116ea4401b5b05ef1b925950fe 100644 --- 
a/python/paddle/reader/decorator.py +++ b/python/paddle/reader/decorator.py @@ -32,6 +32,21 @@ import random import zlib import paddle.compat as cpt +# On macOS, the 'spawn' start method is now the default in Python3.8 multiprocessing, +# Paddle is currently unable to solve this, so forces the process to start using +# the 'fork' start method. +# +# TODO: This solution is not good, because the fork start method could lead to +# crashes of the subprocess. Figure out how to make 'spawn' work. +# +# For more details, please refer to +# https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods +# https://bugs.python.org/issue33725 +if sys.version_info >= (3, 8) and sys.platform == 'darwin': + fork_context = multiprocessing.get_context('fork') +else: + fork_context = multiprocessing + def cache(reader): """ @@ -47,6 +62,22 @@ def cache(reader): Returns: generator: a decorated reader object which yields data from cached memory. + + Examples: + .. code-block:: python + + import paddle + + def reader(): + for i in range(3): + yield i + + # All data is cached into memory + cached_reader = paddle.io.cache(reader) + + # Output: 0 1 2 + for i in cached_reader(): + print(i) """ all_data = tuple(reader()) @@ -281,12 +312,28 @@ def buffered(reader, size): buffer. Reading from the buffered data reader will proceed as long as the buffer is not empty. - :param reader: the data reader to read from. - :type reader: callable - :param size: max buffer size. - :type size: int + Args: + reader(generator): the data reader to read from. + size(int): max buffer size. + + Returns: + generator: the buffered data reader. + + Examples: + .. code-block:: python - :returns: the buffered data reader. + import paddle + + def reader(): + for i in range(3): + yield i + + # Create a buffered reader, and the buffer size is 2. 
+ buffered_reader = paddle.io.buffered(reader, 2) + + # Output: 0 1 2 + for i in buffered_reader(): + print(i) """ class EndSignal(): @@ -560,9 +607,9 @@ def multiprocess_reader(readers, use_pipe=True, queue_size=1000): six.reraise(*sys.exc_info()) def queue_reader(): - queue = multiprocessing.Queue(queue_size) + queue = fork_context.Queue(queue_size) for reader in readers: - p = multiprocessing.Process( + p = fork_context.Process( target=_read_into_queue, args=(reader, queue)) p.start() @@ -593,9 +640,9 @@ def multiprocess_reader(readers, use_pipe=True, queue_size=1000): def pipe_reader(): conns = [] for reader in readers: - parent_conn, child_conn = multiprocessing.Pipe() + parent_conn, child_conn = fork_context.Pipe() conns.append(parent_conn) - p = multiprocessing.Process( + p = fork_context.Process( target=_read_into_pipe, args=(reader, child_conn)) p.start() diff --git a/python/paddle/regularizer.py b/python/paddle/regularizer.py index 2b20bb41970f0b1bd829585cd3767c6c08421f1e..b3f483fd89197c9bd0a447b4272e958824331942 100644 --- a/python/paddle/regularizer.py +++ b/python/paddle/regularizer.py @@ -12,8 +12,134 @@ # See the License for the specific language governing permissions and # limitations under the License. -# TODO: define the regularizer functions -# __all__ = ['L1Decay', -# 'L1DecayRegularizer', -# 'L2Decay', -# 'L2DecayRegularizer'] +__all__ = ['L1Decay', 'L2Decay'] + +import paddle.fluid as fluid + + +class L1Decay(fluid.regularizer.L1Decay): + """ + Implement the L1 Weight Decay Regularization, which encourages the weights to be sparse. + + It can be set in :ref:`api_fluid_ParamAttr` or ``optimizer`` (such as :ref:`api_paddle_optimizer_Momentum` ). + When set in ``ParamAttr`` , it only takes effect for trainable parameters in this layer. When set in + ``optimizer`` , it takes effect for all trainable parameters. 
When set together, ``ParamAttr`` has + higher priority than ``optimizer`` , which means that for a trainable parameter, if regularizer is defined + in its ParamAttr, then the regularizer in Optimizer will be ignored. Otherwise the regularizer + in Optimizer will be used. + + In the implementation, the formula of L1 Weight Decay Regularization is as follows: + + .. math:: + + L1WeightDecay = reg\_coeff * sign(parameter) + + Args: + coeff(float, optional): regularization coeff. Default:0.0. + + Examples: + .. code-block:: python + + # Example1: set Regularizer in optimizer + import paddle + from paddle.regularizer import L1Decay + import numpy as np + paddle.disable_static() + inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + linear = paddle.nn.Linear(10, 10) + inp = paddle.to_tensor(inp) + out = linear(inp) + loss = paddle.mean(out) + beta1 = paddle.to_tensor([0.9], dtype="float32") + beta2 = paddle.to_tensor([0.99], dtype="float32") + momentum = paddle.optimizer.Momentum( + learning_rate=0.1, + parameters=linear.parameters(), + weight_decay=L1Decay(0.0001)) + back = out.backward() + momentum.step() + momentum.clear_grad() + + # Example2: set Regularizer in parameters + # Set L1 regularization in parameters. + # Global regularizer does not take effect on my_conv2d for this case. + from paddle.nn import Conv2d + from paddle import ParamAttr + from paddle.regularizer import L2Decay + + my_conv2d = Conv2d( + in_channels=10, + out_channels=10, + kernel_size=1, + stride=1, + padding=0, + weight_attr=ParamAttr(regularizer=L2Decay(coeff=0.01)), + bias_attr=False) + """ + + def __init__(self, coeff=0.0): + super(L1Decay, self).__init__(coeff) + + +class L2Decay(fluid.regularizer.L2Decay): + """ + Implement the L2 Weight Decay Regularization, which helps to prevent the model over-fitting. + + It can be set in :ref:`api_fluid_ParamAttr` or ``optimizer`` (such as :ref:`api_paddle_optimizer_Momentum` ). 
+ When set in ``ParamAttr`` , it only takes effect for trainable parameters in this layer. When set in + ``optimizer`` , it takes effect for all trainable parameters. When set together, ``ParamAttr`` has + higher priority than ``optimizer`` , which means that for a trainable parameter, if regularizer is defined + in its ParamAttr, then the regularizer in Optimizer will be ignored. Otherwise the regularizer + in Optimizer will be used. + + In the implementation, the formula of L2 Weight Decay Regularization is as follows: + + .. math:: + + L2WeightDecay = reg\_coeff * parameter + + Args: + regularization_coeff(float, optional): regularization coeff. Default:0.0 + + Examples: + .. code-block:: python + + # Example1: set Regularizer in optimizer + import paddle + from paddle.regularizer import L2Decay + import numpy as np + paddle.disable_static() + inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + linear = paddle.nn.Linear(10, 10) + inp = paddle.to_tensor(inp) + out = linear(inp) + loss = paddle.mean(out) + beta1 = paddle.to_tensor([0.9], dtype="float32") + beta2 = paddle.to_tensor([0.99], dtype="float32") + momentum = paddle.optimizer.Momentum( + learning_rate=0.1, + parameters=linear.parameters(), + weight_decay=L2Decay(0.0001)) + back = out.backward() + momentum.step() + momentum.clear_grad() + + # Example2: set Regularizer in parameters + # Set L2 regularization in parameters. + # Global regularizer does not take effect on my_conv2d for this case. 
+ from paddle.nn import Conv2d + from paddle import ParamAttr + from paddle.regularizer import L2Decay + + my_conv2d = Conv2d( + in_channels=10, + out_channels=10, + kernel_size=1, + stride=1, + padding=0, + weight_attr=ParamAttr(regularizer=L2Decay(coeff=0.01)), + bias_attr=False) + """ + + def __init__(self, coeff=0.0): + super(L2Decay, self).__init__(coeff) diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index 0fed32a1676759bd94961af0a8949d035ec48c8f..cec989fba8b0887499876f94bb862f72ba0e18d5 100755 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -40,6 +40,8 @@ from .creation import full_like #DEFINE_ALIAS from .creation import triu #DEFINE_ALIAS from .creation import tril #DEFINE_ALIAS from .creation import meshgrid #DEFINE_ALIAS +from .creation import empty #DEFINE_ALIAS +from .creation import empty_like #DEFINE_ALIAS from .io import save #DEFINE_ALIAS from .io import load #DEFINE_ALIAS from .linalg import matmul #DEFINE_ALIAS @@ -54,6 +56,7 @@ from .linalg import cholesky #DEFINE_ALIAS # from .linalg import tensordot #DEFINE_ALIAS from .linalg import bmm #DEFINE_ALIAS from .linalg import histogram #DEFINE_ALIAS +from .linalg import mv #DEFINE_ALIAS from .logic import equal #DEFINE_ALIAS from .logic import greater_equal #DEFINE_ALIAS from .logic import greater_than #DEFINE_ALIAS @@ -168,7 +171,6 @@ from .math import prod #DEFINE_ALIAS from .random import standard_normal from .random import normal from .random import uniform #DEFINE_ALIAS -from .random import shuffle #DEFINE_ALIAS from .random import randn #DEFINE_ALIAS from .random import rand #DEFINE_ALIAS from .random import randint #DEFINE_ALIAS diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index 9ef66712540aa54eac39b7e6160c5c91b6e3fcd5..9aee911e568d1b2cd7aac0cf45e44f2886612a5a 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -48,6 +48,8 @@ __all__ = [ 'eye', 'full', 
'full_like', + 'empty', + 'empty_like', 'triu', 'tril', 'meshgrid' @@ -62,8 +64,7 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): If the ``data`` is already a tensor, and ``dtype`` or ``place`` does't change, no copy will be performed and return origin tensor, otherwise a new tensor will be constructed - and returned. Similarly, if the data is an numpy\.ndarray of with the same ``dtype`` - and the current place is cpu, no copy will be performed. + and returned. The ``ComplexTensor`` is a unique type of paddle. If x is ``ComplexTensor``, then ``x.real`` is the real part, and ``x.imag`` is the imaginary part. @@ -73,8 +74,8 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): Can be a scalar, list, tuple, numpy\.ndarray, paddle\.Tensor, paddle\.ComplexTensor. dtype(str|np.dtype, optional): The desired data type of returned tensor. Can be 'bool' , 'float16' , 'float32' , 'float64' , 'int8' , 'int16' , 'int32' , 'int64' , 'uint8'. And - 'complex64' , 'complex128' only for ComplexTensor. Default: None, for float point number, - get type from ``get_default_type``, for other type, infers from ``data`` . + 'complex64' , 'complex128' only for ComplexTensor. Default: None, infers dtype from ``data`` + except for python float number which gets dtype from ``get_default_type`` . place(CPUPlace|CUDAPinnedPlace|CUDAPlace, optional): The place to allocate Tensor. Can be CPUPlace, CUDAPinnedPlace, CUDAPlace. Default: None, means global place. stop_gradient(bool, optional): Whether to block the gradient propagation of Autograd. Default: True. @@ -188,40 +189,40 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): raise TypeError( "Can't constructs a 'paddle.Tensor' with data type {}, data type must be scalar|list|tuple|numpy.ndarray|paddle.Tensor|paddle.ComplexTensor". 
format(type(data))) + if not dtype and data.dtype in [ + 'float16', 'float32', 'float64', 'complex64', 'complex128' + ]: + default_type = paddle.get_default_dtype() + if np.iscomplexobj(data): + default_type = 'complex64' if default_type in [ + 'float16', 'float32' + ] else 'complex128' + data = data.astype(default_type) + + if dtype and convert_dtype(dtype) != data.dtype: + data = data.astype(dtype) if not np.iscomplexobj(data): - if dtype: - dtype = convert_dtype(dtype) - elif data.dtype in ['float16', 'float32', 'float64']: - dtype = paddle.framework.get_default_dtype() - if dtype and dtype != data.dtype: + if dtype and convert_dtype(dtype) != data.dtype: data = data.astype(dtype) return paddle.Tensor( value=data, place=place, persistable=False, - zero_copy=True, + zero_copy=False, stop_gradient=stop_gradient) else: - if dtype: - dtype = convert_dtype(dtype) - else: - dtype = paddle.framework.get_default_dtype() - dtype = 'complex64' if dtype in ['float16', 'float32' - ] else 'complex128' - if dtype != data.dtype: - data = data.astype(dtype) name = unique_name.generate('generated_tensor') real_tensor = paddle.Tensor( value=data.real, place=place, - zero_copy=True, + zero_copy=False, name=name + ".real", stop_gradient=stop_gradient) imag_tensor = paddle.Tensor( value=data.imag, place=place, - zero_copy=True, + zero_copy=False, name=name + ".imag", stop_gradient=stop_gradient) return paddle.ComplexTensor(real_tensor, imag_tensor) @@ -244,10 +245,6 @@ def full_like(x, fill_value, dtype=None, name=None): Returns: Tensor: Tensor which is created according to ``x``, ``fill_value`` and ``dtype``. - Raises: - TypeError: The data type of ``x`` must be one of bool, float16, float32, float64, int32, int64. - TypeError: The ``dtype`` must be one of bool, float16, float32, float64, int32, int64 and None. - Examples: .. 
code-block:: python @@ -303,11 +300,6 @@ def ones(shape, dtype=None, name=None): Returns: Tensor: A tensor of data type :attr:`dtype` with shape :attr:`shape` and all elements set to 1. - Raises: - TypeError: The ``dtype`` must be one of bool, float16, float32, float64, int32, int64 and None. - TypeError: The ``shape`` must be one of list, tuple and Tensor. The data type of ``shape`` must - be int32 or int64 when it's a Tensor. - Examples: .. code-block:: python @@ -366,11 +358,10 @@ def ones_like(x, dtype=None, name=None): .. code-block:: python import paddle - import numpy as np paddle.disable_static() - x = paddle.to_tensor(np.array([1,2,3], dtype='float32')) + x = paddle.to_tensor([1,2,3]) out1 = paddle.zeros_like(x) # [1., 1., 1.] out2 = paddle.zeros_like(x, dtype='int32') # [1, 1, 1] @@ -392,11 +383,6 @@ def zeros(shape, dtype=None, name=None): Returns: Tensor: A tensor of data type :attr:`dtype` with shape :attr:`shape` and all elements set to 0. - Raises: - TypeError: The ``dtype`` must be one of bool, float16, float32, float64, int32, int64 and None. - TypeError: The ``shape`` must be one of list, tuple and Tensor. The data type of ``shape`` must - be int32 or int64 when it's a Tensor. - Examples: .. code-block:: python @@ -453,11 +439,10 @@ def zeros_like(x, dtype=None, name=None): .. code-block:: python import paddle - import numpy as np paddle.disable_static() - x = paddle.to_tensor(np.array([1,2,3], dtype='float32')) + x = paddle.to_tensor([1,2,3]) out1 = paddle.zeros_like(x) # [0., 0., 0.] out2 = paddle.zeros_like(x, dtype='int32') # [0, 0, 0] @@ -482,10 +467,6 @@ def eye(num_rows, num_columns=None, dtype=None, name=None): Returns: Tensor: An identity Tensor or LoDTensor of shape [num_rows, num_columns]. - - Raises: - TypeError: The ``dtype`` must be one of float16, float32, float64, int32 int64 and None. - TypeError: The ``num_columns`` must be non-negative int. Examples: .. 
code-block:: python @@ -534,11 +515,6 @@ def full(shape, fill_value, dtype=None, name=None): Returns: Tensor: Tensor which is created according to ``shape``, ``fill_value`` and ``dtype``. - Raises: - TypeError: The ``dtype`` must be one of None, bool, float16, float32, float64, int32 and int64. - TypeError: The ``shape`` must be one of Tensor, list and tuple. The data type of ``shape`` must - be int32 or int64 when the it's a Tensor - Examples: .. code-block:: python @@ -619,7 +595,6 @@ def arange(start=0, end=None, step=1, dtype=None, name=None): .. code-block:: python import paddle - import numpy as np paddle.disable_static() @@ -633,7 +608,7 @@ def arange(start=0, end=None, step=1, dtype=None, name=None): out3 = paddle.arange(4.999, dtype='float32') # [0., 1., 2., 3., 4.] - start_var = paddle.to_tensor(np.array([3])) + start_var = paddle.to_tensor([3]) out4 = paddle.arange(start_var, 7) # [3, 4, 5, 6] @@ -725,7 +700,7 @@ def tril(x, diagonal=0, name=None): paddle.disable_static() - x = paddle.to_variable(data) + x = paddle.to_tensor(data) tril1 = paddle.tensor.tril(x) # array([[ 1, 0, 0, 0], @@ -797,7 +772,7 @@ def triu(x, diagonal=0, name=None): paddle.disable_static() # example 1, default diagonal - x = paddle.to_variable(data) + x = paddle.to_tensor(data) triu1 = paddle.tensor.triu(x) # array([[ 1, 2, 3, 4], # [ 0, 6, 7, 8], @@ -1007,3 +982,157 @@ def diag(x, offset=0, padding_value=0, name=None): out.stop_gradient = True return out + + +def empty(shape, dtype=None, name=None): + """ + This Op returns a Tensor with uninitialized data which size is same as ``shape``. + + Args: + shape(list|tuple|Tensor): Shape of the Tensor to be created. + The data type of dimension of shape is ``int32`` or ``int64`` . If ``shape`` is a list or tuple, + the elements of it should be integers or Tensors with shape [1]. + If ``shape`` is an Tensor, it should be an 1-D Tensor. 
+ dtype(np.dtype|str, optional): Data type of the output Tensor + which can be bool, float16, float32, float64, int32, int64, if dytpe is `None`, the data + type of created Tensor use global default dtype (see ``get_default_dtype`` + for details). + name(str, optional): The default value is None. Normally there is no need for user to set this + property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor: Tensor which is created according to ``shape`` and ``dtype``, and is uninitialized. + + Examples: + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() # Now we are in imperative mode + paddle.set_device("cpu") # and use cpu device + + # example 1: argument ``shape`` is a list which doesn't contain Tensor. + data1 = paddle.empty(shape=[2,3], dtype='float32') + #[[4.3612203e+27 1.8176809e+31 1.3555911e-19] # uninitialized + # [1.1699684e-19 1.3563156e-19 3.6408321e-11]] # uninitialized + + # example 2: argument ``shape`` is a Tensor, the data type must be int64 or int32. + shape_data = np.array([2, 3]).astype('int32') + shape = paddle.to_tensor(shape_data) + data2 = paddle.empty(shape=shape, dtype='float32') + #[[1.7192326e-37 4.8125365e-38 1.9866003e-36] # uninitialized + # [1.3284029e-40 7.1117408e-37 2.5353012e+30]] # uninitialized + + # example 3: argument ``shape`` is a list which contains Tensor. 
+ dim2_data = np.array([3]).astype('int32') + dim2 = paddle.to_tensor(dim2_data) + data3 = paddle.empty(shape=[2, dim2], dtype='float32') + #[[1.1024214e+24 7.0379409e+22 6.5737699e-34] # uninitialized + # [7.5563101e+31 7.7130405e+31 2.8020654e+20]] # uninitialized + """ + + if dtype is None: + dtype = paddle.get_default_dtype() + + dtype = convert_dtype(dtype) + + if in_dygraph_mode(): + shape = utils.convert_shape_to_list(shape) + out = core.ops.empty('shape', shape, 'dtype', + convert_np_dtype_to_dtype_(dtype)) + out.stop_gradient = True + return out + + helper = LayerHelper("empty", **locals()) + inputs = {} + + check_dtype(dtype, 'dtype', + ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], + 'empty') + check_type(shape, 'shape', (Variable, list, tuple), 'empty') + + if isinstance(shape, Variable): + check_dtype(shape.dtype, 'shape', ['int32', 'int64'], 'empty') + + attrs = {} + utils.get_shape_tensor_inputs( + inputs=inputs, attrs=attrs, shape=shape, op_type='empty') + + out = helper.create_variable_for_type_inference(dtype=dtype) + attrs['dtype'] = convert_np_dtype_to_dtype_(dtype) + helper.append_op( + type='empty', + inputs=inputs, + outputs={'Out': [out]}, + attrs=attrs, + stop_gradient=True) + out.stop_gradient = True + return out + + +def empty_like(x, dtype=None, name=None): + """ + This Op returns a Tensor with uninitialized data which has identical shape of ``x`` and ``dtype``. + If the ``dtype`` is None, the data type of Tensor is same with ``x``. + + Args: + x(Tensor): The input tensor which specifies shape and data type. The data type can be bool, float16, float32, float64, int32, int64. + dtype(np.dtype|str, optional): The data type of output. The data type can be one + of bool, float16, float32, float64, int32, int64. The default value is None, which means the output + data type is the same as input. + name(str, optional): The default value is None. Normally there is no need for user to set this + property. 
For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor: Tensor which is created according to ``x`` and ``dtype``, and is uninitialized. + + Examples: + .. code-block:: python + + import paddle + import numpy as np + + paddle.disable_static() # Now we are in imperative mode + paddle.set_device("cpu") # and use cpu device + + x = paddle.randn([2, 3], 'float32') + output = paddle.empty_like(x) + #[[1.8491974e+20 1.8037303e+28 1.7443726e+28] # uninitialized + # [4.9640171e+28 3.0186127e+32 5.6715899e-11]] # uninitialized + """ + + if dtype is None: + dtype = x.dtype + dtype = convert_dtype(dtype) + + if in_dygraph_mode(): + out = core.ops.empty('shape', x.shape, 'dtype', + convert_np_dtype_to_dtype_(dtype)) + out.stop_gradient = True + return out + + helper = LayerHelper("empty_like", **locals()) + check_variable_and_dtype( + x, 'x', ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], + 'empty_like') + check_dtype(dtype, 'dtype', + ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], + 'empty_like') + out = helper.create_variable_for_type_inference(dtype=dtype) + + inputs = {} + attrs = {} + attrs['dtype'] = convert_np_dtype_to_dtype_(dtype) + shape = paddle.shape(x) + utils.get_shape_tensor_inputs( + inputs=inputs, attrs=attrs, shape=shape, op_type='empty_like') + + helper.append_op( + type='empty', + inputs=inputs, + outputs={'Out': [out]}, + attrs=attrs, + stop_gradient=True) + out.stop_gradient = True + return out diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index b5b528325cd9f52a8b61ef21df0095c41da5a8ed..f27cfba487d78f284408815eaba933b18f303df9 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -32,7 +32,8 @@ __all__ = [ 'cholesky', # 'tensordot', 'bmm', - 'histogram' + 'histogram', + 'mv' ] @@ -183,12 +184,13 @@ def norm(x, p='fro', axis=None, keepdim=False, name=None): x (Tensor): The input tensor could be N-D tensor, and the input data type could be 
float32 or float64. p (float|string, optional): Order of the norm. Supported values are `fro`, `0`, `1`, `2`, - `inf`,`-inf` and any positive real number yielding the corresponding p-norm. - Not supported: ord < 0, nuclear norm. + `inf`, `-inf` and any positive real number yielding the corresponding p-norm. Not supported: ord < 0 and nuclear norm. + Default value is `fro`. axis (int|list|tuple, optional): The axis on which to apply norm operation. If axis is int or list(int)/tuple(int) with only one element, the vector norm is computed over the axis. If `axis < 0`, the dimension to norm operation is rank(input) + axis. If axis is a list(int)/tuple(int) with two elements, the matrix norm is computed over the axis. + Defalut value is `None`. keepdim (bool, optional): Whether to reserve the reduced dimension in the output Tensor. The result tensor will have fewer dimension than the :attr:`input` unless :attr:`keepdim` is true, default @@ -197,13 +199,9 @@ def norm(x, p='fro', axis=None, keepdim=False, name=None): user to set this property. For more information, please refer to :ref:`api_guide_Name`. Returns: - Variable: Tensor, results of norm operation on the specified axis of input tensor, + Tensor: results of norm operation on the specified axis of input tensor, it's data type is the same as input's Tensor. - Raises: - TypeError, if out data type is different with the input data type. - ValueError, If `p` or `axis` is invalid. - Examples: .. code-block:: python @@ -256,15 +254,13 @@ def norm(x, p='fro', axis=None, keepdim=False, name=None): "The dim of frobenius norm op should be None or two elements list!" 
) if in_dygraph_mode(): - if dim is None: dim = [-1] - return core.ops.frobenius_norm(input, 'dim', dim, 'keepdim', - keepdim) - attrs = { - 'dim': dim if dim != None else [-2, -1], - 'keep_dim': keepdim, - 'reduce_all': False - } - if len(attrs['dim']) == len(input.shape): + if dim is None: + return core.ops.frobenius_norm(input, 'keep_dim', keepdim, + 'reduce_all', True) + return core.ops.frobenius_norm(input, 'dim', dim, 'keep_dim', + keepdim, 'reduce_all', False) + attrs = {'dim': dim, 'keep_dim': keepdim, 'reduce_all': False} + if dim is None: attrs['reduce_all'] = True check_variable_and_dtype(input, 'input', ['float32', 'float64'], 'frobenius_norm') @@ -351,42 +347,6 @@ def norm(x, p='fro', axis=None, keepdim=False, name=None): return reduce_out - def p0_matrix_norm(input, porder=0., axis=axis, keepdim=False, name=None): - block = LayerHelper('norm', **locals()) - out = block.create_variable_for_type_inference( - dtype=block.input_dtype()) - - cast_out = block.create_variable_for_type_inference(dtype=bool) - block.append_op( - type='cast', - inputs={'X': input}, - outputs={'Out': cast_out}, - attrs={ - 'in_dtype': input.dtype, - 'out_dtype': int(core.VarDesc.VarType.BOOL) - }) - cast_out2 = block.create_variable_for_type_inference(dtype=bool) - block.append_op( - type='cast', - inputs={'X': cast_out}, - outputs={'Out': cast_out2}, - attrs={ - 'in_dtype': cast_out.dtype, - 'out_dtype': int(core.VarDesc.VarType.FP32) - }) - sum_out = block.create_variable_for_type_inference( - dtype=block.input_dtype()) - block.append_op( - type='reduce_sum', - inputs={'X': cast_out2}, - outputs={'Out': sum_out}, - attrs={ - 'dim': axis, - 'keep_dim': keepdim, - 'reduce_all': True if axis is None else False - }) - return sum_out - def p_matrix_norm(input, porder=1., axis=axis, keepdim=False, name=None): block = LayerHelper('norm', **locals()) out = block.create_variable_for_type_inference( @@ -448,7 +408,20 @@ def norm(x, p='fro', axis=None, keepdim=False, name=None): 
#calculate vector norm, where axis is int or list with only one integer if isinstance(axis, int): - if isinstance(p, (int, float)): + if isinstance(p, str): + if p == "fro": + return vector_norm( + x, + porder=2, + axis=axis, + keepdim=keepdim, + asvector=False, + name=name) + + else: + raise ValueError( + "only valid string values are 'fro', found {}".format(p)) + elif isinstance(p, (int, float)): return vector_norm( x, axis=axis, @@ -464,10 +437,12 @@ def norm(x, p='fro', axis=None, keepdim=False, name=None): elif isinstance(axis, list) and len(axis) == 2: if p == "fro": return frobenius_norm(x, dim=axis, keepdim=keepdim, name=name) - elif p == 0: - return p0_matrix_norm(x, axis=axis, keepdim=keepdim, name=name) elif p == np.inf or p == -np.inf: return inf_norm(x, porder=p, axis=axis, keepdim=keepdim, name=name) + elif p == 0: + raise ValueError( + "just suport axis type int or list (length of list <=1) if p = 0, found {}". + format(axis)) else: return p_matrix_norm( x, porder=p, axis=axis, keepdim=keepdim, name=name) @@ -810,7 +785,7 @@ def cholesky(x, upper=False, name=None): a = np.random.rand(3, 3) a_t = np.transpose(a, [1, 0]) x_data = np.matmul(a, a_t) + 1e-03 - x = paddle.to_variable(x_data) + x = paddle.to_tensor(x_data) out = paddle.cholesky(x, upper=False) print(out.numpy()) # [[1.190523 0. 0. 
] @@ -855,15 +830,16 @@ def bmm(x, y, name=None): Examples: import paddle - # In imperative mode: - # size input1: (2, 2, 3) and input2: (2, 3, 2) - input1 = np.array([[[1.0, 1.0, 1.0],[2.0, 2.0, 2.0]],[[3.0, 3.0, 3.0],[4.0, 4.0, 4.0]]]) - input2 = np.array([[[1.0, 1.0],[2.0, 2.0],[3.0, 3.0]],[[4.0, 4.0],[5.0, 5.0],[6.0, 6.0]]]) - paddle.disable_static() - - x = paddle.to_variable(input1) - y = paddle.to_variable(input2) + + # In imperative mode: + # size x: (2, 2, 3) and y: (2, 3, 2) + x = paddle.to_tensor([[[1.0, 1.0, 1.0], + [2.0, 2.0, 2.0]], + [[3.0, 3.0, 3.0], + [4.0, 4.0, 4.0]]]) + y = paddle.to_tensor([[[1.0, 1.0],[2.0, 2.0],[3.0, 3.0]], + [[4.0, 4.0],[5.0, 5.0],[6.0, 6.0]]]) out = paddle.bmm(x, y) #output size: (2, 2, 2) #output value: @@ -924,10 +900,8 @@ def histogram(input, bins=100, min=0, max=0): Code Example 2: .. code-block:: python import paddle - import numpy as np paddle.disable_static(paddle.CPUPlace()) - inputs_np = np.array([1, 2, 1]).astype(np.float) - inputs = paddle.to_variable(inputs_np) + inputs = paddle.to_tensor([1, 2, 1]) result = paddle.histogram(inputs, bins=4, min=0, max=3) print(result) # [0, 2, 1, 0] paddle.enable_static() @@ -947,3 +921,64 @@ def histogram(input, bins=100, min=0, max=0): 'min': min, 'max': max}) return out + + +def mv(x, vec, name=None): + """ + Performs a matrix-vector product of the matrix x and the vector vec. + + Args: + x (Variable): A tensor with shape :math:`[M, N]` , The data type of the input Tensor x + should be one of float32, float64. + vec (Variable): A tensor with shape :math:`[N]` , The data type of the input Tensor x + should be one of float32, float64. + name(str, optional): The default value is None. Normally there is no need for user to set this + property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor: The tensor which is producted by x and vec. + + Examples: + .. 
code-block:: python + + # x: [M, N], vec: [N] + # paddle.mv(x, vec) # out: [M] + + import numpy as np + import paddle + + paddle.disable_static() + x_data = np.array([[2, 1, 3], [3, 0, 1]]).astype("float64") + x = paddle.to_tensor(x_data) + vec_data = np.array([3, 5, 1]) + vec = paddle.to_tensor(vec_data).astype("float64") + out = paddle.mv(x, vec) + paddle.enable_static() + """ + if in_dygraph_mode(): + out = core.ops.mv(x, vec) + return out + + def __check_input(x, vec): + var_names = {'x': x, 'vec': vec} + for name, val in var_names.items(): + check_variable_and_dtype(val, name, ['float32', 'float64'], 'mv') + x_shape = list(x.shape) + vec_shape = list(vec.shape) + if len(x_shape) != 2: + raise ValueError( + "x should be 2-dimensional. But received x's dimention: {}". + format(x_shape)) + if len(vec_shape) != 1: + raise ValueError( + "vec should be 1-dimensional. But received vec's dimention: {}". + format(vec_shape)) + + __check_input(x, vec) + + helper = LayerHelper('mv', **locals()) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op( + type='mv', inputs={'X': x, + 'Vec': vec}, outputs={'Out': out}) + return out diff --git a/python/paddle/tensor/logic.py b/python/paddle/tensor/logic.py index 36b558d597c1ce1333a8f1eec54e2fd2813625e3..5fd714421c8ed14820738543a1824c779296d7c3 100644 --- a/python/paddle/tensor/logic.py +++ b/python/paddle/tensor/logic.py @@ -71,13 +71,12 @@ def equal_all(x, y, name=None): Examples: .. 
code-block:: python - import numpy as np import paddle paddle.disable_static() - x = paddle.to_variable(np.array([1, 2, 3])) - y = paddle.to_variable(np.array([1, 2, 3])) - z = paddle.to_variable(np.array([1, 4, 3])) + x = paddle.to_tensor([1, 2, 3]) + y = paddle.to_tensor([1, 2, 3]) + z = paddle.to_tensor([1, 4, 3]) result1 = paddle.equal_all(x, y) print(result1.numpy()) # result1 = [True ] result2 = paddle.equal_all(x, z) @@ -120,14 +119,11 @@ def allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None): .. code-block:: python import paddle - import numpy as np paddle.disable_static() - np_x = np.array([10000., 1e-07]).astype("float32") - np_y = np.array([10000.1, 1e-08]).astype("float32") - x = paddle.to_tensor(np_x) - y = paddle.to_tensor(np_y) + x = paddle.to_tensor([10000., 1e-07]) + y = paddle.to_tensor([10000.1, 1e-08]) result1 = paddle.allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name="ignore_nan") np_result1 = result1.numpy() @@ -137,10 +133,8 @@ def allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None): np_result2 = result2.numpy() # [False] - np_x = np.array([1.0, float('nan')]).astype("float32") - np_y = np.array([1.0, float('nan')]).astype("float32") - x = paddle.to_tensor(np_x) - y = paddle.to_tensor(np_y) + x = paddle.to_tensor([1.0, float('nan')]) + y = paddle.to_tensor([1.0, float('nan')]) result1 = paddle.allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name="ignore_nan") np_result1 = result1.numpy() @@ -195,12 +189,11 @@ def equal(x, y, name=None): Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x = paddle.to_variable(np.array([1, 2, 3])) - y = paddle.to_variable(np.array([1, 3, 2])) + x = paddle.to_tensor([1, 2, 3]) + y = paddle.to_tensor([1, 3, 2]) result1 = paddle.equal(x, y) print(result1.numpy()) # result1 = [True False False] """ @@ -227,12 +220,11 @@ def greater_equal(x, y, name=None): Examples: .. 
code-block:: python - import numpy as np import paddle paddle.disable_static() - x = paddle.to_variable(np.array([1, 2, 3])) - y = paddle.to_variable(np.array([1, 3, 2])) + x = paddle.to_tensor([1, 2, 3]) + y = paddle.to_tensor([1, 3, 2]) result1 = paddle.greater_equal(x, y) print(result1.numpy()) # result1 = [True False True] """ @@ -259,12 +251,11 @@ def greater_than(x, y, name=None): Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x = paddle.to_variable(np.array([1, 2, 3])) - y = paddle.to_variable(np.array([1, 3, 2])) + x = paddle.to_tensor([1, 2, 3]) + y = paddle.to_tensor([1, 3, 2]) result1 = paddle.greater_than(x, y) print(result1.numpy()) # result1 = [False False True] """ @@ -292,12 +283,11 @@ def less_equal(x, y, name=None): Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x = paddle.to_variable(np.array([1, 2, 3])) - y = paddle.to_variable(np.array([1, 3, 2])) + x = paddle.to_tensor([1, 2, 3]) + y = paddle.to_tensor([1, 3, 2]) result1 = paddle.less_equal(x, y) print(result1.numpy()) # result1 = [True True False] """ @@ -325,12 +315,11 @@ def less_than(x, y, name=None): Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - x = paddle.to_variable(np.array([1, 2, 3])) - y = paddle.to_variable(np.array([1, 3, 2])) + x = paddle.to_tensor([1, 2, 3]) + y = paddle.to_tensor([1, 3, 2]) result1 = paddle.less_than(x, y) print(result1.numpy()) # result1 = [False True False] """ @@ -358,12 +347,12 @@ def not_equal(x, y, name=None): Examples: .. 
code-block:: python - import numpy as np + import paddle paddle.disable_static() - x = paddle.to_variable(np.array([1, 2, 3])) - y = paddle.to_variable(np.array([1, 3, 2])) + x = paddle.to_tensor([1, 2, 3]) + y = paddle.to_tensor([1, 3, 2]) result1 = paddle.not_equal(x, y) print(result1.numpy()) # result1 = [False True True] """ diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 5a01fff88c16bfa584479d71ea93d78999de40df..9de407841fb461713d00f997afdf33a38a531245 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -21,6 +21,7 @@ from ..fluid.data_feeder import convert_dtype, check_variable_and_dtype, check_t from ..fluid.layers.tensor import fill_constant from ..fluid.layers import utils import numpy as np +import six # TODO: define functions to manipulate a tensor from ..fluid.layers import cast #DEFINE_ALIAS from ..fluid.layers import slice #DEFINE_ALIAS @@ -85,11 +86,6 @@ def concat(x, axis=0, name=None): name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. - Raises: - TypeError: ``x`` must be list or tuple. - TypeError: The data type of ``x`` must be one of bool, float16, float32, float64, int32 and int64. - TypeError: The ``axis`` must be int or Tensor. The dtype of ``axis`` must be int32 or int64 when it's a Tensor. - TypeError: All the Tensors in ``x`` must have the same data type. Returns: Tensor: A Tensor with the same data type as ``x``. @@ -98,18 +94,14 @@ def concat(x, axis=0, name=None): .. 
code-block:: python import paddle - import numpy as np paddle.disable_static() # Now we are in imperative mode - in1 = np.array([[1, 2, 3], - [4, 5, 6]]) - in2 = np.array([[11, 12, 13], - [14, 15, 16]]) - in3 = np.array([[21, 22], - [23, 24]]) - x1 = paddle.to_tensor(in1) - x2 = paddle.to_tensor(in2) - x3 = paddle.to_tensor(in3) + x1 = paddle.to_tensor([[1, 2, 3], + [4, 5, 6]]) + x2 = paddle.to_tensor([[11, 12, 13], + [14, 15, 16]]) + x3 = paddle.to_tensor([[21, 22], + [23, 24]]) zero = paddle.full(shape=[1], dtype='int32', fill_value=0) # When the axis is negative, the real axis is (axis + Rank(x)) # As follow, axis is -1, Rank(x) is 2, the real axis is 1 @@ -158,7 +150,7 @@ def flip(x, axis, name=None): image_shape=(3, 2, 2) x = np.arange(image_shape[0] * image_shape[1] * image_shape[2]).reshape(image_shape) x = x.astype('float32') - img = paddle.to_variable(x) + img = paddle.to_tensor(x) out = paddle.flip(img, [0,1]) print(out) # [[[10,11][8, 9]],[[6, 7],[4, 5]] [[2, 3],[0, 1]]] @@ -250,7 +242,7 @@ def flatten(x, start_axis=0, stop_axis=-1, name=None): x = np.arange(image_shape[0] * image_shape[1] * image_shape[2] * image_shape[3]).reshape(image_shape) / 100. x = x.astype('float32') - img = paddle.to_variable(x) + img = paddle.to_tensor(x) out = paddle.flatten(img, start_axis=1, stop_axis=2) # out shape is [2, 12, 4] """ @@ -315,15 +307,13 @@ def roll(x, shifts, axis=None, name=None): Examples: .. code-block:: python - import numpy as np import paddle import paddle.fluid as fluid - data = np.array([[1.0, 2.0, 3.0], - [4.0, 5.0, 6.0], - [7.0, 8.0, 9.0]]) paddle.disable_static() - x = paddle.to_variable(data) + x = paddle.to_tensor([[1.0, 2.0, 3.0], + [4.0, 5.0, 6.0], + [7.0, 8.0, 9.0]]) out_z1 = paddle.roll(x, shifts=1) print(out_z1.numpy()) #[[9. 1. 2.] @@ -433,8 +423,7 @@ def stack(x, axis=0, name=None): [5.0, 6.0] ] ] Args: - x (Tensor|list[Tensor]|tuple[Tensor]): Input ``x`` can be a single tensor, or a ``list`` or ``tuple`` of tensors. 
- If ``x`` is a ``list`` or ``tuple`` , the Tensors in ``x`` + x (list[Tensor]|tuple[Tensor]): Input ``x`` can be a ``list`` or ``tuple`` of tensors, the Tensors in ``x`` must be of the same shape and dtype. Supported data types: float32, float64, int32, int64. axis (int, optional): The axis along which all inputs are stacked. ``axis`` range is ``[-(R+1), R+1)``, where ``R`` is the number of dimensions of the first input tensor ``x[0]``. @@ -448,17 +437,11 @@ def stack(x, axis=0, name=None): .. code-block:: python import paddle - import numpy as np - - data1 = np.array([[1.0, 2.0]]) - data2 = np.array([[3.0, 4.0]]) - data3 = np.array([[5.0, 6.0]]) - + paddle.disable_static() - x1 = paddle.to_variable(data1) - x2 = paddle.to_variable(data2) - x3 = paddle.to_variable(data3) - + x1 = paddle.to_tensor([[1.0, 2.0]]) + x2 = paddle.to_tensor([[3.0, 4.0]]) + x3 = paddle.to_tensor([[5.0, 6.0]]) out = paddle.stack([x1, x2, x3], axis=0) print(out.shape) # [3, 1, 2] print(out.numpy()) @@ -487,10 +470,7 @@ def split(x, num_or_sections, axis=0, name=None): For more information, please refer to :ref:`api_guide_Name` . Returns: list(Tensor): The list of segmented Tensors. - Raises: - TypeError: The data type of ``x`` must be one of bool, float16, float32, float64, int32, int64. - TypeError: ``num_or_sections`` is not int, list or tuple. - TypeError: ``axis`` is not int or Tensor. the data type of ``axis`` must be int32 or int64 when it's a Tensor. + Example: .. code-block:: python @@ -638,12 +618,10 @@ def unique(x, Examples: .. 
code-block:: python - import numpy as np import paddle paddle.disable_static() - x_data = np.array([2, 3, 3, 1, 5, 3]) - x = paddle.to_tensor(x_data) + x = paddle.to_tensor([2, 3, 3, 1, 5, 3]) unique = paddle.unique(x) np_unique = unique.numpy() # [1 2 3 5] _, indices, inverse, counts = paddle.unique(x, return_index=True, return_inverse=True, return_counts=True) @@ -651,8 +629,7 @@ def unique(x, np_inverse = inverse.numpy() # [1 2 2 0 3 2] np_counts = counts.numpy() # [1 1 3 1] - x_data = np.array([[2, 1, 3], [3, 0, 1], [2, 1, 3]]) - x = paddle.to_tensor(x_data) + x = paddle.to_tensor([[2, 1, 3], [3, 0, 1], [2, 1, 3]]) unique = paddle.unique(x) np_unique = unique.numpy() # [0 1 2 3] @@ -770,8 +747,6 @@ def unsqueeze(x, axis, name=None): print(out3.shape) # [1, 1, 1, 5, 10] """ - if isinstance(axis, int): - axis = [axis] return layers.unsqueeze(x, axis, name) @@ -812,23 +787,15 @@ def gather(x, index, axis=None, name=None): Returns: output (Tensor): The output is a tensor with the same rank as ``x``. - Raises: - TypeError: ``x`` must be a Tensor and the data type of ``x`` must to be one of float16, float32, float64, int32, int64, uint8. - TypeError: ``index`` must be a Tensor and the data type of ``index`` must be int32 or int64. - TypeError: ``axis`` must be a Tensor or int and the data type of ``index`` must be int32 or int64 when it's a Tensor. - Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - input_1 = np.array([[1,2],[3,4],[5,6]]) - index_1 = np.array([0,1]) - input = paddle.to_tensor(input_1) - index = paddle.to_tensor(index_1) + input = paddle.to_tensor([[1,2],[3,4],[5,6]]) + index = paddle.to_tensor([0,1]) output = paddle.gather(input, index, axis=0) # expected output: [[1,2],[3,4]] """ @@ -964,16 +931,11 @@ def scatter(x, index, updates, overwrite=True, name=None): .. 
code-block:: python import paddle - import numpy as np paddle.disable_static() - x_data = np.array([[1, 1], [2, 2], [3, 3]]).astype(np.float32) - index_data = np.array([2, 1, 0, 1]).astype(np.int64) - updates_data = np.array([[1, 1], [2, 2], [3, 3], [4, 4]]).astype(np.float32) - - x = paddle.to_tensor(x_data) - index = paddle.to_tensor(index_data) - updates = paddle.to_tensor(updates_data) + x = paddle.to_tensor([[1, 1], [2, 2], [3, 3]], dtype='float32') + index = paddle.to_tensor([2, 1, 0, 1], dtype='int64') + updates = paddle.to_tensor([[1, 1], [2, 2], [3, 3], [4, 4]], dtype='float32') output1 = paddle.scatter(x, index, updates, overwrite=False) # [[3., 3.], @@ -1026,10 +988,7 @@ def chunk(x, chunks, axis=0, name=None): For more information, please refer to :ref:`api_guide_Name` . Returns: list(Tensor): The list of segmented Tensors. - Raises: - TypeError: The data type of ``x`` must be one of bool, float16, float32, float64, int32, int64. - TypeError: ``chunks`` is not int. - TypeError: ``axis`` is not int or Tensor. the data type of ``axis`` must be int32 or int64 when it's a Tensor. + Example: .. code-block:: python @@ -1041,7 +1000,7 @@ def chunk(x, chunks, axis=0, name=None): x_np = np.random.random([3, 9, 5]).astype("int32") x = paddle.to_tensor(x_np) - out0, out1, out22 = paddle.chunk(x, chunks=3, axis=1) + out0, out1, out2 = paddle.chunk(x, chunks=3, axis=1) # out0.shape [3, 3, 5] # out1.shape [3, 3, 5] # out2.shape [3, 3, 5] @@ -1080,11 +1039,9 @@ def tile(x, repeat_times, name=None): .. 
code-block:: python import paddle - import numpy as np paddle.disable_static() - np_data = np.array([1, 2, 3]).astype('int32') - data = paddle.to_tensor(np_data) + data = paddle.to_tensor([1, 2, 3], dtype='int32') out = paddle.tile(data, repeat_times=[2, 1]) np_out = out.numpy() # [[1, 2, 3], [1, 2, 3]] @@ -1093,18 +1050,32 @@ def tile(x, repeat_times, name=None): np_out = out.numpy() # [[1, 2, 3, 1, 2, 3], [1, 2, 3, 1, 2, 3]] - np_repeat_times = np.array([2, 1]).astype("int32") - repeat_times = paddle.to_tensor(np_repeat_times) + repeat_times = paddle.to_tensor([2, 1], dtype='int32') out = paddle.tile(data, repeat_times=repeat_times) np_out = out.numpy() # [[1, 2, 3], [1, 2, 3]] """ if in_dygraph_mode(): return core.ops.tile(x, 'repeat_times', repeat_times) + check_type(repeat_times, 'repeat_times', (list, tuple, Variable), 'tile') + if isinstance(repeat_times, Variable): + assert len(repeat_times.shape) == 1, ( + 'repeat_times must be an 1-D Tensor.') + else: + for elem in repeat_times: + if isinstance(elem, Variable): + assert len(elem.shape) == 1, ( + 'Elements in repeat_times must be 1-D Tensors or integers.') + else: + if six.PY3: + type_tuple = (int, np.int32, np.int64) + elif six.PY2: + type_tuple = (int, long, np.int32, np.int64) + assert isinstance(elem, type_tuple), ( + 'Elements in repeat_times must be 1-D Tensors or integers.') check_variable_and_dtype( x, 'x', ['bool', 'float32', 'float64', 'int32', 'int64'], 'tile') - check_type(repeat_times, 'repeat_times', (list, tuple, Variable), 'tile') if convert_dtype(x.dtype) == 'bool' and x.stop_gradient == False: raise ValueError( "When the date type is bool for the input 'x' of tile op, you " @@ -1162,15 +1133,12 @@ def expand_as(x, y, name=None): Examples: .. 
code-block:: python - import numpy as np import paddle paddle.disable_static() - np_data_x = np.array([1, 2, 3]).astype('int32') - np_data_y = np.array([[1, 2, 3], [4, 5, 6]]).astype('int32') - data_x = paddle.to_tensor(np_data_x) - data_y = paddle.to_tensor(np_data_y) + data_x = paddle.to_tensor([1, 2, 3], 'int32') + data_y = paddle.to_tensor([[1, 2, 3], [4, 5, 6]], 'int32') out = paddle.expand_as(data_x, data_y) np_out = out.numpy() # [[1, 2, 3], [1, 2, 3]] @@ -1218,12 +1186,10 @@ def expand(x, shape, name=None): Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - np_data = np.array([1, 2, 3]).astype('int32') - data = paddle.to_tensor(np_data) + data = paddle.to_tensor([1, 2, 3], dtype='int32') out = paddle.expand(data, shape=[2, 3]) out = out.numpy() # [[1, 2, 3], [1, 2, 3]] @@ -1231,18 +1197,33 @@ def expand(x, shape, name=None): if in_dygraph_mode(): return core.ops.expand_v2(x, 'shape', shape) + if isinstance(shape, Variable): + assert len(shape.shape) == 1, ('shape must be an 1-D Tensor.') + else: + for elem in shape: + if isinstance(elem, Variable): + assert len(elem.shape) == 1, ( + 'Elements in shape must be 1-D Tensors or integers.') + else: + if six.PY3: + type_tuple = (int, np.int32, np.int64) + elif six.PY2: + type_tuple = (int, long, np.int32, np.int64) + assert isinstance(elem, type_tuple), ( + 'Elements in shape must be 1-D Tensors or integers.') + check_variable_and_dtype( x, 'x', ['bool', 'float32', 'float64', 'int32', 'int64'], 'expand') check_type(shape, 'shape', (list, tuple, Variable), 'expand') - - inputs = {"X": [x]} - attrs = {} if convert_dtype(x.dtype) == 'bool' and x.stop_gradient == False: raise ValueError("When the data type of input 'x' for expand is bool, " "you must set its stop_gradient to be False by " "some_var.stop_gradient = True, supporting " "some_var as the input.") + inputs = {"X": [x]} + attrs = {} + helper = LayerHelper('expand', **locals()) def 
get_attr_expand_shape(list_expand_shape): @@ -1322,11 +1303,6 @@ def reshape(x, shape, name=None): Returns: Tensor: A reshaped Tensor with the same data type as ``x``. - Raises: - ValueError: If more than one elements of ``shape`` is -1. - ValueError: If the element of ``shape`` is 0, the corresponding dimension should be less than or equal to the dimension of ``x``. - ValueError: If the elements in ``shape`` is negative except -1. - Examples: .. code-block:: python @@ -1413,23 +1389,16 @@ def gather_nd(x, index, name=None): Returns: output (Tensor): A tensor with the shape index.shape[:-1] + input.shape[index.shape[-1]:] - Raises: - TypeError: ``x`` must be a Tensor and the data type of ``x`` must be one of float32, float64, int32 and int64. - TypeError: ``index`` must be a Tensor and the data type of ``index`` must be one of int32 and int64. - Examples: .. code-block:: python import paddle - import numpy as np paddle.disable_static() - np_x = np.array([[[1, 2], [3, 4], [5, 6]], - [[7, 8], [9, 10], [11, 12]]]) - np_index = [[0, 1]] - x = paddle.to_tensor(np_x) - index = paddle.to_tensor(np_index) + x = paddle.to_tensor([[[1, 2], [3, 4], [5, 6]], + [[7, 8], [9, 10], [11, 12]]]) + index = paddle.to_tensor([[0, 1]]) output = paddle.gather_nd(x, index) #[[3, 4]] diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index d2db2a7cb71945e137e46d6793f8cba1f7adf12f..966544c7abb54ae7de163aa322890a55ee94d3d8 100755 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -64,7 +64,6 @@ from ..fluid.layers import increment #DEFINE_ALIAS from ..fluid.layers import multiplex #DEFINE_ALIAS from ..fluid.layers import sums #DEFINE_ALIAS from ..fluid import layers -import paddle __all__ = [ @@ -174,14 +173,12 @@ def pow(x, y, name=None): .. 
code-block:: python import paddle - import numpy as np paddle.disable_static() # example 1: y is a float - x_data = np.array([1, 2, 3]) + x = paddle.to_tensor([1, 2, 3]) y = 2 - x = paddle.to_tensor(x_data) res = paddle.pow(x, y) print(res.numpy()) # [1 4 9] @@ -291,13 +288,10 @@ Examples: .. code-block:: python import paddle - import numpy as np paddle.disable_static() - np_x = np.array([2, 3, 4]).astype('float64') - np_y = np.array([1, 5, 2]).astype('float64') - x = paddle.to_variable(np_x) - y = paddle.to_variable(np_y) + x = paddle.to_tensor([2, 3, 4], 'float64') + y = paddle.to_tensor([1, 5, 2], 'float64') z = paddle.add(x, y) np_z = z.numpy() print(np_z) # [3., 8., 6. ] @@ -335,14 +329,11 @@ def divide(x, y, name=None): .. code-block:: python import paddle - import numpy as np paddle.disable_static() - np_x = np.array([2, 3, 4]).astype('float64') - np_y = np.array([1, 5, 2]).astype('float64') - x = paddle.to_tensor(np_x) - y = paddle.to_tensor(np_y) + x = paddle.to_tensor([2, 3, 4], dtype='float64') + y = paddle.to_tensor([1, 5, 2], dtype='float64') z = paddle.divide(x, y) print(z.numpy()) # [2., 0.6, 2.] @@ -351,69 +342,9 @@ def divide(x, y, name=None): axis = -1 act = None if in_dygraph_mode(): - # rule 1 : avoid numpy.ndarray - if isinstance(x, numpy.ndarray) or isinstance(y, numpy.ndarray): - raise TypeError("divide(): arguments must be Tensor or scalar, not numpy.ndarray.") - - # rule 2: both the inputs are not Tensor - elif not isinstance(x, paddle.Tensor) and not isinstance(y, paddle.Tensor): - x = paddle.full(shape=[1], dtype=paddle.get_default_dtype(), fill_value=x) - y = paddle.full(shape=[1], dtype=paddle.get_default_dtype(), fill_value=y) - - # rule 3: both the inputs are Tensor - elif isinstance(x, paddle.Tensor) and isinstance(y, paddle.Tensor): - if y.dtype != x.dtype: - raise TypeError("divide(): argument position 1 and argument position 2 must have the same dtype." 
- "But x is {}, y is {}".format(x.dtype, y.dtype)) - elif x.dtype in _supported_int_dtype_: - x = x.astype(paddle.get_default_dtype()) - y = y.astype(paddle.get_default_dtype()) - - # rule 4: x is Tensor, y is scalar - elif isinstance(x, paddle.Tensor) and not isinstance(y, paddle.Tensor): - if x.dtype in _supported_int_dtype_: - x = x.astype(paddle.get_default_dtype()) - y = paddle.full(shape=[1], dtype=x.dtype, fill_value=y) - - # rule 5: x is scalar, y is Tensor - elif not isinstance(x, paddle.Tensor) and isinstance(y, paddle.Tensor): - if y.dtype in _supported_int_dtype_: - y = y.astype(paddle.get_default_dtype()) - x = paddle.full(shape=[1], dtype=y.dtype, fill_value=x) - return _elementwise_op_in_dygraph( x, y, axis=axis, act=act, op_name=op_type) - # rule 1 : avoid numpy.ndarray - if isinstance(x, numpy.ndarray) or isinstance(y, numpy.ndarray): - raise TypeError("divide(): arguments must be Tensor or scalar, not numpy.ndarray.") - - # rule 2: both the inputs are not Tensor - elif not isinstance(x, Variable) and not isinstance(y, Variable): - x = paddle.fill_constant(shape=[1], dtype=paddle.get_default_dtype(), value=x) - y = paddle.fill_constant(shape=[1], dtype=paddle.get_default_dtype(), value=y) - - # rule 3: both the inputs are Tensor - elif isinstance(x, Variable) and isinstance(y, Variable): - if y.dtype != x.dtype: - raise TypeError("divide(): argument position 1 and argument position 2 must have the same dtype." 
- "But x is {}, y is {}".format(x.dtype, y.dtype)) - elif x.dtype in _supported_int_dtype_: - x = paddle.cast(x, paddle.get_default_dtype()) - y = paddle.cast(y, paddle.get_default_dtype()) - - # rule 4: x is Tensor, y is scalar - elif isinstance(x, Variable) and not isinstance(y, Variable): - if x.dtype in _supported_int_dtype_: - x = paddle.cast(x, paddle.get_default_dtype()) - y = paddle.fill_constant(shape=[1], dtype=x.dtype, value=y) - - # rule 5: x is scalar, y is Tensor - elif not isinstance(x, Variable) and isinstance(y, Variable): - if y.dtype in _supported_int_dtype_: - y = paddle.cast(y, paddle.get_default_dtype()) - x = paddle.fill_constant(shape=[1], dtype=y.dtype, value=x) - return _elementwise_op(LayerHelper(op_type, **locals())) @@ -440,14 +371,11 @@ def floor_divide(x, y, name=None): .. code-block:: python import paddle - import numpy as np paddle.disable_static() - np_x = np.array([2, 3, 8, 7]) - np_y = np.array([1, 5, 3, 3]) - x = paddle.to_tensor(np_x) - y = paddle.to_tensor(np_y) + x = paddle.to_tensor([2, 3, 8, 7]) + y = paddle.to_tensor([1, 5, 3, 3]) z = paddle.floor_divide(x, y) print(z.numpy()) # [2, 0, 2, 2] @@ -455,55 +383,9 @@ def floor_divide(x, y, name=None): op_type = 'elementwise_floordiv' axis = -1 if in_dygraph_mode(): - # rule 1 : avoid numpy.ndarray - if isinstance(x, numpy.ndarray) or isinstance(y, numpy.ndarray): - raise TypeError("floor_divide(): arguments must be Tensor or scalar, not numpy.ndarray.") - - # rule 2: both the inputs are not Tensor - elif not isinstance(x, paddle.Tensor) and not isinstance(y, paddle.Tensor): - x = paddle.full(shape=[1], dtype=paddle.get_default_dtype(), fill_value=x) - y = paddle.full(shape=[1], dtype=paddle.get_default_dtype(), fill_value=y) - - # rule 3: both the inputs are Tensor - elif isinstance(x, paddle.Tensor) and isinstance(y, paddle.Tensor): - if y.dtype != x.dtype: - raise TypeError("floor_divide(): argument position 1 and argument position 2 must have the same dtype." 
- "But x is {}, y is {}".format(x.dtype, y.dtype)) - - # rule 4: x is Tensor, y is scalar - elif isinstance(x, paddle.Tensor) and not isinstance(y, paddle.Tensor): - y = paddle.full(shape=[1], dtype=x.dtype, fill_value=y) - - # rule 5: x is scalar, y is Tensor - elif not isinstance(x, paddle.Tensor) and isinstance(y, paddle.Tensor): - x = paddle.full(shape=[1], dtype=y.dtype, fill_value=x) - return _elementwise_op_in_dygraph( x, y, axis=axis, op_name=op_type) - # rule 1 : avoid numpy.ndarray - if isinstance(x, numpy.ndarray) or isinstance(y, numpy.ndarray): - raise TypeError("divide(): arguments must be Tensor or scalar, not numpy.ndarray.") - - # rule 2: both the inputs are not Tensor - elif not isinstance(x, Variable) and not isinstance(y, Variable): - x = paddle.fill_constant(shape=[1], dtype=paddle.get_default_dtype(), value=x) - y = paddle.fill_constant(shape=[1], dtype=paddle.get_default_dtype(), value=y) - - # rule 3: both the inputs are Tensor - elif isinstance(x, Variable) and isinstance(y, Variable): - if y.dtype != x.dtype: - raise TypeError("divide(): argument position 1 and argument position 2 must have the same dtype." - "But x is {}, y is {}".format(x.dtype, y.dtype)) - - # rule 4: x is Tensor, y is scalar - elif isinstance(x, Variable) and not isinstance(y, Variable): - y = paddle.fill_constant(shape=[1], dtype=x.dtype, value=y) - - # rule 5: x is scalar, y is Tensor - elif not isinstance(x, Variable) and isinstance(y, Variable): - x = paddle.fill_constant(shape=[1], dtype=y.dtype, value=x) - return _elementwise_op(LayerHelper(op_type, **locals())) @@ -530,14 +412,11 @@ def remainder(x, y, name=None): .. 
code-block:: python import paddle - import numpy as np paddle.disable_static() - np_x = np.array([2, 3, 8, 7]) - np_y = np.array([1, 5, 3, 3]) - x = paddle.to_tensor(np_x) - y = paddle.to_tensor(np_y) + x = paddle.to_tensor([2, 3, 8, 7]) + y = paddle.to_tensor([1, 5, 3, 3]) z = paddle.remainder(x, y) print(z.numpy()) # [0, 3, 2, 1] @@ -545,43 +424,9 @@ def remainder(x, y, name=None): op_type = 'elementwise_mod' axis = -1 if in_dygraph_mode(): - # rule 1 : avoid numpy.ndarray - if isinstance(x, numpy.ndarray) or isinstance(y, numpy.ndarray): - raise TypeError("remainder(): arguments must be Tensor or scalar, not numpy.ndarray.") - - elif not isinstance(x, paddle.Tensor): - raise TypeError("remainder(): arguments position 1 must be Tensor, not {}".format(type(x))) - - # rule 3: both the inputs are Tensor - elif isinstance(y, paddle.Tensor): - if y.dtype != x.dtype: - raise TypeError("remainder(): argument position 1 and argument position 2 must have the same dtype." - "But x is {}, y is {}".format(x.dtype, y.dtype)) - - # rule 4: x is Tensor, y is scalar - elif not isinstance(y, paddle.Tensor): - y = paddle.full(shape=[1], dtype=x.dtype, fill_value=y) - return _elementwise_op_in_dygraph( x, y, axis=axis, op_name=op_type) - # rule 1 : avoid numpy.ndarray - if isinstance(x, numpy.ndarray) or isinstance(y, numpy.ndarray): - raise TypeError("remainder(): arguments must be Tensor or scalar, not numpy.ndarray.") - - elif not isinstance(x, Variable): - raise TypeError("remainder(): arguments position 1 must be Tensor, not {}".format(type(x))) - - # rule 3: both the inputs are Tensor - elif isinstance(y, Variable): - if y.dtype != x.dtype: - raise TypeError("remainder(): argument position 1 and argument position 2 must have the same dtype." 
- "But x is {}, y is {}".format(x.dtype, y.dtype)) - - # rule 4: x is Tensor, y is scalar - elif not isinstance(y, paddle.Tensor): - y = paddle.fill_constant(shape=[1], dtype=x.dtype, value=y) - return _elementwise_op(LayerHelper(op_type, **locals())) @@ -612,20 +457,15 @@ def multiply(x, y, axis=-1, name=None): .. code-block:: python import paddle - import numpy as np paddle.disable_static() - x_data = np.array([[1, 2], [3, 4]], dtype=np.float32) - y_data = np.array([[5, 6], [7, 8]], dtype=np.float32) - x = paddle.to_tensor(x_data) - y = paddle.to_tensor(y_data) + x = paddle.to_tensor([[1, 2], [3, 4]]) + y = paddle.to_tensor([[5, 6], [7, 8]]) res = paddle.multiply(x, y) print(res.numpy()) # [[5, 12], [21, 32]] - x_data = np.array([[[1, 2, 3], [1, 2, 3]]], dtype=np.float32) - y_data = np.array([1, 2], dtype=np.float32) - x = paddle.to_tensor(x_data) - y = paddle.to_tensor(y_data) + x = paddle.to_tensor([[[1, 2, 3], [1, 2, 3]]]) + y = paddle.to_tensor([1, 2]) res = paddle.multiply(x, y, axis=1) print(res.numpy()) # [[[1, 2, 3], [2, 4, 6]]] @@ -654,36 +494,28 @@ Examples: paddle.disable_static() - x_data = np.array([[1, 2], [3, 4]], dtype=np.float32) - y_data = np.array([[5, 6], [7, 8]], dtype=np.float32) - x = paddle.to_variable(x_data) - y = paddle.to_variable(y_data) + x = paddle.to_tensor([[1, 2], [3, 4]]) + y = paddle.to_tensor([[5, 6], [7, 8]]) res = paddle.maximum(x, y) print(res.numpy()) #[[5. 6.] # [7. 8.]] - x_data = np.array([[[1, 2, 3], [1, 2, 3]]], dtype=np.float32) - y_data = np.array([1, 2], dtype=np.float32) - x = paddle.to_variable(x_data) - y = paddle.to_variable(y_data) + x = paddle.to_tensor([[[1, 2, 3], [1, 2, 3]]]) + y = paddle.to_tensor([1, 2]) res = paddle.maximum(x, y, axis=1) print(res.numpy()) #[[[1. 2. 3.] # [2. 2. 
3.]]] - x_data = np.array([2, 3, 5], dtype=np.float32) - y_data = np.array([1, 4, np.nan], dtype=np.float32) - x = paddle.to_variable(x_data) - y = paddle.to_variable(y_data) + x = paddle.to_tensor([2, 3, 5], dtype='float32') + y = paddle.to_tensor([1, 4, np.nan], dtype='float32') res = paddle.maximum(x, y) print(res.numpy()) #[ 2. 4. nan] - x_data = np.array([5, 3, np.inf], dtype=np.float32) - y_data = np.array([1, 4, 5], dtype=np.float32) - x = paddle.to_variable(x_data) - y = paddle.to_variable(y_data) + x = paddle.to_tensor([5, 3, np.inf], dtype='float32') + y = paddle.to_tensor([1, 4, 5], dtype='float32') res = paddle.maximum(x, y) print(res.numpy()) #[ 5. 4. inf] @@ -703,38 +535,31 @@ Examples: import paddle import numpy as np + paddle.disable_static() - x_data = np.array([[1, 2], [3, 4]], dtype=np.float32) - y_data = np.array([[5, 6], [7, 8]], dtype=np.float32) - x = paddle.to_variable(x_data) - y = paddle.to_variable(y_data) + x = paddle.to_tensor([[1, 2], [3, 4]], dtype='float32') + y = paddle.to_tensor([[5, 6], [7, 8]], dtype='float32') res = paddle.minimum(x, y) print(res.numpy()) #[[1. 2.] # [3. 4.]] - x_data = np.array([[[1, 2, 3], [1, 2, 3]]], dtype=np.float32) - y_data = np.array([1, 2], dtype=np.float32) - x = paddle.to_variable(x_data) - y = paddle.to_variable(y_data) + x = paddle.to_tensor([[[1, 2, 3], [1, 2, 3]]], dtype='float32') + y = paddle.to_tensor([1, 2], dtype='float32') res = paddle.minimum(x, y, axis=1) print(res.numpy()) #[[[1. 1. 1.] # [2. 2. 2.]]] - x_data = np.array([2, 3, 5], dtype=np.float32) - y_data = np.array([1, 4, np.nan], dtype=np.float32) - x = paddle.to_variable(x_data) - y = paddle.to_variable(y_data) + x = paddle.to_tensor([2, 3, 5], dtype='float32') + y = paddle.to_tensor([1, 4, np.nan], dtype='float32') res = paddle.minimum(x, y) print(res.numpy()) #[ 1. 3. 
nan] - x_data = np.array([5, 3, np.inf], dtype=np.float32) - y_data = np.array([1, 4, 5], dtype=np.float32) - x = paddle.to_variable(x_data) - y = paddle.to_variable(y_data) + x = paddle.to_tensor([5, 3, np.inf], dtype='float32') + y = paddle.to_tensor([1, 4, 5], dtype='float32') res = paddle.minimum(x, y) print(res.numpy()) #[1. 3. 5.] @@ -794,33 +619,33 @@ def sum(x, axis=None, dtype=None, keepdim=False, name=None): it's data type is the same as `x`. Raises: - ValueError: The :attr:`dtype` must be float64 or int64. + ValueError: If the data type of `x` is float64, :attr:`dtype` can not be float32 or int32. + ValueError: If the data type of `x` is int64, :attr:`dtype` can not be int32. TypeError: The type of :attr:`axis` must be int, list or tuple. Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - # x is a Tensor variable with following elements: + # x is a Tensor with following elements: # [[0.2, 0.3, 0.5, 0.9] # [0.1, 0.2, 0.6, 0.7]] # Each example is followed by the corresponding output tensor. - x_data = np.array([[0.2, 0.3, 0.5, 0.9],[0.1, 0.2, 0.6, 0.7]]).astype('float32') - x = paddle.to_variable(x_data) + x = paddle.to_tensor([[0.2, 0.3, 0.5, 0.9], + [0.1, 0.2, 0.6, 0.7]]) out1 = paddle.sum(x) # [3.5] out2 = paddle.sum(x, axis=0) # [0.3, 0.5, 1.1, 1.6] out3 = paddle.sum(x, axis=-1) # [1.9, 1.6] out4 = paddle.sum(x, axis=1, keepdim=True) # [[1.9], [1.6]] - # y is a Tensor variable with shape [2, 2, 2] and elements as below: + # y is a Tensor with shape [2, 2, 2] and elements as below: # [[[1, 2], [3, 4]], # [[5, 6], [7, 8]]] # Each example is followed by the corresponding output tensor. 
- y_data = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]]).astype('float32') - y = paddle.to_variable(y_data) + y = paddle.to_tensor([[[1, 2], [3, 4]], + [[5, 6], [7, 8]]]) out5 = paddle.sum(y, axis=[1, 2]) # [10, 26] out6 = paddle.sum(y, axis=[0, 1]) # [16, 20] """ @@ -850,10 +675,6 @@ def sum(x, axis=None, dtype=None, keepdim=False, name=None): 'out_dtype': convert_np_dtype_to_dtype_(dtype) }) dtype_flag = True - else: - raise ValueError( - "The value of 'dtype' in sum op must be float64, int64, but received of {}". - format(dtype)) if in_dygraph_mode(): axis = axis if axis != None and axis != [] else [0] @@ -867,6 +688,17 @@ def sum(x, axis=None, dtype=None, keepdim=False, name=None): 'reduce_all', reduce_all_flag) check_variable_and_dtype( x, 'x', ['float32', 'float64', 'int32', 'int64'], 'sum') + + if dtype is not None: + check_dtype(dtype, 'dtype', ['float32', 'float64', 'int32', 'int64'], 'sum') + x_dtype = convert_dtype(x.dtype) + + if (x_dtype == "float64" and dtype in ["float32", "int32"]) or \ + (x_dtype == "int64" and dtype == "int32"): + raise ValueError("The input(x)'s dtype is {} but the attr(dtype) of sum is {}, " + "which may cause data type overflows. Please reset attr(dtype) of sum." + .format(x_dtype, dtype)) + check_type(axis, 'axis', (int, list, tuple, type(None)), 'sum') helper = LayerHelper('sum', **locals()) @@ -1121,9 +953,9 @@ def addmm(input, x, y, beta=1.0, alpha=1.0, name=None): paddle.disable_static() - x = paddle.to_variable(data_x) - y = paddle.to_variable(data_y) - input = paddle.to_variable(data_input) + x = paddle.to_tensor(data_x) + y = paddle.to_tensor(data_y) + input = paddle.to_tensor(data_input) out = paddle.tensor.addmm( input=input, x=x, y=y, beta=0.5, alpha=5.0 ) @@ -1204,12 +1036,10 @@ def logsumexp(x, axis=None, keepdim=False, name=None): .. 
code-block:: python import paddle - import numpy as np paddle.disable_static() - x = np.array([[-1.5, 0., 2.], [3., 1.2, -2.4]]) - x = paddle.to_tensor(x) + x = paddle.to_tensor([[-1.5, 0., 2.], [3., 1.2, -2.4]]) out1 = paddle.logsumexp(x) # [3.4691226] out2 = paddle.logsumexp(x, 1) # [2.15317821, 3.15684602] @@ -1223,15 +1053,14 @@ def logsumexp(x, axis=None, keepdim=False, name=None): axis = [0] if in_dygraph_mode(): - return core.ops.logsumexp(x, 'dim', axis, 'keep_dim', keepdim, - 'reduce_all', reduce_all) + return core.ops.logsumexp(x, 'axis', axis, 'keepdim', keepdim, 'reduce_all', reduce_all) check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'logsumexp') helper = LayerHelper('logsumexp', **locals()) - attrs = {'dim': axis, 'keep_dim': keepdim, 'reduce_all': reduce_all} + attrs = {'axis': axis, 'keepdim': keepdim, 'reduce_all':reduce_all} out = helper.create_variable_for_type_inference(x.dtype) helper.append_op( type='logsumexp', inputs={'X': x}, outputs={'Out': out}, attrs=attrs) @@ -1260,12 +1089,10 @@ def inverse(x, name=None): Examples: .. code-block:: python - import numpy as np import paddle - - mat_np = np.array([[2, 0], [0, 2]]).astype("float32") paddle.disable_static() - mat = paddle.to_variable(mat_np) + + mat = paddle.to_tensor([[2, 0], [0, 2]], dtype='float32') inv = paddle.inverse(mat) print(inv) # [[0.5, 0], [0, 0.5]] @@ -1316,16 +1143,15 @@ def max(x, axis=None, keepdim=False, name=None): Examples: .. 
code-block:: python - import numpy as np import paddle paddle.disable_static() # data_x is a variable with shape [2, 4] # the axis is a int element - data_x = np.array([[0.2, 0.3, 0.5, 0.9], - [0.1, 0.2, 0.6, 0.7]]) - x = paddle.to_variable(data_x) + + x = paddle.to_tensor([[0.2, 0.3, 0.5, 0.9], + [0.1, 0.2, 0.6, 0.7]]) result1 = paddle.max(x) print(result1.numpy()) #[0.9] @@ -1342,9 +1168,9 @@ def max(x, axis=None, keepdim=False, name=None): # data_y is a variable with shape [2, 2, 2] # the axis is list - data_y = np.array([[[1.0, 2.0], [3.0, 4.0]], - [[5.0, 6.0], [7.0, 8.0]]]) - y = paddle.to_variable(data_y) + + y = paddle.to_tensor([[[1.0, 2.0], [3.0, 4.0]], + [[5.0, 6.0], [7.0, 8.0]]]) result5 = paddle.max(y, axis=[1, 2]) print(result5.numpy()) #[4. 8.] @@ -1411,16 +1237,14 @@ def min(x, axis=None, keepdim=False, name=None): Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - # data_x is a variable with shape [2, 4] + # x is a tensor with shape [2, 4] # the axis is a int element - data_x = np.array([[0.2, 0.3, 0.5, 0.9], - [0.1, 0.2, 0.6, 0.7]]) - x = paddle.to_variable(data_x) + x = paddle.to_tensor([[0.2, 0.3, 0.5, 0.9], + [0.1, 0.2, 0.6, 0.7]]) result1 = paddle.min(x) print(result1.numpy()) #[0.1] @@ -1435,11 +1259,10 @@ def min(x, axis=None, keepdim=False, name=None): #[[0.2] # [0.1]] - # data_y is a variable with shape [2, 2, 2] + # y is a variable with shape [2, 2, 2] # the axis is list - data_y = np.array([[[1.0, 2.0], [3.0, 4.0]], - [[5.0, 6.0], [7.0, 8.0]]]) - y = paddle.to_variable(data_y) + y = paddle.to_tensor([[[1.0, 2.0], [3.0, 4.0]], + [[5.0, 6.0], [7.0, 8.0]]]) result5 = paddle.min(y, axis=[1, 2]) print(result5.numpy()) #[1. 5.] @@ -1596,11 +1419,9 @@ def clip(x, min=None, max=None, name=None): .. 
code-block:: python import paddle - import numpy as np paddle.disable_static() - x = np.array([[1.2,3.5], [4.5,6.4]]).astype('float32') - x1 = paddle.to_variable(x) + x1 = paddle.to_tensor([[1.2, 3.5], [4.5, 6.4]], 'float32') out1 = paddle.clip(x1, min=3.5, max=5.0) out2 = paddle.clip(x1, min=2.5) print(out1.numpy()) @@ -1611,11 +1432,8 @@ def clip(x, min=None, max=None, name=None): # [[4.5, 6.4] """ - np_dtype = np.float32 - if x.dtype == VarDesc.VarType.FP64: - np_dtype = np.float64 - fmin = float(np.finfo(np_dtype).min) - fmax = float(np.finfo(np_dtype).max) + fmin = float(np.finfo(np.float32).min) + fmax = float(np.finfo(np.float32).max) if in_dygraph_mode(): if isinstance(min, Variable): @@ -1656,7 +1474,7 @@ def clip(x, min=None, max=None, name=None): helper = LayerHelper('clip', **locals()) output = helper.create_variable_for_type_inference( - dtype=helper.input_dtype()) + dtype=helper.input_dtype('x')) helper.append_op( type='clip', inputs=inputs, outputs={'Out': [output]}, attrs=attrs) @@ -1704,9 +1522,9 @@ def trace(x, offset=0, axis1=0, axis2=1, name=None): paddle.disable_static() - case1 = paddle.to_variable(case1) - case2 = paddle.to_variable(case2) - case3 = paddle.to_variable(case3) + case1 = paddle.to_tensor(case1) + case2 = paddle.to_tensor(case2) + case3 = paddle.to_tensor(case3) data1 = paddle.trace(case1) # data1.shape = [1] data2 = paddle.trace(case2, offset=1, axis1=1, axis2=2) # data2.shape = [3] data3 = paddle.trace(case3, offset=-3, axis1=1, axis2=-1) # data2.shape = [3, 5] @@ -1897,10 +1715,8 @@ def isfinite(x, name=None): .. 
code-block:: python import paddle - import numpy as np paddle.disable_static() - x_np = np.array([float('-inf'), -2, 3.6, float('inf'), 0, float('-nan'), float('nan')]) - x = paddle.to_tensor(x_np) + x = paddle.to_tensor([float('-inf'), -2, 3.6, float('inf'), 0, float('-nan'), float('nan')]) out = paddle.tensor.isfinite(x) print(out.numpy()) # [False True True False True False False] """ @@ -1928,10 +1744,8 @@ def isinf(x, name=None): .. code-block:: python import paddle - import numpy as np paddle.disable_static() - x_np = np.array([float('-inf'), -2, 3.6, float('inf'), 0, float('-nan'), float('nan')]) - x = paddle.to_tensor(x_np) + x = paddle.to_tensor([float('-inf'), -2, 3.6, float('inf'), 0, float('-nan'), float('nan')]) out = paddle.tensor.isinf(x) print(out.numpy()) # [ True False False True False False False] """ @@ -1959,10 +1773,8 @@ def isnan(x, name=None): .. code-block:: python import paddle - import numpy as np paddle.disable_static() - x_np = np.array([float('-inf'), -2, 3.6, float('inf'), 0, float('-nan'), float('nan')]) - x = paddle.to_tensor(x_np) + x = paddle.to_tensor([float('-inf'), -2, 3.6, float('inf'), 0, float('-nan'), float('nan')]) out = paddle.tensor.isnan(x) print(out.numpy()) # [False False False False False True True] """ @@ -2005,14 +1817,12 @@ def prod(x, axis=None, keepdim=False, dtype=None, name=None): .. 
code-block:: python import paddle - import numpy as np paddle.disable_static() # the axis is a int element - data_x = np.array([[0.2, 0.3, 0.5, 0.9], - [0.1, 0.2, 0.6, 0.7]]).astype(np.float32) - x = paddle.to_tensor(data_x) + x = paddle.to_tensor([[0.2, 0.3, 0.5, 0.9], + [0.1, 0.2, 0.6, 0.7]]) out1 = paddle.prod(x) print(out1.numpy()) # [0.0002268] @@ -2038,9 +1848,8 @@ def prod(x, axis=None, keepdim=False, dtype=None, name=None): # int64 # the axis is list - data_y = np.array([[[1.0, 2.0], [3.0, 4.0]], - [[5.0, 6.0], [7.0, 8.0]]]) - y = paddle.to_tensor(data_y) + y = paddle.to_tensor([[[1.0, 2.0], [3.0, 4.0]], + [[5.0, 6.0], [7.0, 8.0]]]) out6 = paddle.prod(y, [0, 1]) print(out6.numpy()) # [105. 384.] @@ -2073,12 +1882,10 @@ def sign(x, name=None): Examples: .. code-block:: python - import numpy as np import paddle - data = np.array([3.0, 0.0, -2.0, 1.7], dtype='float32') paddle.disable_static() - x = paddle.to_tensor(data) + x = paddle.to_tensor([3.0, 0.0, -2.0, 1.7], dtype='float32') out = paddle.sign(x=x) print(out) # [1.0, 0.0, -1.0, 1.0] """ @@ -2113,12 +1920,9 @@ def tanh(x, name=None): .. 
code-block:: python import paddle - import numpy as np paddle.disable_static() - - x_data = np.array([-0.4, -0.2, 0.1, 0.3]) - x = paddle.to_tensor(x_data) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) out = paddle.tanh(x) print(out.numpy()) # [-0.37994896 -0.19737532 0.09966799 0.29131261] diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py index 6b08599fad1dfc6b5d60c3798bba802a5ddefd02..9ffd81995eda407740fce03b488375e06a3ae37b 100644 --- a/python/paddle/tensor/random.py +++ b/python/paddle/tensor/random.py @@ -14,26 +14,18 @@ # TODO: define random functions -import numpy as np - from ..fluid import core -from ..fluid.framework import device_guard, in_dygraph_mode, _varbase_creator, Variable, convert_np_dtype_to_dtype_ -from ..fluid.layers.layer_function_generator import templatedoc +from ..fluid.framework import in_dygraph_mode, Variable, convert_np_dtype_to_dtype_ from ..fluid.layer_helper import LayerHelper -from ..fluid.data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype +from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype, check_shape from ..fluid.layers import utils -from ..fluid.layers.tensor import fill_constant import paddle -import warnings - -from ..fluid.io import shuffle #DEFINE_ALIAS __all__ = [ 'bernoulli', 'standard_normal', 'normal', 'uniform', - 'shuffle', 'randn', 'rand', 'randint', @@ -65,7 +57,6 @@ def bernoulli(x, name=None): .. code-block:: python import paddle - import numpy as np paddle.disable_static() @@ -94,26 +85,26 @@ def bernoulli(x, name=None): return out -def gaussian_random(shape, mean=0.0, std=1.0, dtype=None, name=None): +def gaussian(shape, mean=0.0, std=1.0, dtype=None, name=None): """ This OP returns a Tensor filled with random values sampled from a Gaussian distribution, with ``shape`` and ``dtype``. Args: - shape(list|tuple|Tensor): The shape of the output Tensor. If ``shape`` + shape (list|tuple|Tensor): The shape of the output Tensor. 
If ``shape`` is a list or tuple, the elements of it should be integers or Tensors (with the shape [1], and the data type int32 or int64). If ``shape`` is a Tensor, it should be a 1-D Tensor(with the data type int32 or int64). - mean(float|int, optional): Mean of the output tensor, default is 0.0. - std(float|int, optional): Standard deviation of the output tensor, default + mean (float|int, optional): Mean of the output tensor, default is 0.0. + std (float|int, optional): Standard deviation of the output tensor, default is 1.0. - seed(int, optional): ${seed_comment} - dtype(str|np.dtype, optional): The data type of the output Tensor. + seed (int, optional): Random seed of generator. + dtype (str|np.dtype, optional): The data type of the output Tensor. Supported data types: float32, float64. Default is None, use global default dtype (see ``get_default_dtype`` for details). - name(str, optional): The default value is None. Normally there is no + name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. @@ -121,26 +112,26 @@ def gaussian_random(shape, mean=0.0, std=1.0, dtype=None, name=None): Tensor: A Tensor filled with random values sampled from a Gaussian distribution, with ``shape`` and ``dtype``. 
""" + op_type_for_check = 'gaussian/standard_normal/randn/normal' + seed = 0 + if dtype is None: dtype = paddle.framework.get_default_dtype() if dtype not in ['float32', 'float64']: raise TypeError( - "gaussian_random only supports [float32, float64], but the default dtype is %s" - % dtype) - + "{} only supports [float32, float64], but the default dtype is {}" + .format(op_type_for_check, dtype)) if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) - seed = 0 - op_type_for_check = 'gaussian_random/standard_normal/randn/normal' if in_dygraph_mode(): - shape = utils._convert_shape_to_list(shape) + shape = utils.convert_shape_to_list(shape) return core.ops.gaussian_random('shape', shape, 'mean', float(mean), 'std', float(std), 'seed', seed, 'dtype', dtype) - check_type(shape, 'shape', (list, tuple, Variable), op_type_for_check) + check_shape(shape, op_type_for_check) check_dtype(dtype, 'dtype', ['float32', 'float64'], op_type_for_check) inputs = {} @@ -151,10 +142,10 @@ def gaussian_random(shape, mean=0.0, std=1.0, dtype=None, name=None): 'dtype': dtype, 'use_mkldnn': False } - utils._get_shape_tensor_inputs( + utils.get_shape_tensor_inputs( inputs=inputs, attrs=attrs, shape=shape, op_type=op_type_for_check) - helper = LayerHelper('gaussian_random', **locals()) + helper = LayerHelper('gaussian', **locals()) out = helper.create_variable_for_type_inference(dtype) helper.append_op( type='gaussian_random', @@ -172,12 +163,12 @@ def standard_normal(shape, dtype=None, name=None): and ``dtype``. Args: - shape(list|tuple|Tensor): The shape of the output Tensor. If ``shape`` + shape (list|tuple|Tensor): The shape of the output Tensor. If ``shape`` is a list or tuple, the elements of it should be integers or Tensors (with the shape [1], and the data type int32 or int64). If ``shape`` is a Tensor, it should be a 1-D Tensor(with the data type int32 or int64). - dtype(str|np.dtype, optional): The data type of the output Tensor. 
+ dtype (str|np.dtype, optional): The data type of the output Tensor. Supported data types: float32, float64. Default is None, use global default dtype (see ``get_default_dtype`` for details). @@ -189,27 +180,22 @@ def standard_normal(shape, dtype=None, name=None): normal distribution with mean 0 and standard deviation 1, with ``shape`` and ``dtype``. - Raises: - TypeError: If ``shape`` is not list, tuple, Tensor. - TypeError: If ``dtype`` is not float32, float64. - Examples: .. code-block:: python import paddle - import numpy as np paddle.disable_static() # example 1: attr shape is a list which doesn't contain Tensor. - result_1 = paddle.standard_normal(shape=[2, 3]) + out1 = paddle.standard_normal(shape=[2, 3]) # [[-2.923464 , 0.11934398, -0.51249987], # random # [ 0.39632758, 0.08177969, 0.2692008 ]] # random # example 2: attr shape is a list which contains Tensor. - dim_1 = paddle.fill_constant([1], "int64", 2) - dim_2 = paddle.fill_constant([1], "int32", 3) - result_2 = paddle.standard_normal(shape=[dim_1, dim_2, 2]) + dim1 = paddle.full([1], 2, "int64") + dim2 = paddle.full([1], 3, "int32") + out2 = paddle.standard_normal(shape=[dim1, dim2, 2]) # [[[-2.8852394 , -0.25898588], # random # [-0.47420555, 0.17683524], # random # [-0.7989969 , 0.00754541]], # random @@ -218,21 +204,14 @@ def standard_normal(shape, dtype=None, name=None): # [ 0.8086993 , 0.6868893 ]]] # random # example 3: attr shape is a Tensor, the data type must be int64 or int32. 
- var_shape = paddle.to_tensor(np.array([2, 3])) - result_3 = paddle.standard_normal(var_shape) + shape_tensor = paddle.to_tensor([2, 3]) + result_3 = paddle.standard_normal(shape_tensor) + # [[-2.878077 , 0.17099959, 0.05111201] # random # [-0.3761474, -1.044801 , 1.1870178 ]] # random """ - if dtype is None: - dtype = paddle.framework.get_default_dtype() - if dtype not in ['float32', 'float64']: - raise TypeError( - "standard_normal only supports [float32, float64], but the default dtype is %s" - % dtype) - - return gaussian_random( - shape=shape, mean=0.0, std=1.0, dtype=dtype, name=name) + return gaussian(shape=shape, mean=0.0, std=1.0, dtype=dtype, name=name) randn = standard_normal @@ -275,7 +254,6 @@ def normal(mean=0.0, std=1.0, shape=None, name=None): .. code-block:: python import paddle - import numpy as np paddle.disable_static() @@ -283,11 +261,11 @@ def normal(mean=0.0, std=1.0, shape=None, name=None): # [[ 0.17501129 0.32364586 1.561118 ] # random # [-1.7232178 1.1545963 -0.76156676]] # random - mean_tensor = paddle.to_tensor(np.array([1.0, 2.0, 3.0])) + mean_tensor = paddle.to_tensor([1.0, 2.0, 3.0]) out2 = paddle.normal(mean=mean_tensor) # [ 0.18644847 -1.19434458 3.93694787] # random - std_tensor = paddle.to_tensor(np.array([1.0, 2.0, 3.0])) + std_tensor = paddle.to_tensor([1.0, 2.0, 3.0]) out3 = paddle.normal(mean=mean_tensor, std=std_tensor) # [1.00780561 3.78457445 5.81058198] # random @@ -306,16 +284,7 @@ def normal(mean=0.0, std=1.0, shape=None, name=None): "If std is Tensor, it's data type only support float32, float64." ) if shape is not None: - if isinstance(shape, (list, tuple)): - for item in shape: - check_type(item, 'shape', (int), 'normal', - 'Elements of shape should be int.') - elif isinstance(shape, Variable): - check_dtype(shape.dtype, 'shape', ['int32', 'int64'], 'normal') - else: - assert TypeError( - 'If mean and std are all not Tensor, shape should be list, tuple, Tensor.' 
- ) + check_shape(shape, 'normal') if isinstance(mean, Variable): if isinstance(std, Variable): @@ -330,7 +299,7 @@ def normal(mean=0.0, std=1.0, shape=None, name=None): mean = float(mean) out = standard_normal(paddle.shape(std), std.dtype, name) else: - return gaussian_random(shape=shape, mean=mean, std=std, name=name) + return gaussian(shape=shape, mean=mean, std=std, name=name) out = out * std + mean if not in_dygraph_mode(): @@ -383,7 +352,6 @@ def uniform(shape, dtype=None, min=-1.0, max=1.0, seed=0, name=None): Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() @@ -405,8 +373,7 @@ def uniform(shape, dtype=None, min=-1.0, max=1.0, seed=0, name=None): # example 3: # attr shape is a Tensor, the data type must be int64 or int32. - shape = np.array([2, 3]) - shape_tensor = paddle.to_tensor(shape) + shape_tensor = paddle.to_tensor([2, 3]) result_3 = paddle.tensor.random.uniform(shape_tensor) # if shape_tensor's value is [2, 3] # result_3 is: @@ -419,27 +386,27 @@ def uniform(shape, dtype=None, min=-1.0, max=1.0, seed=0, name=None): dtype = paddle.framework.get_default_dtype() if dtype not in ['float32', 'float64']: raise TypeError( - "uniform only supports [float32, float64], but the default dtype is %s" - % dtype) + "uniform/rand only supports [float32, float64], but the default dtype is {}". 
+ format(dtype)) if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) if in_dygraph_mode(): - shape = utils._convert_shape_to_list(shape) + shape = utils.convert_shape_to_list(shape) return core.ops.uniform_random('shape', shape, 'min', float(min), 'max', float(max), 'seed', seed, 'dtype', dtype) - check_type(shape, 'shape', (list, tuple, Variable), 'uniform_random/rand') - check_dtype(dtype, 'dtype', ('float32', 'float64'), 'uniform_random/rand') + check_type(shape, 'shape', (list, tuple, Variable), 'uniform/rand') + check_dtype(dtype, 'dtype', ('float32', 'float64'), 'uniform/rand') inputs = dict() attrs = {'seed': seed, 'min': min, 'max': max, 'dtype': dtype} - utils._get_shape_tensor_inputs( - inputs=inputs, attrs=attrs, shape=shape, op_type='uniform_random/rand') + utils.get_shape_tensor_inputs( + inputs=inputs, attrs=attrs, shape=shape, op_type='uniform/rand') - helper = LayerHelper("uniform_random", **locals()) + helper = LayerHelper("uniform", **locals()) out = helper.create_variable_for_type_inference(dtype) helper.append_op( type="uniform_random", inputs=inputs, attrs=attrs, @@ -449,29 +416,26 @@ def uniform(shape, dtype=None, min=-1.0, max=1.0, seed=0, name=None): def randint(low=0, high=None, shape=[1], dtype=None, name=None): """ - :alias_main: paddle.randint - :alias: paddle.tensor.randint, paddle.tensor.random.randint - This OP returns a Tensor filled with random integers from a discrete uniform distribution in the range [``low``, ``high``), with ``shape`` and ``dtype``. If ``high`` is None (the default), the range is [0, ``low``). Args: - low(int): The lower bound on the range of random values to generate. + low (int): The lower bound on the range of random values to generate. The ``low`` is included in the range. If ``high`` is None, the range is [0, ``low``). Default is 0. 
- high(int, optional): The upper bound on the range of random values to + high (int, optional): The upper bound on the range of random values to generate, the ``high`` is excluded in the range. Default is None (see above for behavior if high = None). Default is None. - shape(list|tuple|Tensor): The shape of the output Tensor. If ``shape`` + shape (list|tuple|Tensor): The shape of the output Tensor. If ``shape`` is a list or tuple, the elements of it should be integers or Tensors (with the shape [1], and the data type int32 or int64). If ``shape`` is a Tensor, it should be a 1-D Tensor(with the data type int32 or int64). Default is [1]. - dtype(str|np.dtype, optional): The data type of the + dtype (str|np.dtype, optional): The data type of the output tensor. Supported data types: int32, int64. If ``dytpe`` is None, the data type is int64. Default is None. - name(str, optional): The default value is None. Normally there is no + name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. @@ -479,48 +443,43 @@ def randint(low=0, high=None, shape=[1], dtype=None, name=None): Tensor: A Tensor filled with random integers from a discrete uniform distribution in the range [``low``, ``high``), with ``shape`` and ``dtype``. - Raises: - TypeError: If ``shape`` is not list, tuple, Tensor. - TypeError: If ``dtype`` is not int32, int64. - ValueError: If ``high`` is not greater then ``low``; If ``high`` is - None, and ``low`` is not greater than 0. - Examples: .. code-block:: python import paddle - import numpy as np paddle.disable_static() # example 1: # attr shape is a list which doesn't contain Tensor. - result_1 = paddle.randint(low=-5, high=5, shape=[3]) + out1 = paddle.randint(low=-5, high=5, shape=[3]) # [0, -3, 2] # random # example 2: # attr shape is a list which contains Tensor. 
- dim_1 = paddle.fill_constant([1], "int64", 2) - dim_2 = paddle.fill_constant([1], "int32", 3) - result_2 = paddle.randint(low=-5, high=5, shape=[dim_1, dim_2], dtype="int32") + dim1 = paddle.full([1], 2, "int64") + dim2 = paddle.full([1], 3, "int32") + out2 = paddle.randint(low=-5, high=5, shape=[dim1, dim2], dtype="int32") # [[0, -1, -3], # random # [4, -2, 0]] # random # example 3: # attr shape is a Tensor - var_shape = paddle.to_variable(np.array([3])) - result_3 = paddle.randint(low=-5, high=5, shape=var_shape) + + shape_tensor = paddle.to_tensor(3) + result_3 = paddle.randint(low=-5, high=5, shape=shape_tensor) + # [-2, 2, 3] # random # example 4: # data type is int32 - result_4 = paddle.randint(low=-5, high=5, shape=[3], dtype='int32') + out4 = paddle.randint(low=-5, high=5, shape=[3], dtype='int32') # [-5, 4, -4] # random # example 5: # Input only one parameter # low=0, high=10, shape=[1], dtype='int64' - result_5 = paddle.randint(10) + out5 = paddle.randint(10) # [7] # random """ @@ -537,11 +496,11 @@ def randint(low=0, high=None, shape=[1], dtype=None, name=None): dtype = convert_np_dtype_to_dtype_(dtype) if in_dygraph_mode(): - shape = utils._convert_shape_to_list(shape) + shape = utils.convert_shape_to_list(shape) return core.ops.randint('shape', shape, 'low', low, 'high', high, 'seed', 0, 'dtype', dtype) - check_type(shape, 'shape', (list, tuple, Variable), 'randint') + check_shape(shape, 'randint') check_dtype(dtype, 'dtype', ['int32', 'int64'], 'randint') if low >= high: raise ValueError( @@ -550,7 +509,7 @@ def randint(low=0, high=None, shape=[1], dtype=None, name=None): inputs = dict() attrs = {'low': low, 'high': high, 'seed': 0, 'dtype': dtype} - utils._get_shape_tensor_inputs( + utils.get_shape_tensor_inputs( inputs=inputs, attrs=attrs, shape=shape, op_type='randint') helper = LayerHelper("randint", **locals()) @@ -560,21 +519,17 @@ def randint(low=0, high=None, shape=[1], dtype=None, name=None): return out -@templatedoc() def randperm(n, 
dtype="int64", name=None): """ - :alias_main: paddle.randperm - :alias: paddle.tensor.randperm, paddle.tensor.random.randperm - This OP returns a 1-D Tensor filled with random permutation values from 0 to n-1, with ``dtype``. Args: - n(int): The upper bound (exclusive), and it should be greater than 0. - dtype(str|np.dtype, optional): The data type of + n (int): The upper bound (exclusive), and it should be greater than 0. + dtype (str|np.dtype, optional): The data type of the output Tensor. Supported data types: int32, int64, float32, float64. Default is int64. - name(str, optional): The default value is None. Normally there is no + name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. @@ -582,10 +537,6 @@ def randperm(n, dtype="int64", name=None): Tensor: A 1-D Tensor filled with random permutation values from 0 to n-1, with ``dtype``. - Raises: - ValueError: If ``n`` is not greater than 0. - TypeError: If ``dtype`` is not int32, int64, float32, float64. - Examples: .. code-block:: python @@ -593,10 +544,10 @@ def randperm(n, dtype="int64", name=None): paddle.disable_static() - result_1 = paddle.randperm(5) + out1 = paddle.randperm(5) # [4, 1, 2, 3, 0] # random - result_2 = paddle.randperm(7, 'int32') + out2 = paddle.randperm(7, 'int32') # [1, 6, 2, 0, 4, 3, 5] # random """ @@ -622,32 +573,20 @@ def randperm(n, dtype="int64", name=None): def rand(shape, dtype=None, name=None): """ - :alias_main: paddle.rand - :alias: paddle.tensor.rand, paddle.tensor.random.rand - This OP returns a Tensor filled with random values sampled from a uniform distribution in the range [0, 1), with ``shape`` and ``dtype``. - Examples: - :: - - Input: - shape = [1, 2] - - Output: - result=[[0.8505902, 0.8397286]] - Args: - shape(list|tuple|Tensor): The shape of the output Tensor. If ``shape`` + shape (list|tuple|Tensor): The shape of the output Tensor. 
If ``shape`` is a list or tuple, the elements of it should be integers or Tensors (with the shape [1], and the data type int32 or int64). If ``shape`` is a Tensor, it should be a 1-D Tensor(with the data type int32 or int64). - dtype(str|np.dtype, optional): The data type of the output Tensor. + dtype (str|np.dtype, optional): The data type of the output Tensor. Supported data types: float32, float64. Default is None, use global default dtype (see ``get_default_dtype`` for details). - name(str, optional): The default value is None. Normally there is no + name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. @@ -655,26 +594,21 @@ def rand(shape, dtype=None, name=None): Tensor: A Tensor filled with random values sampled from a uniform distribution in the range [0, 1), with ``shape`` and ``dtype``. - Raises: - TypeError: If ``shape`` is not list, tuple, Tensor. - ValueError: If ``dtype`` is not float32, float64. - Examples: .. code-block:: python import paddle - import numpy as np paddle.disable_static() # example 1: attr shape is a list which doesn't contain Tensor. - result_1 = paddle.rand(shape=[2, 3]) + out1 = paddle.rand(shape=[2, 3]) # [[0.451152 , 0.55825245, 0.403311 ], # random # [0.22550228, 0.22106001, 0.7877319 ]] # random # example 2: attr shape is a list which contains Tensor. - dim_1 = paddle.fill_constant([1], "int64", 2) - dim_2 = paddle.fill_constant([1], "int32", 3) - result_2 = paddle.rand(shape=[dim_1, dim_2, 2]) + dim1 = paddle.full([1], 2, "int64") + dim2 = paddle.full([1], 3, "int32") + out2 = paddle.rand(shape=[dim1, dim2, 2]) # [[[0.8879919 , 0.25788337], # random # [0.28826773, 0.9712097 ], # random # [0.26438272, 0.01796806]], # random @@ -683,19 +617,11 @@ def rand(shape, dtype=None, name=None): # [0.870881 , 0.2984597 ]]] # random # example 3: attr shape is a Tensor, the data type must be int64 or int32. 
- var_shape = paddle.to_variable(np.array([2, 3])) - result_3 = paddle.rand(var_shape) + shape_tensor = paddle.to_tensor([2, 3]) + result_3 = paddle.rand(shape_tensor) + # [[0.22920267, 0.841956 , 0.05981819], # random # [0.4836288 , 0.24573246, 0.7516129 ]] # random """ - if dtype is None: - dtype = paddle.framework.get_default_dtype() - if dtype not in ['float32', 'float64']: - raise TypeError( - "rand only supports [float32, float64], but the default dtype is %s" - % dtype) - - out = uniform(shape, dtype, min=0.0, max=1.0, name=name) - out.stop_gradient = True - return out + return uniform(shape, dtype, min=0.0, max=1.0, name=name) diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py index 552da3401c61d9c046c29bc86b429a8ae1242fa5..f55d285586f0ec6959573af64e720bea5de10c8d 100644 --- a/python/paddle/tensor/search.py +++ b/python/paddle/tensor/search.py @@ -66,16 +66,15 @@ def argsort(x, axis=-1, descending=False, name=None): Examples: .. code-block:: python import paddle - import numpy as np paddle.disable_static() - input_array = np.array([[[5,8,9,5], - [0,0,1,7], - [6,9,2,4]], - [[5,2,4,2], - [4,7,7,9], - [1,7,0,6]]]).astype(np.float32) - x = paddle.to_variable(input_array) + x = paddle.to_tensor([[[5,8,9,5], + [0,0,1,7], + [6,9,2,4]], + [[5,2,4,2], + [4,7,7,9], + [1,7,0,6]]], + dtype='float32') out1 = paddle.argsort(x=x, axis=-1) out2 = paddle.argsort(x=x, axis=0) out3 = paddle.argsort(x=x, axis=1) @@ -148,14 +147,12 @@ def argmax(x, axis=None, keepdim=False, dtype="int64", name=None): Examples: .. 
code-block:: python - import numpy as np import paddle paddle.disable_static() - data = np.array([[5,8,9,5], - [0,0,1,7], - [6,9,2,4]]) - x = paddle.to_variable(data) + x = paddle.to_tensor([[5,8,9,5], + [0,0,1,7], + [6,9,2,4]]) out1 = paddle.argmax(x) print(out1.numpy()) # 2 out2 = paddle.argmax(x, axis=1) @@ -169,6 +166,12 @@ def argmax(x, axis=None, keepdim=False, dtype="int64", name=None): raise TypeError( "The type of 'axis' must be int or None in argmax, but received %s." % (type(axis))) + + if dtype is None: + raise ValueError( + "the value of 'dtype' in argmax could not be None, but received None" + ) + var_dtype = convert_np_dtype_to_dtype_(dtype) check_dtype(var_dtype, 'dtype', ['int32', 'int64'], 'argmin') flatten = False @@ -222,14 +225,12 @@ def argmin(x, axis=None, keepdim=False, dtype="int64", name=None): Examples: .. code-block:: python - import numpy as np import paddle paddle.disable_static() - data = np.array([[5,8,9,5], - [0,0,1,7], - [6,9,2,4]]) - x = paddle.to_variable(data) + x = paddle.to_tensor([[5,8,9,5], + [0,0,1,7], + [6,9,2,4]]) out1 = paddle.argmin(x) print(out1.numpy()) # 4 out2 = paddle.argmin(x, axis=1) @@ -243,6 +244,12 @@ def argmin(x, axis=None, keepdim=False, dtype="int64", name=None): raise TypeError( "The type of 'axis' must be int or None in argmin, but received %s." % (type(axis))) + + if dtype is None: + raise ValueError( + "the value of 'dtype' in argmin could not be None, but received None" + ) + var_dtype = convert_np_dtype_to_dtype_(dtype) check_dtype(var_dtype, 'dtype', ['int32', 'int64'], 'argmin') flatten = False @@ -292,24 +299,16 @@ def index_select(x, index, axis=0, name=None): Returns: Tensor: A Tensor with same data type as ``x``. - Raises: - TypeError: ``x`` must be a Tensor and the data type of ``x`` must be one of float32, float64, int32 and int64. - TypeError: ``index`` must be a Tensor and the data type of ``index`` must be int32 or int64. - Examples: .. 
code-block:: python import paddle - import numpy as np paddle.disable_static() # Now we are in imperative mode - data = np.array([[1.0, 2.0, 3.0, 4.0], - [5.0, 6.0, 7.0, 8.0], - [9.0, 10.0, 11.0, 12.0]]) - data_index = np.array([0, 1, 1]).astype('int32') - - x = paddle.to_tensor(data) - index = paddle.to_tensor(data_index) + x = paddle.to_tensor([[1.0, 2.0, 3.0, 4.0], + [5.0, 6.0, 7.0, 8.0], + [9.0, 10.0, 11.0, 12.0]]) + index = paddle.to_tensor([0, 1, 1], dtype='int32') out_z1 = paddle.index_select(x=x, index=index) #[[1. 2. 3. 4.] # [5. 6. 7. 8.] @@ -363,48 +362,44 @@ def nonzero(input, as_tuple=False): Examples: .. code-block:: python import paddle - import paddle.fluid as fluid - import numpy as np - - data1 = np.array([[1.0, 0.0, 0.0], - [0.0, 2.0, 0.0], - [0.0, 0.0, 3.0]]) - data2 = np.array([0.0, 1.0, 0.0, 3.0]) - data3 = np.array([0.0, 0.0, 0.0]) - with fluid.dygraph.guard(): - x1 = fluid.dygraph.to_variable(data1) - x2 = fluid.dygraph.to_variable(data2) - x3 = fluid.dygraph.to_variable(data3) - out_z1 = paddle.nonzero(x1) - print(out_z1.numpy()) - #[[0 0] - # [1 1] - # [2 2]] - out_z1_tuple = paddle.nonzero(x1, as_tuple=True) - for out in out_z1_tuple: - print(out.numpy()) - #[[0] - # [1] - # [2]] - #[[0] - # [1] - # [2]] - out_z2 = paddle.nonzero(x2) - print(out_z2.numpy()) - #[[1] - # [3]] - out_z2_tuple = paddle.nonzero(x2, as_tuple=True) - for out in out_z2_tuple: - print(out.numpy()) - #[[1] - # [3]] - out_z3 = paddle.nonzero(x3) - print(out_z3.numpy()) - #[] - out_z3_tuple = paddle.nonzero(x3, as_tuple=True) - for out in out_z3_tuple: - print(out.numpy()) - #[] + + paddle.disable_static() + + x1 = paddle.to_tensor([[1.0, 0.0, 0.0], + [0.0, 2.0, 0.0], + [0.0, 0.0, 3.0]]) + x2 = paddle.to_tensor([0.0, 1.0, 0.0, 3.0]) + x3 = paddle.to_tensor([0.0, 0.0, 0.0]) + out_z1 = paddle.nonzero(x1) + print(out_z1.numpy()) + #[[0 0] + # [1 1] + # [2 2]] + out_z1_tuple = paddle.nonzero(x1, as_tuple=True) + for out in out_z1_tuple: + print(out.numpy()) + #[[0] + # 
[1] + # [2]] + #[[0] + # [1] + # [2]] + out_z2 = paddle.nonzero(x2) + print(out_z2.numpy()) + #[[1] + # [3]] + out_z2_tuple = paddle.nonzero(x2, as_tuple=True) + for out in out_z2_tuple: + print(out.numpy()) + #[[1] + # [3]] + out_z3 = paddle.nonzero(x3) + print(out_z3.numpy()) + #[] + out_z3_tuple = paddle.nonzero(x3, as_tuple=True) + for out in out_z3_tuple: + print(out.numpy()) + #[] """ list_out = [] shape = input.shape @@ -451,16 +446,15 @@ def sort(x, axis=-1, descending=False, name=None): Examples: .. code-block:: python import paddle - import numpy as np paddle.disable_static() - input_array = np.array([[[5,8,9,5], - [0,0,1,7], - [6,9,2,4]], - [[5,2,4,2], - [4,7,7,9], - [1,7,0,6]]]).astype(np.float32) - x = paddle.to_variable(input_array) + x = paddle.to_tensor([[[5,8,9,5], + [0,0,1,7], + [6,9,2,4]], + [[5,2,4,2], + [4,7,7,9], + [1,7,0,6]]], + dtype='float32') out1 = paddle.sort(x=x, axis=-1) out2 = paddle.sort(x=x, axis=0) out3 = paddle.sort(x=x, axis=1) @@ -536,16 +530,11 @@ def where(condition, x, y, name=None): .. code-block:: python import paddle - import numpy as np - import paddle.fluid as fluid - x_i = np.array([0.9383, 0.1983, 3.2, 1.2]).astype("float32") - y_i = np.array([1.0, 1.0, 1.0, 1.0]).astype("float32") - - with fluid.dygraph.guard(): - x = fluid.dygraph.to_variable(x_i) - y = fluid.dygraph.to_variable(y_i) - out = paddle.where(x>1, x, y) + paddle.disable_static() + x = paddle.to_tensor([0.9383, 0.1983, 3.2, 1.2]) + y = paddle.to_tensor([1.0, 1.0, 1.0, 1.0]) + out = paddle.where(x>1, x, y) print(out.numpy()) #out: [1.0, 1.0, 3.2, 1.2] @@ -622,50 +611,41 @@ def index_sample(x, index): .. 
code-block:: python import paddle - import paddle.fluid as fluid - import numpy as np - - data = np.array([[1.0, 2.0, 3.0, 4.0], - [5.0, 6.0, 7.0, 8.0], - [9.0, 10.0, 11.0, 12.0]]).astype('float32') - - data_index = np.array([[0, 1, 2], - [1, 2, 3], - [0, 0, 0]]).astype('int32') - - target_data = np.array([[100, 200, 300, 400], - [500, 600, 700, 800], - [900, 1000, 1100, 1200]]).astype('int32') - - with fluid.dygraph.guard(): - x = fluid.dygraph.to_variable(data) - index = fluid.dygraph.to_variable(data_index) - target = fluid.dygraph.to_variable(target_data) - - out_z1 = paddle.index_sample(x, index) - print(out_z1.numpy()) - #[[1. 2. 3.] - # [6. 7. 8.] - # [9. 9. 9.]] - - # Use the index of the maximum value by topk op - # get the value of the element of the corresponding index in other tensors - top_value, top_index = fluid.layers.topk(x, k=2) - out_z2 = paddle.index_sample(target, top_index) - print(top_value.numpy()) - #[[ 4. 3.] - # [ 8. 7.] - # [12. 11.]] - - print(top_index.numpy()) - #[[3 2] - # [3 2] - # [3 2]] - - print(out_z2.numpy()) - #[[ 400 300] - # [ 800 700] - # [1200 1100]] + + paddle.disable_static() + x = paddle.to_tensor([[1.0, 2.0, 3.0, 4.0], + [5.0, 6.0, 7.0, 8.0], + [9.0, 10.0, 11.0, 12.0]], dtype='float32') + index = paddle.to_tensor([[0, 1, 2], + [1, 2, 3], + [0, 0, 0]], dtype='int32') + target = paddle.to_tensor([[100, 200, 300, 400], + [500, 600, 700, 800], + [900, 1000, 1100, 1200]], dtype='int32') + out_z1 = paddle.index_sample(x, index) + print(out_z1.numpy()) + #[[1. 2. 3.] + # [6. 7. 8.] + # [9. 9. 9.]] + + # Use the index of the maximum value by topk op + # get the value of the element of the corresponding index in other tensors + top_value, top_index = paddle.topk(x, k=2) + out_z2 = paddle.index_sample(target, top_index) + print(top_value.numpy()) + #[[ 4. 3.] + # [ 8. 7.] + # [12. 
11.]] + + print(top_index.numpy()) + #[[3 2] + # [3 2] + # [3 2]] + + print(out_z2.numpy()) + #[[ 400 300] + # [ 800 700] + # [1200 1100]] """ @@ -698,27 +678,20 @@ def masked_select(x, mask, name=None): Returns: A 1-D Tensor which is the same data type as ``x``. - Raises: - TypeError: ``x`` must be a Tensor and the data type of ``x`` must be one of float32, float64, int32 and int64. - TypeError: ``mask`` must be a Tensor and the data type of ``mask`` must be bool. - Examples: .. code-block:: python import paddle - import numpy as np - + paddle.disable_static() - data = np.array([[1.0, 2.0, 3.0, 4.0], - [5.0, 6.0, 7.0, 8.0], - [9.0, 10.0, 11.0, 12.0]]).astype('float32') - - mask_data = np.array([[True, False, False, False], - [True, True, False, False], - [True, False, False, False]]).astype('bool') - x = paddle.to_tensor(data) - mask = paddle.to_tensor(mask_data) + + x = paddle.to_tensor([[1.0, 2.0, 3.0, 4.0], + [5.0, 6.0, 7.0, 8.0], + [9.0, 10.0, 11.0, 12.0]]) + mask = paddle.to_tensor([[True, False, False, False], + [True, True, False, False], + [True, False, False, False]]) out = paddle.masked_select(x, mask) #[1.0 5.0 6.0 9.0] """ @@ -763,20 +736,17 @@ def topk(x, k, axis=None, largest=True, sorted=True, name=None): .. 
code-block:: python - import numpy as np import paddle paddle.disable_static() - data_1 = np.array([1, 4, 5, 7]) - tensor_1 = paddle.to_tensor(data_1) + tensor_1 = paddle.to_tensor([1, 4, 5, 7]) value_1, indices_1 = paddle.topk(tensor_1, k=1) print(value_1.numpy()) # [7] print(indices_1.numpy()) # [3] - data_2 = np.array([[1, 4, 5, 7], [2, 6, 2, 5]]) - tensor_2 = paddle.to_tensor(data_2) + tensor_2 = paddle.to_tensor([[1, 4, 5, 7], [2, 6, 2, 5]]) value_2, indices_2 = paddle.topk(tensor_2, k=1) print(value_2.numpy()) # [[7] diff --git a/python/paddle/tensor/stat.py b/python/paddle/tensor/stat.py index 91676a6316b81a1998b9b48fb9ea7fcba6d67c25..d56dff5a81018e13e1c186f66172f868b0c4074b 100644 --- a/python/paddle/tensor/stat.py +++ b/python/paddle/tensor/stat.py @@ -237,10 +237,6 @@ def numel(x, name=None): Returns: Tensor: The number of elements for the input Tensor. - - Raises: - TypeError: ``x`` must be a Tensor and the data type of ``x`` must be one of bool, float16, float32, float64, int32, int64. - Examples: .. 
code-block:: python diff --git a/python/paddle/tests/test_dataset_cifar.py b/python/paddle/tests/test_dataset_cifar.py index 2ecc41c3f0a81a56cc34e826483ea4f5cc6681d9..672de7ae8e94eceded92dfa0e77621eedac0e3b0 100644 --- a/python/paddle/tests/test_dataset_cifar.py +++ b/python/paddle/tests/test_dataset_cifar.py @@ -27,8 +27,10 @@ class TestCifar10Train(unittest.TestCase): # long time, randomly check 1 sample idx = np.random.randint(0, 50000) data, label = cifar[idx] - self.assertTrue(len(data.shape) == 1) - self.assertTrue(data.shape[0] == 3072) + self.assertTrue(len(data.shape) == 3) + self.assertTrue(data.shape[0] == 3) + self.assertTrue(data.shape[1] == 32) + self.assertTrue(data.shape[2] == 32) self.assertTrue(0 <= int(label) <= 9) @@ -41,8 +43,10 @@ class TestCifar10Test(unittest.TestCase): # long time, randomly check 1 sample idx = np.random.randint(0, 10000) data, label = cifar[idx] - self.assertTrue(len(data.shape) == 1) - self.assertTrue(data.shape[0] == 3072) + self.assertTrue(len(data.shape) == 3) + self.assertTrue(data.shape[0] == 3) + self.assertTrue(data.shape[1] == 32) + self.assertTrue(data.shape[2] == 32) self.assertTrue(0 <= int(label) <= 9) @@ -55,8 +59,10 @@ class TestCifar100Train(unittest.TestCase): # long time, randomly check 1 sample idx = np.random.randint(0, 50000) data, label = cifar[idx] - self.assertTrue(len(data.shape) == 1) - self.assertTrue(data.shape[0] == 3072) + self.assertTrue(len(data.shape) == 3) + self.assertTrue(data.shape[0] == 3) + self.assertTrue(data.shape[1] == 32) + self.assertTrue(data.shape[2] == 32) self.assertTrue(0 <= int(label) <= 99) @@ -69,8 +75,10 @@ class TestCifar100Test(unittest.TestCase): # long time, randomly check 1 sample idx = np.random.randint(0, 10000) data, label = cifar[idx] - self.assertTrue(len(data.shape) == 1) - self.assertTrue(data.shape[0] == 3072) + self.assertTrue(len(data.shape) == 3) + self.assertTrue(data.shape[0] == 3) + self.assertTrue(data.shape[1] == 32) + self.assertTrue(data.shape[2] 
== 32) self.assertTrue(0 <= int(label) <= 99) diff --git a/python/paddle/tests/test_datasets.py b/python/paddle/tests/test_datasets.py index 1e50ff60aa5c3039c21d6e1e3a714c32000462c7..1e0d6dbacf6c4c5a781aaa40440921fe1a281ca9 100644 --- a/python/paddle/tests/test_datasets.py +++ b/python/paddle/tests/test_datasets.py @@ -103,12 +103,14 @@ class TestMNISTTest(unittest.TestCase): class TestMNISTTrain(unittest.TestCase): def test_main(self): - mnist = MNIST(mode='train', chw_format=False) + mnist = MNIST(mode='train') self.assertTrue(len(mnist) == 60000) for i in range(len(mnist)): image, label = mnist[i] - self.assertTrue(image.shape[0] == 784) + self.assertTrue(image.shape[0] == 1) + self.assertTrue(image.shape[1] == 28) + self.assertTrue(image.shape[2] == 28) self.assertTrue(label.shape[0] == 1) self.assertTrue(0 <= int(label) <= 9) diff --git a/python/paddle/tests/test_dist_hapi_model.py b/python/paddle/tests/test_dist_hapi_model.py index e75e08e3749e6ce629e88c486e4f87d9109dc709..db5b63c5ae0e29fa6f1274befd277c4e46c3a1b1 100644 --- a/python/paddle/tests/test_dist_hapi_model.py +++ b/python/paddle/tests/test_dist_hapi_model.py @@ -37,7 +37,11 @@ def get_cluster_from_args(selected_gpus): free_ports = find_free_ports(len(selected_gpus)) if free_ports is not None: free_ports = list(free_ports) - return get_cluster(node_ips, node_ip, free_ports, selected_gpus) + + trainer_endpoints = [] + for ip in node_ips: + trainer_endpoints.append(["%s:%d" % (ip, port) for port in free_ports]) + return get_cluster(node_ips, node_ip, trainer_endpoints, selected_gpus) def get_gpus(selected_gpus): diff --git a/python/paddle/tests/test_model.py b/python/paddle/tests/test_model.py index 7b79b25cbc3e98b802bad87386ad0572ec6ab8d7..62cc39c1f7b5303d98bffd9eb5814d4579a6d3f1 100644 --- a/python/paddle/tests/test_model.py +++ b/python/paddle/tests/test_model.py @@ -67,35 +67,6 @@ class LeNetDygraph(paddle.nn.Layer): return x -class LeNetDeclarative(fluid.dygraph.Layer): - def __init__(self, 
num_classes=10, classifier_activation=None): - super(LeNetDeclarative, self).__init__() - self.num_classes = num_classes - self.features = Sequential( - Conv2d( - 1, 6, 3, stride=1, padding=1), - ReLU(), - Pool2D(2, 'max', 2), - Conv2d( - 6, 16, 5, stride=1, padding=0), - ReLU(), - Pool2D(2, 'max', 2)) - - if num_classes > 0: - self.fc = Sequential( - Linear(400, 120), Linear(120, 84), Linear(84, 10), - Softmax()) #Todo: accept any activation - - @declarative - def forward(self, inputs): - x = self.features(inputs) - - if self.num_classes > 0: - x = fluid.layers.flatten(x, 1) - x = self.fc(x) - return x - - class MnistDataset(MNIST): def __init__(self, mode, return_label=True, sample_num=None): super(MnistDataset, self).__init__(mode=mode) @@ -416,12 +387,37 @@ class TestModelFunction(unittest.TestCase): shutil.rmtree(path) fluid.disable_dygraph() if dynamic else None + def test_dynamic_load(self): + mnist_data = MnistDataset(mode='train') + for new_optimizer in [True, False]: + path = tempfile.mkdtemp() + paddle.disable_static() + net = LeNet() + inputs = [InputSpec([None, 1, 28, 28], 'float32', 'x')] + labels = [InputSpec([None, 1], 'int64', 'label')] + if new_optimizer: + optim = paddle.optimizer.Adam( + learning_rate=0.001, parameters=net.parameters()) + else: + optim = fluid.optimizer.Adam( + learning_rate=0.001, parameter_list=net.parameters()) + model = Model(net, inputs, labels) + model.prepare( + optimizer=optim, loss=CrossEntropyLoss(reduction="sum")) + model.fit(mnist_data, batch_size=64, verbose=0) + model.save(path + '/test') + model.load(path + '/test') + shutil.rmtree(path) + paddle.enable_static() + def test_dynamic_save_static_load(self): path = tempfile.mkdtemp() # dynamic saving device = paddle.set_device('cpu') fluid.enable_dygraph(device) - model = Model(MyModel(classifier_activation=None)) + inputs = [InputSpec([None, 20], 'float32', 'x')] + labels = [InputSpec([None, 1], 'int64', 'label')] + model = Model(MyModel(classifier_activation=None), 
inputs, labels) optim = fluid.optimizer.SGD(learning_rate=0.001, parameter_list=model.parameters()) model.prepare(optimizer=optim, loss=CrossEntropyLoss(reduction="sum")) @@ -476,13 +472,54 @@ class TestModelFunction(unittest.TestCase): self.assertTrue(params[0].shape[1] == 10) fluid.disable_dygraph() if dynamic else None + def test_summary(self): + def _get_param_from_state_dict(state_dict): + params = 0 + for k, v in state_dict.items(): + params += np.prod(v.numpy().shape) + return params + + for dynamic in [True, False]: + device = paddle.set_device('cpu') + fluid.enable_dygraph(device) if dynamic else None + net = MyModel() + inputs = [InputSpec([None, 20], 'float32', 'x')] + model = Model(net, inputs) + model.prepare() + params_info = model.summary() + gt_params = _get_param_from_state_dict(net.state_dict()) + + np.testing.assert_allclose(params_info['total_params'], gt_params) + print(params_info) + + model.summary(input_size=(20)) + model.summary(input_size=[(20)]) + model.summary(input_size=(20), batch_size=2) + + def test_summary_nlp(self): + paddle.enable_static() + nlp_net = paddle.nn.GRU(input_size=2, hidden_size=3, num_layers=3) + paddle.summary(nlp_net, (1, 2)) + + def test_summary_error(self): + with self.assertRaises(TypeError): + nlp_net = paddle.nn.GRU(input_size=2, hidden_size=3, num_layers=3) + paddle.summary(nlp_net, (1, '2')) + + with self.assertRaises(ValueError): + nlp_net = paddle.nn.GRU(input_size=2, hidden_size=3, num_layers=3) + paddle.summary(nlp_net, (-1, -1)) + + paddle.disable_static() + nlp_net = paddle.nn.GRU(input_size=2, hidden_size=3, num_layers=3) + paddle.summary(nlp_net, (1, 2)) + def test_export_deploy_model(self): for dynamic in [True, False]: - fluid.enable_dygraph() if dynamic else None - # paddle.disable_static() if dynamic else None + paddle.disable_static() if dynamic else None prog_translator = ProgramTranslator() prog_translator.enable(False) if not dynamic else None - net = LeNetDeclarative() + net = LeNet() inputs 
= [InputSpec([None, 1, 28, 28], 'float32', 'x')] model = Model(net, inputs) model.prepare() @@ -491,8 +528,9 @@ class TestModelFunction(unittest.TestCase): os.makedirs(save_dir) tensor_img = np.array( np.random.random((1, 1, 28, 28)), dtype=np.float32) - ori_results = model.test_batch(tensor_img) + model.save(save_dir, training=False) + ori_results = model.test_batch(tensor_img) fluid.disable_dygraph() if dynamic else None place = fluid.CPUPlace() if not fluid.is_compiled_with_cuda( @@ -509,6 +547,7 @@ class TestModelFunction(unittest.TestCase): np.testing.assert_allclose( results, ori_results, rtol=1e-5, atol=1e-7) shutil.rmtree(save_dir) + paddle.enable_static() class TestRaiseError(unittest.TestCase): @@ -520,6 +559,14 @@ class TestRaiseError(unittest.TestCase): with self.assertRaises(ValueError): model = Model(net, inputs, labels) + def test_input_without_input_spec(self): + for dynamic in [True, False]: + paddle.disable_static() if dynamic else None + net = MyModel(classifier_activation=None) + with self.assertRaises(TypeError): + model = Model(net) + paddle.enable_static() + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/tests/test_text.py b/python/paddle/tests/test_text.py index 43968896c18bda6445de46773899128e1bedff53..fa83b0cc6f3408bb8fdf33522b17664e35b8f503 100644 --- a/python/paddle/tests/test_text.py +++ b/python/paddle/tests/test_text.py @@ -28,6 +28,8 @@ from paddle import Model, set_device from paddle.static import InputSpec as Input from paddle.text import * +paddle.enable_static() + class ModuleApiTest(unittest.TestCase): @classmethod diff --git a/python/paddle/text/datasets/uci_housing.py b/python/paddle/text/datasets/uci_housing.py index a0d465eb1775431ffa0527dfae8031bebd6fc340..a8dfbc44a97127dd074ef5cbfc727aa535d56872 100644 --- a/python/paddle/text/datasets/uci_housing.py +++ b/python/paddle/text/datasets/uci_housing.py @@ -17,6 +17,7 @@ from __future__ import print_function import six import numpy as np +import paddle 
from paddle.io import Dataset from paddle.dataset.common import _check_exists_and_download @@ -88,6 +89,8 @@ class UCIHousing(Dataset): # read dataset into memory self._load_data() + self.dtype = paddle.get_default_dtype() + def _load_data(self, feature_num=14, ratio=0.8): data = np.fromfile(self.data_file, sep=' ') data = data.reshape(data.shape[0] // feature_num, feature_num) @@ -103,7 +106,8 @@ class UCIHousing(Dataset): def __getitem__(self, idx): data = self.data[idx] - return np.array(data[:-1]), np.array(data[-1:]) + return np.array(data[:-1]).astype(self.dtype), \ + np.array(data[-1:]).astype(self.dtype) def __len__(self): return len(self.data) diff --git a/python/paddle/utils/__init__.py b/python/paddle/utils/__init__.py index f6299980b3e5c0bd0c7551b6b51c9b067d7960b5..77f5ef7e9661e0ff8f4f200ff7e789ba475459c2 100644 --- a/python/paddle/utils/__init__.py +++ b/python/paddle/utils/__init__.py @@ -12,16 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .plot import Ploter from .profiler import ProfilerOptions from .profiler import Profiler from .profiler import get_profiler from .deprecated import deprecated +from .lazy_import import try_import +from ..fluid.framework import unique_name +from ..fluid.framework import load_op_library +from ..fluid.framework import require_version + from . 
import download -__all__ = ['dump_config', 'Ploter', 'deprecated', 'download'] +__all__ = ['dump_config', 'deprecated', 'download'] #TODO: define new api under this directory -# __all__ = ['unique_name', -# 'load_op_library', -# 'require_version'] +__all__ += ['unique_name', 'load_op_library', 'require_version'] diff --git a/python/paddle/utils/deprecated.py b/python/paddle/utils/deprecated.py index 08fd7e33479b331454f63f05f6240dd221591ee9..d4e21748b55326468edb2ba1e46114e8d66c0046 100644 --- a/python/paddle/utils/deprecated.py +++ b/python/paddle/utils/deprecated.py @@ -45,7 +45,7 @@ def deprecated(update_to="", since="", reason=""): def decorator(func): # TODO(zhiqiu): We temporally disable the warnings for 2.0-bata, and it should be re-enabled in the future. - return func + # return func """construct warning message, and return a decorated function or class.""" assert isinstance(update_to, str), 'type of "update_to" must be str.' assert isinstance(since, str), 'type of "since" must be str.' @@ -56,9 +56,10 @@ def deprecated(update_to="", since="", reason=""): _reason = reason.strip() msg = 'API "{}.{}" is deprecated'.format(func.__module__, func.__name__) + if len(_since) > 0: msg += " since {}".format(_since) - msg += ", and may be removed in future versions." + msg += ", and will be removed in future versions." if len(_update_to) > 0: assert _update_to.startswith( "paddle." @@ -67,6 +68,11 @@ def deprecated(update_to="", since="", reason=""): msg += ' Please use "{}" instead.'.format(_update_to) if len(_reason) > 0: msg += "\n reason: {}".format(_reason) + if func.__doc__: + func.__doc__ = ('\n\nWarning: ' + msg + '\n') + func.__doc__ + # TODO(Joejiong) Early returning the wrapper function, currently we disable the warning wrapper, + # because the 2.0beta APIs are still under development, we will restore the warning functionality when 2.0 rc APIs become stable. 
+ return func @functools.wraps(func) def wrapper(*args, **kwargs): @@ -75,6 +81,7 @@ def deprecated(update_to="", since="", reason=""): 2. since version is empty, in this case, API is deprecated in all versions. 3. current version is newer than since version. """ + msg = "\033[93mWarning %s \033[0m" % (msg) v_current = [int(i) for i in paddle.__version__.split(".")] v_current += [0] * (4 - len(v_current)) v_since = [int(i) for i in _since.split(".")] diff --git a/python/paddle/utils/lazy_import.py b/python/paddle/utils/lazy_import.py new file mode 100644 index 0000000000000000000000000000000000000000..69a32b77a8f3da7a5a3432b4c0cbf746bd0b0833 --- /dev/null +++ b/python/paddle/utils/lazy_import.py @@ -0,0 +1,34 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Lazy imports for heavy dependencies.""" + +import importlib + + +def try_import(module_name): + """Try importing a module, with an informative error message on failure.""" + install_name = module_name + if module_name == 'cv2': + install_name = 'opencv-python' + + try: + mod = importlib.import_module(module_name) + return mod + except ImportError: + err_msg = ( + "Failed importing {}. This likely means that some paddle modules " + "requires additional dependencies that have to be " + "manually installed (usually with `pip install {}`). 
").format( + module_name, install_name) + raise ImportError(err_msg) diff --git a/python/paddle/utils/plot.py b/python/paddle/utils/plot.py deleted file mode 100644 index ee651f2f0cd6f2e594a4e74c896baa924f70bbf5..0000000000000000000000000000000000000000 --- a/python/paddle/utils/plot.py +++ /dev/null @@ -1,116 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import six - - -class PlotData(object): - def __init__(self): - self.step = [] - self.value = [] - - def append(self, step, value): - self.step.append(step) - self.value.append(value) - - def reset(self): - self.step = [] - self.value = [] - - -class Ploter(object): - """ - Plot input data in a 2D graph - - Args: - title: assign the title of input data. - step: x_axis of the data. - value: y_axis of the data. - """ - - def __init__(self, *args): - self.__args__ = args - self.__plot_data__ = {} - for title in args: - self.__plot_data__[title] = PlotData() - # demo in notebooks will use Ploter to plot figure, but when we convert - # the ipydb to py file for testing, the import of matplotlib will make the - # script crash. 
So we can use `export DISABLE_PLOT=True` to disable import - # these libs - self.__disable_plot__ = os.environ.get("DISABLE_PLOT") - if not self.__plot_is_disabled__(): - import matplotlib.pyplot as plt - from IPython import display - self.plt = plt - self.display = display - - def __plot_is_disabled__(self): - return self.__disable_plot__ == "True" - - def append(self, title, step, value): - """ - Feed data - - Args: - title: assign the group data to this subtitle. - step: the x_axis of data. - value: the y_axis of data. - - Examples: - .. code-block:: python - plot_curve = Ploter("Curve 1","Curve 2") - plot_curve.append(title="Curve 1",step=1,value=1) - """ - assert isinstance(title, six.string_types) - assert title in self.__plot_data__ - data = self.__plot_data__[title] - assert isinstance(data, PlotData) - data.append(step, value) - - def plot(self, path=None): - """ - Plot data in a 2D graph - - Args: - path: store the figure to this file path. Defaul None. - - Examples: - .. code-block:: python - plot_curve = Ploter() - plot_cure.plot() - """ - if self.__plot_is_disabled__(): - return - - titles = [] - for title in self.__args__: - data = self.__plot_data__[title] - assert isinstance(data, PlotData) - if len(data.step) > 0: - titles.append(title) - self.plt.plot(data.step, data.value) - self.plt.legend(titles, loc='upper left') - if path is None: - self.display.clear_output(wait=True) - self.display.display(self.plt.gcf()) - else: - self.plt.savefig(path) - self.plt.gcf().clear() - - def reset(self): - for key in self.__plot_data__: - data = self.__plot_data__[key] - assert isinstance(data, PlotData) - data.reset() diff --git a/python/paddle/vision/datasets/cifar.py b/python/paddle/vision/datasets/cifar.py index 1193be26da56780058beadfe15640bc76533114a..c531f3d0e4e3d276d9831b2ac868af9b0761107d 100644 --- a/python/paddle/vision/datasets/cifar.py +++ b/python/paddle/vision/datasets/cifar.py @@ -19,6 +19,7 @@ import numpy as np import six from six.moves import 
cPickle as pickle +import paddle from paddle.io import Dataset from paddle.dataset.common import _check_exists_and_download @@ -113,6 +114,8 @@ class Cifar10(Dataset): # read dataset into memory self._load_data() + self.dtype = paddle.get_default_dtype() + def _init_url_md5_flag(self): self.data_url = CIFAR10_URL self.data_md5 = CIFAR10_MD5 @@ -139,9 +142,10 @@ class Cifar10(Dataset): def __getitem__(self, idx): image, label = self.data[idx] + image = np.reshape(image, [3, 32, 32]) if self.transform is not None: image = self.transform(image) - return image, label + return image.astype(self.dtype), np.array(label).astype('int64') def __len__(self): return len(self.data) diff --git a/python/paddle/vision/datasets/flowers.py b/python/paddle/vision/datasets/flowers.py index 1c0f41123e2313d9db6f5e846d133ecdebc7f1af..2251333fd8d281bd07402fbbf3a05fea47a69cce 100644 --- a/python/paddle/vision/datasets/flowers.py +++ b/python/paddle/vision/datasets/flowers.py @@ -21,6 +21,7 @@ import numpy as np import scipy.io as scio from PIL import Image +import paddle from paddle.io import Dataset from paddle.dataset.common import _check_exists_and_download @@ -104,6 +105,8 @@ class Flowers(Dataset): # read dataset into memory self._load_anno() + self.dtype = paddle.get_default_dtype() + def _load_anno(self): self.name2mem = {} self.data_tar = tarfile.open(self.data_file) @@ -124,7 +127,7 @@ class Flowers(Dataset): if self.transform is not None: image = self.transform(image) - return image, label.astype('int64') + return image.astype(self.dtype), label.astype('int64') def __len__(self): return len(self.indexes) diff --git a/python/paddle/vision/datasets/folder.py b/python/paddle/vision/datasets/folder.py index 725fd9acafbab7b6adaf07139d02da8e2c9aaada..19d913504bdf7b09de9d888c0caa5cc1c049ac57 100644 --- a/python/paddle/vision/datasets/folder.py +++ b/python/paddle/vision/datasets/folder.py @@ -14,9 +14,10 @@ import os import sys -import cv2 +import paddle from paddle.io import Dataset 
+from paddle.utils import try_import __all__ = ["DatasetFolder", "ImageFolder"] @@ -143,6 +144,8 @@ class DatasetFolder(Dataset): self.samples = samples self.targets = [s[1] for s in samples] + self.dtype = paddle.get_default_dtype() + def _find_classes(self, dir): """ Finds the class folders in a dataset. @@ -191,6 +194,7 @@ IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif', def cv2_loader(path): + cv2 = try_import('cv2') return cv2.imread(path) diff --git a/python/paddle/vision/datasets/mnist.py b/python/paddle/vision/datasets/mnist.py index a98561333921d182c0b3a3f486c90a94e79b6a3d..16c39e56ef0d65ba89bb611c62e0e957b840a826 100644 --- a/python/paddle/vision/datasets/mnist.py +++ b/python/paddle/vision/datasets/mnist.py @@ -19,6 +19,7 @@ import gzip import struct import numpy as np +import paddle from paddle.io import Dataset from paddle.dataset.common import _check_exists_and_download @@ -44,8 +45,6 @@ class MNIST(Dataset): :attr:`download` is True. Default None label_path(str): path to label file, can be set None if :attr:`download` is True. Default None - chw_format(bool): If set True, the output shape is [1, 28, 28], - otherwise, output shape is [1, 784]. Default True. mode(str): 'train' or 'test' mode. Default 'train'. download(bool): whether to download dataset automatically if :attr:`image_path` :attr:`label_path` is not set. 
Default True @@ -70,14 +69,12 @@ class MNIST(Dataset): def __init__(self, image_path=None, label_path=None, - chw_format=True, mode='train', transform=None, download=True): assert mode.lower() in ['train', 'test'], \ "mode should be 'train' or 'test', but got {}".format(mode) self.mode = mode.lower() - self.chw_format = chw_format self.image_path = image_path if self.image_path is None: assert download, "image_path is not set and downloading automatically is disabled" @@ -99,6 +96,8 @@ class MNIST(Dataset): # read dataset into memory self._parse_dataset() + self.dtype = paddle.get_default_dtype() + def _parse_dataset(self, buffer_size=100): self.images = [] self.labels = [] @@ -139,10 +138,6 @@ class MNIST(Dataset): cols)).astype('float32') offset_img += struct.calcsize(fmt_images) - images = images / 255.0 - images = images * 2.0 - images = images - 1.0 - for i in range(buffer_size): self.images.append(images[i, :]) self.labels.append( @@ -150,11 +145,10 @@ class MNIST(Dataset): def __getitem__(self, idx): image, label = self.images[idx], self.labels[idx] - if self.chw_format: - image = np.reshape(image, [1, 28, 28]) + image = np.reshape(image, [1, 28, 28]) if self.transform is not None: image = self.transform(image) - return image, label + return image.astype(self.dtype), label.astype('int64') def __len__(self): return len(self.labels) diff --git a/python/paddle/vision/datasets/voc2012.py b/python/paddle/vision/datasets/voc2012.py index ae14ea3016363c828d17ba34aca8e1a6663ecf76..5fc9d7c38153e5d8c10da5275f3bb11164b12e54 100644 --- a/python/paddle/vision/datasets/voc2012.py +++ b/python/paddle/vision/datasets/voc2012.py @@ -19,6 +19,7 @@ import tarfile import numpy as np from PIL import Image +import paddle from paddle.io import Dataset from paddle.dataset.common import _check_exists_and_download @@ -96,6 +97,8 @@ class VOC2012(Dataset): # read dataset into memory self._load_anno() + self.dtype = paddle.get_default_dtype() + def _load_anno(self): self.name2mem = {} 
self.data_tar = tarfile.open(self.data_file) @@ -127,7 +130,7 @@ class VOC2012(Dataset): label = np.array(label) if self.transform is not None: data = self.transform(data) - return data, label + return data.astype(self.dtype), label.astype(self.dtype) def __len__(self): return len(self.data) diff --git a/python/paddle/vision/transforms/functional.py b/python/paddle/vision/transforms/functional.py index b5668fa8c7d6812664512a58faf836b5d9f09300..acceb111e6f84bc444dcfc7427653f77e3eedd3a 100644 --- a/python/paddle/vision/transforms/functional.py +++ b/python/paddle/vision/transforms/functional.py @@ -18,10 +18,11 @@ import random import math import functools -import cv2 import numbers import numpy as np +from paddle.utils import try_import + if sys.version_info < (3, 3): Sequence = collections.Sequence Iterable = collections.Iterable @@ -54,8 +55,8 @@ def flip(image, code): Accordding to the code (the type of flip), flip the input image Args: - image: Input image, with (H, W, C) shape - code: Code that indicates the type of flip. + image (np.ndarray): Input image, with (H, W, C) shape + code (int): Code that indicates the type of flip. -1 : Flip horizontally and vertically 0 : Flip vertically 1 : Flip horizontally @@ -77,18 +78,28 @@ def flip(image, code): # flip horizontally F.flip(fake_img, 1) """ + cv2 = try_import('cv2') return cv2.flip(image, flipCode=code) @keepdims -def resize(img, size, interpolation=cv2.INTER_LINEAR): +def resize(img, size, interpolation=1): """ resize the input data to given size Args: - input: Input data, could be image or masks, with (H, W, C) shape - size: Target size of input data, with (height, width) shape. - interpolation: Interpolation method. + input (np.ndarray): Input data, could be image or masks, with (H, W, C) shape + size (int|list|tuple): Target size of input data, with (height, width) shape. + interpolation (int, optional): Interpolation method. 
+ 0 : cv2.INTER_NEAREST + 1 : cv2.INTER_LINEAR + 2 : cv2.INTER_CUBIC + 3 : cv2.INTER_AREA + 4 : cv2.INTER_LANCZOS4 + 5 : cv2.INTER_LINEAR_EXACT + 7 : cv2.INTER_MAX + 8 : cv2.WARP_FILL_OUTLIERS + 16: cv2.WARP_INVERSE_MAP Examples: .. code-block:: python @@ -102,7 +113,7 @@ def resize(img, size, interpolation=cv2.INTER_LINEAR): F.resize(fake_img, (200, 150)) """ - + cv2 = try_import('cv2') if isinstance(interpolation, Sequence): interpolation = random.choice(interpolation) @@ -179,6 +190,8 @@ def pad(img, padding, fill=(0, 0, 0), padding_mode='constant'): assert padding_mode in ['constant', 'edge', 'reflect', 'symmetric'], \ 'Expected padding mode be either constant, edge, reflect or symmetric, but got {}'.format(padding_mode) + cv2 = try_import('cv2') + PAD_MOD = { 'constant': cv2.BORDER_CONSTANT, 'edge': cv2.BORDER_REPLICATE, @@ -214,18 +227,22 @@ def pad(img, padding, fill=(0, 0, 0), padding_mode='constant'): @keepdims -def rotate(img, - angle, - interpolation=cv2.INTER_LINEAR, - expand=False, - center=None): +def rotate(img, angle, interpolation=1, expand=False, center=None): """Rotates the image by angle. Args: img (numpy.ndarray): Image to be rotated. angle (float|int): In degrees clockwise order. - interpolation (int, optional): - interpolation: Interpolation method. + interpolation (int, optional): Interpolation method. Default: 1. + 0 : cv2.INTER_NEAREST + 1 : cv2.INTER_LINEAR + 2 : cv2.INTER_CUBIC + 3 : cv2.INTER_AREA + 4 : cv2.INTER_LANCZOS4 + 5 : cv2.INTER_LINEAR_EXACT + 7 : cv2.INTER_MAX + 8 : cv2.WARP_FILL_OUTLIERS + 16: cv2.WARP_INVERSE_MAP expand (bool|optional): Optional expansion flag. If true, expands the output image to make it large enough to hold the entire rotated image. If false or omitted, make the output image the same size as the input image. 
@@ -250,8 +267,9 @@ def rotate(img, fake_img = rotate(fake_img, 10) print(fake_img.shape) """ - dtype = img.dtype + cv2 = try_import('cv2') + dtype = img.dtype h, w, _ = img.shape point = center or (w / 2, h / 2) M = cv2.getRotationMatrix2D(point, angle=-angle, scale=1) @@ -312,6 +330,7 @@ def to_grayscale(img, num_output_channels=1): fake_img = to_grayscale(fake_img) print(fake_img.shape) """ + cv2 = try_import('cv2') if num_output_channels == 1: img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY) diff --git a/python/paddle/vision/transforms/transforms.py b/python/paddle/vision/transforms/transforms.py index 14809e0c1acaa1b6d5a494e6e3df1801e1c8f61b..9ea828271765cb8976909724bbf6079aa02c2460 100644 --- a/python/paddle/vision/transforms/transforms.py +++ b/python/paddle/vision/transforms/transforms.py @@ -17,7 +17,6 @@ from __future__ import division import math import sys import random -import cv2 import numpy as np import numbers @@ -26,6 +25,7 @@ import collections import warnings import traceback +from paddle.utils import try_import from . import functional as F if sys.version_info < (3, 3): @@ -214,7 +214,16 @@ class Resize(object): smaller edge of the image will be matched to this number. i.e, if height > width, then image will be rescaled to (size * height / width, size) - interpolation (int): Interpolation mode of resize. Default: cv2.INTER_LINEAR. + interpolation (int, optional): Interpolation mode of resize. Default: 1. 
+ 0 : cv2.INTER_NEAREST + 1 : cv2.INTER_LINEAR + 2 : cv2.INTER_CUBIC + 3 : cv2.INTER_AREA + 4 : cv2.INTER_LANCZOS4 + 5 : cv2.INTER_LINEAR_EXACT + 7 : cv2.INTER_MAX + 8 : cv2.WARP_FILL_OUTLIERS + 16: cv2.WARP_INVERSE_MAP Examples: @@ -232,7 +241,7 @@ class Resize(object): print(fake_img.shape) """ - def __init__(self, size, interpolation=cv2.INTER_LINEAR): + def __init__(self, size, interpolation=1): assert isinstance(size, int) or (isinstance(size, Iterable) and len(size) == 2) self.size = size @@ -252,6 +261,16 @@ class RandomResizedCrop(object): output_size (int|list|tuple): Target size of output image, with (height, width) shape. scale (list|tuple): Range of size of the origin size cropped. Default: (0.08, 1.0) ratio (list|tuple): Range of aspect ratio of the origin aspect ratio cropped. Default: (0.75, 1.33) + interpolation (int, optional): Interpolation mode of resize. Default: 1. + 0 : cv2.INTER_NEAREST + 1 : cv2.INTER_LINEAR + 2 : cv2.INTER_CUBIC + 3 : cv2.INTER_AREA + 4 : cv2.INTER_LANCZOS4 + 5 : cv2.INTER_LINEAR_EXACT + 7 : cv2.INTER_MAX + 8 : cv2.WARP_FILL_OUTLIERS + 16: cv2.WARP_INVERSE_MAP Examples: @@ -273,7 +292,7 @@ class RandomResizedCrop(object): output_size, scale=(0.08, 1.0), ratio=(3. / 4, 4. / 3), - interpolation=cv2.INTER_LINEAR): + interpolation=1): if isinstance(output_size, int): self.output_size = (output_size, output_size) else: @@ -328,7 +347,16 @@ class CenterCropResize(object): Args: size (int|list|tuple): Target size of output image, with (height, width) shape. crop_padding (int): Center crop with the padding. Default: 32. - interpolation (int): Interpolation mode of resize. Default: cv2.INTER_LINEAR. + interpolation (int, optional): Interpolation mode of resize. Default: 1. 
+ 0 : cv2.INTER_NEAREST + 1 : cv2.INTER_LINEAR + 2 : cv2.INTER_CUBIC + 3 : cv2.INTER_AREA + 4 : cv2.INTER_LANCZOS4 + 5 : cv2.INTER_LINEAR_EXACT + 7 : cv2.INTER_MAX + 8 : cv2.WARP_FILL_OUTLIERS + 16: cv2.WARP_INVERSE_MAP Examples: @@ -346,7 +374,7 @@ class CenterCropResize(object): print(fake_img.shape) """ - def __init__(self, size, crop_padding=32, interpolation=cv2.INTER_LINEAR): + def __init__(self, size, crop_padding=32, interpolation=1): if isinstance(size, int): self.size = (size, size) else: @@ -661,6 +689,7 @@ class ContrastTransform(object): if self.value == 0: return img + cv2 = try_import('cv2') dtype = img.dtype img = img.astype(np.float32) alpha = np.random.uniform(max(0, 1 - self.value), 1 + self.value) @@ -701,6 +730,8 @@ class SaturationTransform(object): if self.value == 0: return img + cv2 = try_import('cv2') + dtype = img.dtype img = img.astype(np.float32) alpha = np.random.uniform(max(0, 1 - self.value), 1 + self.value) @@ -742,6 +773,7 @@ class HueTransform(object): if self.value == 0: return img + cv2 = try_import('cv2') dtype = img.dtype img = img.astype(np.uint8) hsv_img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV_FULL) @@ -1036,7 +1068,16 @@ class RandomRotate(object): degrees (sequence or float or int): Range of degrees to select from. If degrees is a number instead of sequence like (min, max), the range of degrees will be (-degrees, +degrees) clockwise order. - interpolation (int|optional): Interpolation mode of resize. Default: cv2.INTER_LINEAR. + interpolation (int, optional): Interpolation mode of resize. Default: 1. + 0 : cv2.INTER_NEAREST + 1 : cv2.INTER_LINEAR + 2 : cv2.INTER_CUBIC + 3 : cv2.INTER_AREA + 4 : cv2.INTER_LANCZOS4 + 5 : cv2.INTER_LINEAR_EXACT + 7 : cv2.INTER_MAX + 8 : cv2.WARP_FILL_OUTLIERS + 16: cv2.WARP_INVERSE_MAP expand (bool|optional): Optional expansion flag. Default: False. If true, expands the output to make it large enough to hold the entire rotated image. 
If false or omitted, make the output image the same size as the input image. @@ -1061,11 +1102,7 @@ class RandomRotate(object): print(fake_img.shape) """ - def __init__(self, - degrees, - interpolation=cv2.INTER_LINEAR, - expand=False, - center=None): + def __init__(self, degrees, interpolation=1, expand=False, center=None): if isinstance(degrees, numbers.Number): if degrees < 0: raise ValueError( diff --git a/python/requirements.txt b/python/requirements.txt index e278a1b824cc3829f1b67bc3a0cf643840990bb9..6a88d61a94c182db3320e4de30f45be4c852f70f 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -1,4 +1,3 @@ -opencv-python<=4.2.0.32 requests>=2.20.0 numpy>=1.13, <=1.16.4 ; python_version<"3.5" numpy>=1.13 ; python_version>="3.5" @@ -13,12 +12,10 @@ scipy ; python_version>"3.5" nltk ; python_version>="3.5" rarfile Pillow -graphviz six decorator prettytable -objgraph astor pathlib -netifaces -psutil +netifaces ; platform_system != "Windows" +netifaces ; python_version>="3.5" and platform_system == "Windows" diff --git a/python/setup.py.in b/python/setup.py.in index 64ac2b9b9a4d210c59193e117c6000986bfb07a0..467c5cb86779b80e51794cf800226d64534e8676 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -155,7 +155,9 @@ packages=['paddle', 'paddle.distributed.fleet.utils', 'paddle.framework', 'paddle.jit', + 'paddle.inference', 'paddle.fluid', + 'paddle.fluid.inference', 'paddle.fluid.dygraph', 'paddle.fluid.dygraph.dygraph_to_static', 'paddle.fluid.dygraph.amp', @@ -236,9 +238,6 @@ if sys.version_info >= (3,7): setup_requires_tmp+=[setup_requires_i] setup_requires = setup_requires_tmp -if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']: - setup_requires+=['opencv-python'] - # the prefix is sys.prefix which should always be usr paddle_bins = '' diff --git a/python/unittest_py/requirements.txt b/python/unittest_py/requirements.txt new file mode 100644 index 
0000000000000000000000000000000000000000..3b6a7546616678c21d4241da42f0965d4b85f2e6 --- /dev/null +++ b/python/unittest_py/requirements.txt @@ -0,0 +1,4 @@ +PyGithub +coverage +pycrypto +mock diff --git a/tools/check_api_approvals.sh b/tools/check_api_approvals.sh index b787ae625017d783a7221006ddd6867c21e238e8..943b8c01e8cc0c0e0a41e9b01951939f454c3181 100644 --- a/tools/check_api_approvals.sh +++ b/tools/check_api_approvals.sh @@ -39,9 +39,9 @@ fi api_spec_diff=`python ${PADDLE_ROOT}/tools/check_api_source_without_core_ops.py ${PADDLE_ROOT}/paddle/fluid/API_DEV.source.md5 ${PADDLE_ROOT}/paddle/fluid/API_PR.source.md5` if [ "$api_spec_diff" != "" ]; then + echo_line="${echo_line}Related APIs: ${api_spec_diff}\n" echo_line="You must have one RD (zhiqiu (Recommend) or phlrain) approval for the api change for the opreator-related api without 'core.ops'.\n" echo_line="${echo_line}For more details, please click [https://github.com/PaddlePaddle/Paddle/wiki/paddle_api_development_manual.md]\n" - echo_line="${echo_line}Related APIs: ${api_spec_diff}\n" check_approval 1 6888866 43953930 fi diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh index 1e5179d0282d7f35c4232d9b9783cb831e83f462..84254cc89bb8eef12a95189416cd29cce828f5ca 100644 --- a/tools/check_file_diff_approvals.sh +++ b/tools/check_file_diff_approvals.sh @@ -287,12 +287,19 @@ fi pip install PyGithub # For getting PR related data wget https://paddle-ci.gz.bcebos.com/blk/block.txt --no-check-certificate +wget https://sys-p0.bj.bcebos.com/bk-ci/bk.txt --no-check-certificate HASUTFIXED=`python ${PADDLE_ROOT}/tools/check_ut.py | grep "has unit-test to be fixed" || true` if [ "${HASUTFIXED}" != "" ]; then echo_line="${HASUTFIXED} You must have one RD (chalsliu (Recommend) or kolinwei) approval.\n" check_approval 1 45041955 22165420 fi +HASUTFIXED=`python ${PADDLE_ROOT}/tools/check_ut.py | grep "has benchmark issue to be fixed" || true` +if [ "${HASUTFIXED}" != "" ]; then + 
echo_line="${HASUTFIXED} You must have one RD (hysunflower or xiegegege or Xreki) approval.\n" + check_approval 1 52739577 46314656 12538138 +fi + if [ -n "${echo_list}" ];then echo "****************" echo -e "${echo_list[@]}" diff --git a/tools/check_ut.py b/tools/check_ut.py index 7b5e5a4f1c55574edc3f28dac76ebf1d932748d7..f5fe4c687dd7828f001ddbab744d66931e37f532 100644 --- a/tools/check_ut.py +++ b/tools/check_ut.py @@ -27,9 +27,12 @@ class PRChecker(object): self.github = Github(os.getenv('GITHUB_API_TOKEN'), timeout=60) self.repo = None - def check(self): - """ check pr. """ - filename = 'block.txt' + def check(self, filename, msg): + """ + Args: + filename (str): File to get block names. + msg (str): Error message. + """ pr_id = os.getenv('GIT_PR_ID') if not pr_id: print('No PR ID') @@ -44,12 +47,10 @@ class PRChecker(object): with open(filename) as f: for l in f: if l.rstrip('\r\n') == user: - print('{} has unit-test to be fixed, so CI failed.'.format( - user)) - exit(1) - exit(0) + print('{} {}'.format(user, msg)) if __name__ == '__main__': pr_checker = PRChecker() - pr_checker.check() + pr_checker.check('block.txt', 'has unit-test to be fixed, so CI failed.') + pr_checker.check('bk.txt', 'has benchmark issue to be fixed, so CI failed.') diff --git a/tools/coverage/coverage_diff.py b/tools/coverage/coverage_diff.py index 051348d358f459a67d39dd9ca798721a82aa2233..38f671fe4089d1f94caafaf26640e4df75870f55 100644 --- a/tools/coverage/coverage_diff.py +++ b/tools/coverage/coverage_diff.py @@ -90,12 +90,12 @@ def get_info_file_lines(info_file, diff_file): continue elif line.startswith('LF:'): - print 'LF:{}'.format(current_lf) + print('LF:{}'.format(current_lf)) continue elif line.startswith('LH:'): - print 'LH:{}'.format(current_lh) + print('LH:{}'.format(current_lh)) continue diff --git a/tools/coverage/coverage_diff_list.py b/tools/coverage/coverage_diff_list.py index 57222da4d9818ebbcb0b9aeea6937494038eecdf..8975185edadfbd567a428bbd90523923f5ab675d 100644 --- 
a/tools/coverage/coverage_diff_list.py +++ b/tools/coverage/coverage_diff_list.py @@ -40,7 +40,7 @@ def filter_by(list_file, max_rate): except: pass - print name, rate + print(name, rate) if __name__ == '__main__': diff --git a/tools/coverage/coverage_lines.py b/tools/coverage/coverage_lines.py index eb846cc9f249a3ff177fde437362a1122f4409a5..cdec5b8b1bb1873f8b9ef761e9d8575c89fee234 100644 --- a/tools/coverage/coverage_lines.py +++ b/tools/coverage/coverage_lines.py @@ -33,7 +33,7 @@ def get_lines(info_file): hits += 1 if total == 0: - print 'no data found' + print('no data found') exit() return hits / total @@ -47,17 +47,17 @@ if __name__ == '__main__': expected = float(sys.argv[2]) if not os.path.isfile(info_file): - print 'info file {} is not exists, ignored'.format(info_file) + print('info file {} is not exists, ignored'.format(info_file)) exit() actual = get_lines(info_file) actual = round(actual, 3) if actual < expected: - print 'expected >= {} %, actual {} %, failed'.format( - round(expected * 100, 1), round(actual * 100, 1)) + print('expected >= {} %, actual {} %, failed'.format( + round(expected * 100, 1), round(actual * 100, 1))) exit(1) - print 'expected >= {} %, actual {} %, passed'.format( - round(expected * 100, 1), round(actual * 100, 1)) + print('expected >= {} %, actual {} %, passed'.format( + round(expected * 100, 1), round(actual * 100, 1))) diff --git a/tools/coverage/paddle_coverage.sh b/tools/coverage/paddle_coverage.sh index d54434b738db5b5e6192f7d3bf9e48edbebc5b7c..008b35d01ca565a6d32265f595dd2d6aa55be707 100644 --- a/tools/coverage/paddle_coverage.sh +++ b/tools/coverage/paddle_coverage.sh @@ -5,7 +5,7 @@ set -xe PADDLE_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/../../" && pwd )" # install lcov -curl -o /lcov-1.14.tar.gz -x "" -s https://paddle-ci.gz.bcebos.com/coverage/lcov-1.14.tar.gz +curl -o /lcov-1.14.tar.gz -x "" -s https://paddle-ci.gz.bcebos.com/coverage/lcov-1.14.tar.gz || exit 101 tar -xf /lcov-1.14.tar.gz -C / cd /lcov-1.14 make 
install @@ -14,7 +14,7 @@ make install cd /paddle/build -python ${PADDLE_ROOT}/tools/coverage/gcda_clean.py ${GIT_PR_ID} +python3 ${PADDLE_ROOT}/tools/coverage/gcda_clean.py ${GIT_PR_ID} lcov --capture -d ./ -o coverage.info --rc lcov_branch_coverage=0 @@ -53,9 +53,9 @@ gen_full_html_report || true function gen_diff_html_report() { if [ "${GIT_PR_ID}" != "" ]; then - COVERAGE_DIFF_PATTERN="`python ${PADDLE_ROOT}/tools/coverage/pull_request.py files ${GIT_PR_ID}`" + COVERAGE_DIFF_PATTERN="`python3 ${PADDLE_ROOT}/tools/coverage/pull_request.py files ${GIT_PR_ID}`" - python ${PADDLE_ROOT}/tools/coverage/pull_request.py diff ${GIT_PR_ID} > git-diff.out + python3 ${PADDLE_ROOT}/tools/coverage/pull_request.py diff ${GIT_PR_ID} > git-diff.out fi lcov --extract coverage-full.info \ @@ -63,7 +63,7 @@ function gen_diff_html_report() { -o coverage-diff.info \ --rc lcov_branch_coverage=0 - python ${PADDLE_ROOT}/tools/coverage/coverage_diff.py coverage-diff.info git-diff.out > coverage-diff.tmp + python3 ${PADDLE_ROOT}/tools/coverage/coverage_diff.py coverage-diff.info git-diff.out > coverage-diff.tmp mv -f coverage-diff.tmp coverage-diff.info @@ -82,7 +82,7 @@ set -x coverage xml -i -o python-coverage.xml -python ${PADDLE_ROOT}/tools/coverage/python_coverage.py > python-coverage.info +python3 ${PADDLE_ROOT}/tools/coverage/python_coverage.py > python-coverage.info # python full html report # @@ -143,5 +143,6 @@ echo "Assert Python Diff Coverage" python ${PADDLE_ROOT}/tools/coverage/coverage_lines.py python-coverage-diff.info 0.9 || PYTHON_COVERAGE_LINES_ASSERT=1 if [ "$COVERAGE_LINES_ASSERT" = "1" ] || [ "$PYTHON_COVERAGE_LINES_ASSERT" = "1" ]; then + echo "exit 9" > /tmp/paddle_coverage.result exit 9 fi diff --git a/tools/coverage/pull_request.py b/tools/coverage/pull_request.py index 979f476d2a1f3fbf28d43e2b717deb7b2d1b0fff..105460032f7db538eaf7a193776bf8085e2837a1 100644 --- a/tools/coverage/pull_request.py +++ b/tools/coverage/pull_request.py @@ -40,7 +40,7 @@ def 
get_files(args): pull = get_pull(args.pull_id) for file in pull.get_files(): - print '/paddle/{}'.format(file.filename) + print('/paddle/{}'.format(file.filename)) def diff(args): @@ -55,8 +55,8 @@ def diff(args): pull = get_pull(args.pull_id) for file in pull.get_files(): - print '+++ {}'.format(file.filename) - print file.patch + print('+++ {}'.format(file.filename)) + print(file.patch) if __name__ == '__main__': diff --git a/tools/coverage/python_coverage.py b/tools/coverage/python_coverage.py index ba67e12249bb0ccee608ea120321eaa3a2ccbc91..8ad9d85c1bf6b5ed542fb8469173c4f1815050a4 100644 --- a/tools/coverage/python_coverage.py +++ b/tools/coverage/python_coverage.py @@ -12,10 +12,7 @@ root = tree.getroot() sources = root.findall('sources/source') -if len(sources) > 1: - exit(1) - -source = sources[0].text +source = sources[-1].text for clazz in root.findall('packages/package/classes/class'): clazz_filename = clazz.attrib.get('filename') @@ -28,8 +25,8 @@ for clazz in root.findall('packages/package/classes/class'): if not path.exists(clazz_filename): continue - print 'TN:' - print 'SF:{}'.format(clazz_filename) + print('TN:') + print('SF:{}'.format(clazz_filename)) branch_index = 0 @@ -50,16 +47,16 @@ for clazz in root.findall('packages/package/classes/class'): taken = int(taken) for _ in range(taken): - print 'BRDA:{},{},{},{}'.format(line_number, 0, branch_index, - line_hits) + print('BRDA:{},{},{},{}'.format(line_number, 0, branch_index, + line_hits)) branch_index += 1 if line_missing_branches: for missing_branch in line_missing_branches.split(','): - print 'BRDA:{},{},{},{}'.format(line_number, 0, - branch_index, 0) + print('BRDA:{},{},{},{}'.format(line_number, 0, + branch_index, 0)) branch_index += 1 - print 'DA:{},{}'.format(line_number, line_hits) + print('DA:{},{}'.format(line_number, line_hits)) - print 'end_of_record' + print('end_of_record') diff --git a/tools/enforce/count_all_enforce.sh b/tools/enforce/count_all_enforce.sh index 
c1b7508de0361b7a9036557f88fd0b10f326dcc6..683b73614d29bb42871c63dc94d365626d0375ad 100644 --- a/tools/enforce/count_all_enforce.sh +++ b/tools/enforce/count_all_enforce.sh @@ -39,7 +39,7 @@ # Valid PADDLE_ENFORCE{_**} & PADDLE_THROW Count: 1706 # Invalid PADDLE_ENFORCE{_**} & PADDLE_THROW Count: 4572 -ROOT_DIR=../paddle/fluid +ROOT_DIR=../../paddle/fluid ALL_PADDLE_CHECK_CNT=0 VALID_PADDLE_CHECK_CNT=0 diff --git a/tools/enforce/count_enforce_by_dir.sh b/tools/enforce/count_enforce_by_dir.sh index 03233d417ac88eef775e1ca6a77d0600a4faa361..3cb13edf7cc27f6a0de45080a7b90e4b4e24b6b5 100644 --- a/tools/enforce/count_enforce_by_dir.sh +++ b/tools/enforce/count_enforce_by_dir.sh @@ -59,7 +59,7 @@ . ./count_all_enforce.sh --source-only -ROOT_DIR=../paddle/fluid +ROOT_DIR=../../paddle/fluid function count_dir_independently(){ local sub_dir_total_check_cnt=0 diff --git a/tools/enforce/count_enforce_by_file.sh b/tools/enforce/count_enforce_by_file.sh index 1858bd0fd17aac7273318ddbb37fc0d9c512f48d..c1e2903c092ce4124c55566679e081dbe3a03445 100644 --- a/tools/enforce/count_enforce_by_file.sh +++ b/tools/enforce/count_enforce_by_file.sh @@ -57,7 +57,14 @@ FILE_WHITE_LIST="\ random_crop_op.h \ elementwise_op_function.cu.h \ fused_elemwise_activation_op.cc \ - auc_op.cu" + auc_op.cu \ + unsqueeze_op.h \ + unsqueeze_op.cc \ + enforce.h \ + errors_test.cc \ + cross_entropy.cu \ + cross_entropy.h \ + unpooling.cu" function count_file_recursively(){ dir_name=$1 diff --git a/tools/get_cpu_info.sh b/tools/get_cpu_info.sh new file mode 100755 index 0000000000000000000000000000000000000000..a1881f551da1ca022c186c50c667e51dff89f9be --- /dev/null +++ b/tools/get_cpu_info.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +if [ "`uname -s`" != "Linux" ]; then + echo "Current scenario only support in Linux yet!" 
+ exit 0 +fi + +echo "********** Hardware Information **********" +sockets=`grep 'physical id' /proc/cpuinfo | sort -u | wc -l` +cores_per_socket=`grep 'core id' /proc/cpuinfo | sort -u | wc -l` +ht=`lscpu |grep "per core" |awk -F':' '{print $2}'|xargs` +physical_cores=$((sockets * cores_per_socket)) +virtual_cores=`grep 'processor' /proc/cpuinfo | sort -u | wc -l` +numa_nodes=`lscpu |grep "NUMA node(s)"|awk -F':' '{print $2}'|xargs` +echo "CPU Name : `cat /proc/cpuinfo |grep -i "model name" |uniq |awk -F ':' '{print $2}'|xargs`" +echo "CPU Family : `lscpu |grep \"CPU family\" |awk -F':' '{print $2}'|xargs`" +echo "Socket Number : $sockets" +echo "Cores Per Socket : $cores_per_socket" +echo "Total Physical Cores : $physical_cores" +echo "Total Virtual Cores : $virtual_cores" +if [ $ht -eq 1 ]; then + echo "Hyper Threading : OFF" + if [ $physical_cores -ne $virtual_cores ]; then + echo "Error: HT logical error" + fi +else + echo "Hyper Threading : ON" + if [ $physical_cores -ge $virtual_cores ]; then + echo "Error: HT logical error" + fi +fi +echo "NUMA Nodes : $numa_nodes" +if [ $numa_nodes -lt $sockets ]; then + echo "Warning: NUMA node is not enough for the best performance,\ + at least $sockets" +fi + +echo "********** Software Information **********" +echo "OS Version : `cat /proc/version`" +echo "Kernel Release Version : `uname -r`" +echo "Kernel Patch Version : `uname -v`" +echo "GCC Version :`gcc --version | head -n 1|awk -F '\\\(GCC\\\)' '{print $2}'`" +if command -v cmake >/dev/null 2>&1; then + cmake_ver=`cmake --version | head -n 1 | awk -F 'version' '{print $2}'` +else + cmake_ver=" Not installed" +fi +echo "CMake Version :$cmake_ver" +echo "******************************************" diff --git a/tools/get_pr_ut.py b/tools/get_pr_ut.py new file mode 100644 index 0000000000000000000000000000000000000000..970f89551c579b1554db8f878c63dace0d715930 --- /dev/null +++ b/tools/get_pr_ut.py @@ -0,0 +1,74 @@ +# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" For the PR that only modified the unit test, get cases in pull request. """ + +import os +import json +from github import Github + +PADDLE_ROOT = os.getenv('PADDLE_ROOT', '/paddle/') + + +class PRChecker(object): + """ PR Checker. """ + + def __init__(self): + self.github = Github(os.getenv('GITHUB_API_TOKEN'), timeout=60) + self.repo = self.github.get_repo('PaddlePaddle/Paddle') + self.pr = None + + def init(self): + """ Get pull request. """ + pr_id = os.getenv('GIT_PR_ID') + if not pr_id: + print('No PR ID') + exit(0) + self.pr = self.repo.get_pull(int(pr_id)) + + def get_pr_files(self): + """ Get files in pull request. """ + page = 0 + file_list = [] + while True: + files = self.pr.get_files().get_page(page) + if not files: + break + for f in files: + file_list.append(PADDLE_ROOT + f.filename) + page += 1 + return file_list + + def get_pr_ut(self): + """ Get unit tests in pull request. 
""" + ut_list = [] + file_ut_map = None + cmd = 'wget -q --no-check-certificate https://sys-p0.bj.bcebos.com/prec/file_ut.json' + os.system(cmd) + with open('file_ut.json') as jsonfile: + file_ut_map = json.load(jsonfile) + for f in self.get_pr_files(): + if f not in file_ut_map: + return '' + if f.endswith('.h') or f.endswith('.cu'): + return '' + else: + ut_list.extend(file_ut_map.get(f)) + ut_list = list(set(ut_list)) + return ' '.join(ut_list) + + +if __name__ == '__main__': + pr_checker = PRChecker() + pr_checker.init() + print(pr_checker.get_pr_ut()) diff --git a/tools/test_runner.py b/tools/test_runner.py index 9b9f165e7368364bbb0a78d6dcbbe4be0d6bf98b..bad98f9b5c3e80c80277528cf03519bc9ffac375 100644 --- a/tools/test_runner.py +++ b/tools/test_runner.py @@ -17,12 +17,14 @@ from __future__ import print_function import unittest import os import sys +import paddle import paddle.fluid as fluid import importlib from six.moves import cStringIO def main(): + paddle.enable_static() sys.path.append(os.getcwd()) some_test_failed = False for module_name in sys.argv[1:]: @@ -44,6 +46,7 @@ def main(): 'failed\n', buffer.getvalue(), file=sys.stderr) + paddle.disable_static() if some_test_failed: exit(1) diff --git a/tools/windows/build_compile_environment.bat b/tools/windows/build_compile_environment.bat new file mode 100644 index 0000000000000000000000000000000000000000..16665ac4aafddca323c2f453f5fcdd78aa0949ed --- /dev/null +++ b/tools/windows/build_compile_environment.bat @@ -0,0 +1,190 @@ +:: Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +:: +:: Licensed under the Apache License, Version 2.0 (the "License"); +:: you may not use this file except in compliance with the License. 
+:: You may obtain a copy of the License at +:: +:: http://www.apache.org/licenses/LICENSE-2.0 +:: +:: Unless required by applicable law or agreed to in writing, software +:: distributed under the License is distributed on an "AS IS" BASIS, +:: WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +:: See the License for the specific language governing permissions and +:: limitations under the License. +:: +:: =============================== +:: Build Paddle compile enviroment +:: =============================== +:: Description: +:: +:: Install compile enviroment for xly CI. +:: +:: Include: +:: 1. CMake 3.17.0 +:: 2. Git 2.28.0 +:: 3. Python 3.7.8 +:: 4. Visual Studio 2015 with update 3 +:: 5. CUDA 10 [miss cudnn] +:: 6. java jre [not complete] +:: 7. xly agent [not complete] + +:: Echo command is not required. +@echo off + +:: ===== start step 0: wget tool ===== +:: Download wget for windows when there is not wget tool. +echo ">>>>>>>> step [0/7]: wget tool" +wget --help > nul 2> nul || call:install_wget +goto cmake + +:install_wget +echo There is not wget in this PC, will download wget 1.20. +echo Download package from https://eternallybored.org/misc/wget/1.20/64/wget.exe ... +certutil -urlcache -split -f https://eternallybored.org/misc/wget/1.20/64/wget.exe > nul 2> nul +if %errorlevel% == 0 ( + echo Download wget tool into %cd% success. +) else ( + echo Error***** Download wget tool failed, please download it before rerun. + exit /b 1 +) +goto :eof +:: ===== end step 0: wget tool ===== + +:: ===== start step 1: cmake ===== +:: Download CMake-3.17.0 and add in PATH when it not installed. +:: TODO: limit version >= 3.17.0 +:cmake +echo ">>>>>>>> step [1/7]: CMake 3.17.0" +cmake --help > nul 2> nul || call :install_cmake +goto git + +:install_cmake +echo There is not cmake in this PC, will install cmake-3.17.0. +echo Download package from https://cmake.org/files/v3.17/cmake-3.17.0-win64-x64.msi ... 
+wget -O cmake-3.17.0-win64-x64.msi https://cmake.org/files/v3.17/cmake-3.17.0-win64-x64.msi +echo Install cmake-3.17.0 ... +:: /passive [silent installation] +:: /norestart [do not restart] +:: ADD_CMAKE_TO_PATH = System [add CMake to the system PATH for all users] +start /wait cmake-3.17.0-win64-x64.msi /passive /norestart ADD_CMAKE_TO_PATH=System +if %errorlevel% == 0 ( + echo Install CMake-3.17.0 success! +) else ( + echo Error***** Install Cmake-3.17.0 failed, please re-install it manually. +) +del cmake-3.17.0-win64-x64.msi +goto :eof +:: ===== end step 1: cmake ===== + +:: ===== start step 2: Git ===== +:: Download Git-2.28.0 and add in PATH when it not installed. +:: TODO: limit version >= 2.28.0 +:git +echo ">>>>>>>> step [2/8]: Git 2.28.0" +git --help > nul 2> nul || call :install_git +goto python + +:install_git +echo There is not git in this PC, will install Git-2.28.0. +echo Download package from https://github.com/git-for-windows/git/releases/download/v2.28.0.windows.1/Git-2.28.0-64-bit.exe ... +wget -O Git-2.28.0-64-bit.exe https://github.com/git-for-windows/git/releases/download/v2.28.0.windows.1/Git-2.28.0-64-bit.exe +echo Install Git-2.28.0 ... +:: /SILENT [silent install] +:: /ALLUSERS [add path for all users] +:: /NORESTART [do not restart] +start /wait Git-2.28.0-64-bit.exe /SILENT /ALLUSERS /NORESTART +if %errorlevel% == 0 ( + echo Install Git-2.28.0 success! +) else ( + echo Error***** Install Git-2.28.0 failed, please re-install it manually. +) +del Git-2.28.0-64-bit.exe +goto :eof +:: ===== end step 2: Git ===== + +:: ===== start step 3: Python ===== +:: Download Python-3.7.8 and add in PATH when it not installed. +:: TODO: limit version >= 3.7.8 +:python +echo ">>>>>>>> step [3/7]: Python 3.7.8" +python -V 2>&1 | findstr /C:"Python 3.7.8" > nul 2> nul || call :install_python +goto vs2015 + +:install_python +echo There is not Python in this PC, will install Python-3.7.8. 
+echo Download package from https://npm.taobao.org/mirrors/python/3.7.8/python-3.7.8-amd64.exe ... +wget -O python-3.7.8-amd64.exe https://npm.taobao.org/mirrors/python/3.7.8/python-3.7.8-amd64.exe +echo Install Python-3.7.8 ... +:: /passive [silent install] +:: InstallAllUsers [add path for all users] +:: PrependPath [add script/install into PATH] +:: TargetDir [install directory] +start /wait python-3.7.8-amd64.exe /passive InstallAllUsers=1 PrependPath=1 TargetDir=C:\Python37 +if %errorlevel% == 0 ( + echo Install python-3.7.8 success! +) else ( + echo Error***** Install python-3.7.8 failed, please re-install it manually. +) +del python-3.7.8-amd64.exe +goto :eof +:: ===== end step 3: Python ===== + +:: ===== start step 4: Visual Studio 2015 ===== +:: Download Visual Studio 2015 when it not installed. +:vs2015 +echo ">>>>>>>> step [4/7]: Visual Studio 2015" +cmd /C "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" amd64 > nul 2> nul || call :install_visual_studio +goto :cuda10 + +:install_visual_studio +echo There is not Visual Studio in this PC, will install VS2015. +echo Download package from "https://download.my.visualstudio.com/pr/en_visual_studio_professional_2015_with_update_3_x86_x64_web_installer_8922978.exe" +wget -O vs_installer.exe "https://download.my.visualstudio.com/pr/en_visual_studio_professional_2015_with_update_3_x86_x64_web_installer_8922978.exe?t=9ee7a96d-ca80-4b84-af2c-7dd86996a0aa&e=1600103404&h=3cdea1e81c04aa4e846f5314972c46eb&su=1" +echo Install Visual Studio 2015 ... +:: /passive [silent install] +:: /norestart [no restart] +:: /NoRefresh [no refresh] +:: /InstallSelectableItems NativeLanguageSupport_Group [select Visual C++ for installing] +start /wait visual_installer.exe /passive /norestart /NoRefresh /InstallSelectableItems NativeLanguageSupport_Group +if %errorlevel% == 0 ( + echo Install Visual Studio 2015 success! +) else ( + echo Error***** Install Visual Studio 2015 failed, please re-install it manually. 
+) +del vs_installer.exe +goto :eof +:: ===== end step 4: Visual Studio 2015 ===== + +:: ===== start step 5: CUDA 10 ===== +:cuda10 +echo ">>>>>>>> step [5/7]: CUDA 10.0" +nvcc --version > nul 2> nul || call :install_cuda +goto java-jre + +:install_cuda +echo There is not CUDA in this PC, will install CUDA-10.0. +echo Download package from "https://developer.download.nvidia.cn/compute/cuda/10.0/secure/Prod/network_installers/cuda_10.0.130_win10_network.exe" +wget -O cuda_installer.exe "https://developer.download.nvidia.cn/compute/cuda/10.0/secure/Prod/network_installers/cuda_10.0.130_win10_network.exe?hG7oBtA2CnxZG7d39onmBdtzrIa2cOukrmW8I0qk3h36vb2Sj0yYGjMElJlxlNhjx8Xu5RlbmdBhCWvP2QcEqMjCoKCXe5lOgr5uIIso_7LqrotgQHbZRZSVBYRT4bIAHPVSPrr4_4KczKvI9Nf3mbO9RJ2Vj6ECD5QphRMJBus0KKNVxO1gsplVL5qaCnE" +echo Install CUDA-10.0 ... +:: -s [silent install] +start /wait cuda_installer.exe -s +if %errorlevel% == 0 ( + echo Install CUDA-10.0 success! +) else ( + echo Error***** Install CUDA-10.0 failed, please re-install it manually. +) +del cuda_installer.exe +goto :eof +:: ===== end step 5: CUDA 10 ===== + +:: ===== start step 6: java jre ===== +:java-jre +echo ">>>>>>>> step [6/7]: java jre" +goto xly-agent +:: ===== end step 6: java jre ===== + +:: ===== start step 7: xly agent ===== +:xly-agent +echo ">>>>>>>> step [7/7]: xly agent" +goto :eof +:: ===== end step 8: xly agent ===== \ No newline at end of file diff --git a/tools/wlist.json b/tools/wlist.json index 20f6a9cbaedb391995b3757612ec24f2061a8a81..5591f90da4ba807871663e56fe4e3b11bf2fbd8f 100644 --- a/tools/wlist.json +++ b/tools/wlist.json @@ -105,8 +105,6 @@ "convert_dist_to_sparse_program", "load_persistables_for_increment", "load_persistables_for_inference", - "cache", - "buffered", "xmap_readers", "Metric.reset", "Metric.update",